[python]爬取UE 蓝图文档,并转为markdown

python
5月2日
编辑

月离离

源码

文章数量较多，爬取时可能会被限流，加个代理再爬即可。

import os
import asyncio
from pathlib import Path

import aiohttp
from bs4 import BeautifulSoup
from pydantic import BaseModel
from markdownify import markdownify as md

class Item(BaseModel):
    """A named link parsed from a ``block-dir-item`` element by parseNodes."""
    name: str  # value of the element's ``description`` attribute
    href: str  # value of the element's ``href`` attribute

class Document(BaseModel):
    """One documentation page that is downloaded and saved as Markdown."""
    name: str  # page name as listed in its chapter
    url: str  # href of the page as listed in its chapter
    content: str = ""  # HTML body filled in by parseDocument; empty until fetched
    filepath: Path = Path()  # destination .md file, assigned by fetchChapter

# Fetch a page as JSON
async def fetchJson(session: aiohttp.ClientSession, url: str):
    """GET ``url`` and return its decoded JSON body, or ``None`` on failure.

    Args:
        session: Shared aiohttp client session used to issue the request.
        url: Address of the JSON endpoint.

    Returns:
        The parsed JSON payload when the server answers with HTTP 200.
        ``None`` for any other status, or when the request itself or the
        JSON decoding fails.
    """
    headers = {
        "User-Agent": "Your User Agent",
        "Accept": "application/json",
    }

    # If the site starts throttling, route the request through a proxy by
    # passing e.g. proxy="http://127.0.0.1:10809" to session.get().
    try:
        async with session.get(url, headers=headers) as response:
            if response.status != 200:
                return None
            return await response.json()
    except (aiohttp.ClientError, asyncio.TimeoutError, ValueError) as exc:
        # A throttled server may drop the connection or answer 200 with a
        # non-JSON body. Report that as "no data", exactly like a non-200
        # status, so one bad request does not propagate out of the
        # asyncio.gather() calls that run many fetches together.
        print(f"fetch failed: {url} ({exc!r})")
        return None

# Parse categories and chapters
async def parseNodes(session: aiohttp.ClientSession, url: str):
    """Return the ``block-dir-item`` entries of the page at ``url``.

    Args:
        session: Shared aiohttp client session.
        url: JSON API address of a directory page.

    Returns:
        A list of ``Item`` objects, one per ``block-dir-item`` element found
        in the first block's ``content_html``; an empty list when fetchJson
        yields nothing.
    """
    payload = await fetchJson(session, url)
    if not payload:
        return []

    markup = payload["blocks"][0]["content_html"]
    tree = BeautifulSoup(markup, "html.parser")
    return [
        Item(name=entry["description"], href=entry["href"])
        for entry in tree.find_all("block-dir-item")
    ]

# Parse an article
async def parseDocument(session: aiohttp.ClientSession, url: str, document: Document):
    """Fetch the page at ``url`` and store its HTML in ``document.content``.

    ``document`` is modified in place; its content is left untouched when
    fetchJson yields nothing.
    """
    payload = await fetchJson(session, url)
    if not payload:
        return
    document.content = payload["blocks"][0]["content_html"]

# Write the file
async def writeMd(docs: Document):
    """Convert ``docs.content`` to Markdown and write it to ``docs.filepath``.

    The target file is opened for writing as UTF-8, replacing any previous
    content.
    """
    with open(docs.filepath, mode="w+", encoding="utf-8") as out:
        markdown_text = md(docs.content)
        out.write(markdown_text)

# Convert a documentation link into its API link
def get_api_href(href: str):
    """Map a BlueprintAPI documentation URL to the matching JSON API URL.

    Every occurrence of the public documentation prefix is removed from
    ``href`` and the remainder is appended to the API prefix. An ``href``
    that does not contain the prefix is appended unchanged.
    """
    page_prefix = "https://dev.epicgames.com/documentation/en-us/unreal-engine/BlueprintAPI/"
    api_prefix = "https://dev.epicgames.com/community/api/documentation/document.json?path=en-us/unreal-engine/BlueprintAPI/"
    relative_path = href.replace(page_prefix, "")
    return f"{api_prefix}{relative_path}"

# Walk through a chapter
async def fetchChapter(session: aiohttp.ClientSession, url: str, chapter: str):
    """Download every page of one chapter into ``dist/<chapter>/<name>.md``.

    Pages whose output file already exists with a non-zero size are skipped,
    so an interrupted crawl can be resumed by running the script again.

    Args:
        session: Shared aiohttp client session.
        url: JSON API address of the chapter's directory page.
        chapter: Chapter name, used as the output sub-directory.
    """
    documents_data = await parseNodes(session=session, url=url)
    if not documents_data:
        return

    async def fetch_and_write(dc: Document):
        # Fetch and write stay paired per document, so no write coroutine is
        # left un-awaited if a fetch fails, and nothing is written when the
        # fetch produced no content (the page is retried on the next run).
        await parseDocument(session, get_api_href(dc.url), dc)
        if dc.content:
            await writeMd(dc)

    pending = []
    for document in documents_data:
        file_path = Path("dist", chapter, document.name + ".md")

        # Resume support: skip pages already saved by an earlier run. The
        # existence check matters because asking for the size of a missing
        # file raises FileNotFoundError, which is every file on a first run.
        if file_path.exists() and file_path.stat().st_size > 0:
            continue

        file_path.parent.mkdir(parents=True, exist_ok=True)
        pending.append(
            Document(name=document.name, url=document.href, filepath=file_path)
        )

    # Limit concurrency: process the pending pages in batches of 20.
    batch_size = 20
    for start in range(0, len(pending), batch_size):
        batch = pending[start:start + batch_size]
        await asyncio.gather(*(fetch_and_write(dc) for dc in batch))

async def main():
    """Crawl the BlueprintAPI index and download every chapter concurrently.

    Prints a notice and stops when the index yields no chapters.
    """
    lang = "en-us"
    version = "5.4"
    url = f"https://dev.epicgames.com/community/api/documentation/document.json?path={lang}/unreal-engine/BlueprintAPI&application_version={version}"

    async with aiohttp.ClientSession() as session:
        chapters = await parseNodes(session, url)

        if chapters:
            # One fetchChapter coroutine per chapter, all awaited together.
            jobs = [
                fetchChapter(session, get_api_href(entry.href), entry.name)
                for entry in chapters
            ]
            await asyncio.gather(*jobs)
        else:
            print("哥们 慢点爬")

# Run the crawler only when this file is executed as a script.
if __name__ == "__main__":
    asyncio.run(main())

{{userData.name}}已认证

[python]爬取UE 蓝图文档,并转为markdown

源码

[python]如何扒取houdini vex文档并保存为markdown

【转场周期表】Basic Circle

【转场周期表】Type Writer

006.3D Compositing（三维合成）

【动效周期表】Sin 三角函数

【脚本案例】基于选择的item创建相同合成，添加效果

【动效周期表】Rhythm 节奏

【Blender】如何使用UV网格或颜色网格

Ps ToolBox

【AE脚本】位置属性绑定空对象

【AE脚本】- NewLayer 干掉一堆一模一样却只用一次的空对象、纯色层

【Shape Connector】AE一键制作变形动画

【限时8折】【Psd Updater】AE一键同步PSD新增图层

【AE脚本】- ShapeSelect 形状图层属性给我展开！

月离文档站

月离网盘

月离导航站

关注我们

法律声明

在线工单

查看帮助

{{userData.name}}已认证

源码

[python]如何扒取houdini vex文档并保存为markdown

【转场周期表】Basic Circle

Ps ToolBox

【AE脚本】位置属性绑定空对象

【AE脚本】- NewLayer 干掉一堆一模一样却只用一次的空对象、纯色层

【Shape Connector】AE一键制作变形动画

【限时8折】【Psd Updater】AE一键同步PSD新增图层

【AE脚本】- ShapeSelect 形状图层属性给我展开！

相似站点

月离文档站

月离网盘

月离导航站

关注我们

法律声明

在线工单

查看帮助

【AE脚本】- ShapeSelect 形状图层属性给我展开！