[Python] 爬取 UE 蓝图文档并转为 Markdown

源码

文档数量较多, 爬取时可能被限流; 遇到这种情况, 配置一个代理后重新爬取即可。

import os
import asyncio
from pathlib import Path

import aiohttp
from bs4 import BeautifulSoup
from pydantic import BaseModel
from markdownify import markdownify as md

# A named link scraped from a <block-dir-item> element (a chapter or a document).
class Item(BaseModel):
    name: str  # filled from the element's "description" attribute by parseNodes
    href: str  # filled from the element's "href" attribute by parseNodes

# One documentation page: its source link, fetched HTML, and output location.
class Document(BaseModel):
    name: str
    url: str
    content: str = ""  # HTML of the page; stays empty until parseDocument fills it
    filepath: Path = Path()  # destination of the Markdown file; set by fetchChapter

# Fetch a page and decode its JSON body
async def fetchJson(session: aiohttp.ClientSession, url: str):
    """GET *url* and return the decoded JSON payload.

    Returns ``None`` when the response status is anything other than 200.
    """
    request_headers = {
        "User-Agent": "Your User Agent",
        "Accept": "application/json",
    }

    # When the site starts rate-limiting, a proxy can be supplied to
    # session.get(), e.g. proxy="http://127.0.0.1:10809".

    async with session.get(url, headers=request_headers) as response:
        if response.status != 200:
            return None
        return await response.json()

# Parse categories and chapters
async def parseNodes(session: aiohttp.ClientSession, url: str):
    """Return one ``Item`` per ``block-dir-item`` element found at *url*.

    The HTML is read from ``blocks[0].content_html`` of the JSON payload.
    An empty list is returned when the fetch yields nothing.
    """
    payload = await fetchJson(session, url)
    if not payload:
        return []

    markup = payload["blocks"][0]["content_html"]
    tree = BeautifulSoup(markup, "html.parser")
    return [
        Item(name=entry["description"], href=entry["href"])
        for entry in tree.find_all("block-dir-item")
    ]

# Parse a single article
async def parseDocument(session: aiohttp.ClientSession, url: str, document: Document):
    """Fetch *url* and store its ``content_html`` on *document*.

    *document* is left untouched when the fetch yields nothing.
    """
    payload = await fetchJson(session, url)
    if payload:
        first_block = payload["blocks"][0]
        document.content = first_block["content_html"]

# Write the file
async def writeMd(docs: Document):
    """Convert ``docs.content`` from HTML to Markdown and save it to ``docs.filepath``."""
    with docs.filepath.open("w+", encoding="utf-8") as handle:
        markdown_text = md(docs.content)
        handle.write(markdown_text)

# Convert a documentation page link into its JSON API link
def get_api_href(href: str):
    """Map a BlueprintAPI documentation URL onto the document.json endpoint.

    Every occurrence of the public documentation prefix is stripped from
    *href*; what remains is appended to the API base as the ``path`` value.
    """
    page_prefix = "https://dev.epicgames.com/documentation/en-us/unreal-engine/BlueprintAPI/"
    api_prefix = "https://dev.epicgames.com/community/api/documentation/document.json?path=en-us/unreal-engine/BlueprintAPI/"
    return api_prefix + href.replace(page_prefix, "")

# Walk one chapter
async def fetchChapter(session: aiohttp.ClientSession, url: str, chapter: str):
    """Download every document of one chapter to ``dist/<chapter>/<name>.md``.

    Args:
        session: Shared HTTP session used for all requests.
        url: API URL of the chapter index page.
        chapter: Chapter name; used as the output sub-directory.

    Documents whose output file already exists and is non-empty are skipped,
    so an interrupted crawl can simply be re-run. Work is processed in
    batches of 20: each batch is fetched concurrently, then written.
    """
    documents_data = await parseNodes(session=session, url=url)
    if not documents_data:
        return

    batch_limit = 20
    task_fetch = []
    task_write_file = []

    for document in documents_data:
        file_path = Path("dist", chapter, document.name + ".md")

        # Resume support. getsize() raises FileNotFoundError for a file that
        # has not been written yet (always the case on a first run), so a
        # missing file is treated as "not saved" instead of crashing.
        try:
            already_saved = os.path.getsize(file_path) > 0
        except FileNotFoundError:
            already_saved = False
        if already_saved:
            continue

        file_path.parent.mkdir(parents=True, exist_ok=True)

        dc = Document(name=document.name, url=document.href, filepath=file_path)
        api_url = get_api_href(document.href)

        task_fetch.append(parseDocument(session, api_url, dc))
        task_write_file.append(writeMd(dc))

        # Cap concurrency: flush once a full batch has been queued.
        if len(task_fetch) >= batch_limit:
            await asyncio.gather(*task_fetch)
            await asyncio.gather(*task_write_file)
            task_fetch = []
            task_write_file = []

    # Flush whatever is left over from the last, partial batch.
    await asyncio.gather(*task_fetch)
    await asyncio.gather(*task_write_file)

async def main():
    """Crawl the BlueprintAPI index and download every chapter concurrently."""
    version = "5.4"
    lang = "en-us"
    url = f"https://dev.epicgames.com/community/api/documentation/document.json?path={lang}/unreal-engine/BlueprintAPI&application_version={version}"

    async with aiohttp.ClientSession() as session:
        chapters = await parseNodes(session, url)

        # An empty index is taken to mean the request was throttled.
        if not chapters:
            print("哥们 慢点爬")
            return

        chapter_jobs = (
            fetchChapter(session, get_api_href(entry.href), entry.name)
            for entry in chapters
        )
        await asyncio.gather(*chapter_jobs)

# Script entry point: start the crawler only when this file is run directly.
if __name__ == "__main__":
    asyncio.run(main())

给TA充电
共{{data.count}}人
人已充电
python编程

[python]如何扒取houdini vex文档并保存为markdown

2024-5-2 18:39:56

归档

【转场周期表】Basic Circle

2022-12-12 1:17:47

0 条回复 A文章作者 M管理员
    暂无讨论,说说你的看法吧
个人中心
今日签到
搜索