[python] Scrape website content and convert it to Markdown

import os
import re
import html2text as ht
import requests
from bs4 import BeautifulSoup

def addTitle(root_dir, ext=None):
    """
    :param root_dir: 文件夹
    :param ext: 筛选后缀
    :return: 无
    filter: 过滤.开头文件  过滤readme 筛选md文件
    """
    i = 0
    regex = "[0-9]{1,3}"
    for parent, _, fileNames in os.walk(root_dir):
        # sort file names by the first run of digits they contain (non-numbered files sort first)
        fileNames.sort(key=lambda x: int(m.group(0)) if (m := re.search(regex, x)) else 0)

        for fullName in fileNames:
            # filters
            if fullName.startswith('.'):  # skip hidden files
                continue
            if fullName.lower().startswith('readme'):
                continue

            if ext:  # filter by extension, e.g. "md"
                # accept a single extension string or a list of extensions
                exts = (ext,) if isinstance(ext, str) else tuple(ext)
                if fullName.endswith(exts):
                    i += 1
                    name = fullName.rsplit(".",1)[0]
                    content = f"""---
title: {name}
order: {i}
category:
  - AE表达式
---
"""
                    filePath = os.path.join(parent, fullName)
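                    # prepend the front matter to the file's existing content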
                    with open(filePath,"r+", encoding='utf-8') as f:
                        ff = f.read()
                        f.seek(0)
                        f.truncate()
                        f.write(content + ff)

            else:
                ...
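
# Usage sketch for addTitle (the call below is only an example; the path is the
# one used in __main__): every matching .md file gets the front matter above
# prepended, with `order` following the running counter.
#
#   addTitle(r"E:\Project\docs_yuelili_com\docs\zh\exp", "md")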

def getCat(URL):
    """Collect category names and article links from the docs index page."""
    names = []
    links = []

    htmlfile = requests.get(URL)
    htmlfile.encoding = 'utf-8'
    soup = BeautifulSoup(htmlfile.text, 'html.parser')
    cats = soup.findAll(class_='docs-single-cat-wrap')
    for cat in cats:
        names.append(cat.text)
        links.extend(a.get("href") for a in cat.findAll("a"))
        # os.mkdir(cat.text)
    return names, links
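
# Sketch of how getCat's result could be used, mirroring the commented-out
# os.mkdir above (one folder per category; untested against the live page):
#
#   names, links = getCat("https://www.yuelili.com/docs/ae-effect/arithmetic/")
#   for name in names:
#       os.makedirs(name.strip(), exist_ok=True)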

def getContent(url):
    """Fetch url and convert its betterdocs-content block to Markdown text."""
    text_maker = ht.HTML2Text()
    text_maker.bypass_tables = False
    htmlfile = requests.get(url)
    htmlfile.encoding = 'utf-8'
    soup = BeautifulSoup(htmlfile.text, 'html.parser')
    return text_maker.handle(str(soup.find(class_='betterdocs-content')))

if __name__ == '__main__':
    path = "E:\Project\docs_yuelili_com\docs\zh\exp"
    # addTitle(path,"md")


URL = "https://www.yuelili.com/docs/ae-effect/arithmetic/"

getCat(URL)
# text_maker = ht.HTML2Text()
# text_maker.bypass_tables = False
# #
# htmlfile = requests.get(URL)
# htmlfile.encoding = 'utf-8'
# soup = BeautifulSoup(htmlfile.text,'html.parser')
# content = str(soup.find(class_='betterdocs-content'))
#
# text = text_maker.handle(content)
#
#
# with open("12121.md","w+",encoding="utf-8") as f:
#     f.write(text)
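
# End-to-end sketch tying the helpers together (file names and the overall flow
# here are assumptions, not something the original script runs as-is):
#
#   names, links = getCat(URL)
#   for n, link in enumerate(links, start=1):
#       with open(f"{n}.md", "w", encoding="utf-8") as f:
#           f.write(getContent(link))
#   addTitle(".", "md")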
