[python] Scrape website content and convert it to Markdown

import os
import re
import html2text as ht
import requests
from bs4 import BeautifulSoup

def addTitle(root_dir, ext=None):
    """
    :param root_dir: 文件夹
    :param ext: 筛选后缀
    :return: 无
    filter: 过滤.开头文件  过滤readme 筛选md文件
    """
    i = 0
    regex = "[0-9]{1,3}"
    for parent, _, fileNames in os.walk(root_dir):
        # sort file names by the first run of digits they contain (non-numbered files sort first)
        fileNames.sort(key=lambda x: int(m.group(0)) if (m := re.search(regex, x)) else 0)

        for fullName in fileNames:
            # filters
            if fullName.startswith('.'):  # skip hidden files
                continue
            if fullName.lower().startswith('readme'):
                continue

            if ext:  # filter by extension, e.g. "md"
                # accept a single extension string or a list of extensions
                exts = (ext,) if isinstance(ext, str) else tuple(ext)
                if fullName.endswith(exts):
                    i += 1
                    name = fullName.rsplit(".",1)[0]
                    content = f"""---
title: {name}
order: {i}
category:
  - AE表达式
---
"""
                    filePath = os.path.join(parent, fullName)
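                    # prepend the front matter to the file's existing content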
                    with open(filePath,"r+", encoding='utf-8') as f:
                        ff = f.read()
                        f.seek(0)
                        f.truncate()
                        f.write(content + ff)

            else:
                ...
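
# Usage sketch for addTitle (the call below is only an example; the path is the
# one used in __main__): every matching .md file gets the front matter above
# prepended, with `order` following the running counter.
#
#   addTitle(r"E:\Project\docs_yuelili_com\docs\zh\exp", "md")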

def getCat(URL):
    """Collect category names and article links from the docs index page."""
    names = []
    links = []

    htmlfile = requests.get(URL)
    htmlfile.encoding = 'utf-8'
    soup = BeautifulSoup(htmlfile.text, 'html.parser')
    cats = soup.findAll(class_='docs-single-cat-wrap')
    for cat in cats:
        names.append(cat.text)
        links.extend(a.get("href") for a in cat.findAll("a"))
        # os.mkdir(cat.text)
    return names, links
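
# Sketch of how getCat's result could be used, mirroring the commented-out
# os.mkdir above (one folder per category; untested against the live page):
#
#   names, links = getCat("https://www.yuelili.com/docs/ae-effect/arithmetic/")
#   for name in names:
#       os.makedirs(name.strip(), exist_ok=True)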

def getContent(url):
    """Fetch url and convert its betterdocs-content block to Markdown text."""
    text_maker = ht.HTML2Text()
    text_maker.bypass_tables = False
    htmlfile = requests.get(url)
    htmlfile.encoding = 'utf-8'
    soup = BeautifulSoup(htmlfile.text, 'html.parser')
    return text_maker.handle(str(soup.find(class_='betterdocs-content')))

if __name__ == '__main__':
    path = "E:\Project\docs_yuelili_com\docs\zh\exp"
    # addTitle(path,"md")


URL = "https://www.yuelili.com/docs/ae-effect/arithmetic/"

getCat(URL)
# text_maker = ht.HTML2Text()
# text_maker.bypass_tables = False
# #
# htmlfile = requests.get(URL)
# htmlfile.encoding = 'utf-8'
# soup = BeautifulSoup(htmlfile.text,'html.parser')
# content = str(soup.find(class_='betterdocs-content'))
#
# text = text_maker.handle(content)
#
#
# with open("12121.md","w+",encoding="utf-8") as f:
#     f.write(text)
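
# End-to-end sketch tying the helpers together (file names and the overall flow
# here are assumptions, not something the original script runs as-is):
#
#   names, links = getCat(URL)
#   for n, link in enumerate(links, start=1):
#       with open(f"{n}.md", "w", encoding="utf-8") as f:
#           f.write(getContent(link))
#   addTitle(".", "md")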
