import re
import time

import requests
from lxml import etree

def get_page_num(url):
    """Return the total number of listing pages advertised at *url*.

    The page is fetched over HTTP and its text is searched for the
    pagination marker ``共 N 页`` ("N pages in total").

    Args:
        url: Address of the first listing page.

    Returns:
        The page count ``N`` as an ``int``.

    Raises:
        IndexError: If the pagination marker is absent from the response.
    """
    # A timeout keeps a stalled server from hanging the script forever.
    res = requests.get(url, timeout=10)
    # Capture digits only: a greedy ``.+`` can run on to the last "页" of a
    # single-line HTML document and hand non-numeric text to ``int()``.
    match = re.search(r'共\s(\d+)\s页', res.text)
    if match is None:
        raise IndexError(
            'no "共 N 页" pagination marker found at {}'.format(url)
        )
    return int(match.group(1))
# XPaths of the article titles and publication dates on a listing page.
_TITLE_XPATH = '/html/body/section/div[2]/div/article[*]/h2/a/text()'
_DATE_XPATH = '/html/body/section/div[2]/div/article[*]/footer/time/text()'


def _scrape_page(page_url):
    """Fetch one listing page and return its ``(titles, dates)`` lists."""
    r = requests.get(page_url, timeout=10)
    html = etree.HTML(r.content)
    return html.xpath(_TITLE_XPATH), html.xpath(_DATE_XPATH)


def get_data(pages_num, url):
    """Collect article titles and dates from every listing page.

    Pages ``2..pages_num`` are requested at ``url + 'page/<n>/'``, from the
    last page down to page 2; the first page is requested at ``url`` itself
    and therefore comes last in the result.

    Args:
        pages_num: Total number of listing pages.
        url: Address of the first listing page; expected to end with ``/``
            because the page suffix is appended directly.

    Returns:
        A ``(titles, dates)`` tuple of two lists of strings, in the order
        the pages were visited.
    """
    titles, dates = [], []
    page_urls = [url + 'page/' + str(i) + '/' for i in range(pages_num, 1, -1)]
    page_urls.append(url)
    for index, page_url in enumerate(page_urls):
        if index:
            # Throttle the crawl: pause between consecutive requests.
            time.sleep(0.5)
        # Every page, including the first, is parsed with the same XPaths so
        # that titles and dates stay aligned.
        page_titles, page_dates = _scrape_page(page_url)
        titles += page_titles
        dates += page_dates
    return titles, dates
# Run the scrape only when executed as a script, never on import.
if __name__ == '__main__':
    # Listing whose article titles and dates are collected.
    url = 'https://www.lookae.com/after-effects/aechajian/'
    data = get_data(get_page_num(url), url)
    print(data)

 

您的电子邮箱地址不会被公开。 必填项已用*标注