import re
import time

import requests
from lxml import etree
def get_page_num(url, timeout=10):
    """Return the total number of listing pages advertised at *url*.

    The page is downloaded and its text is searched for the pagination
    marker ``共 N 页`` ("N pages in total"); N is returned as an integer.

    Args:
        url: Address of the listing page to inspect.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The page count found in the pagination marker.

    Raises:
        ValueError: If the response text contains no pagination marker.
    """
    res = requests.get(url, timeout=timeout)
    # Capture digits only: a greedy ``.+`` could swallow arbitrary text up
    # to a later "页" on the same line and make int() fail.
    match = re.search(r'共\s(\d+)\s页', res.text)
    if match is None:
        raise ValueError('no page count found at {}'.format(url))
    return int(match.group(1))


def get_data(pages_num, url, delay=0.5, timeout=10):
    """Collect article titles and dates from every listing page.

    Pages are visited from the last one (``<url>page/<pages_num>/``) down
    to page 2, and finally the first page, which is served at *url* itself.

    Args:
        pages_num: Total number of listing pages.
        url: Address of the first listing page. ``page/<n>/`` is appended
            to it directly, so it should end with ``/``.
        delay: Seconds to pause after each request to limit the crawl rate.
        timeout: Seconds to wait for each response before giving up.

    Returns:
        A ``(titles, dates)`` tuple of two lists holding the extracted text
        nodes in visiting order.
    """
    # The same two queries are applied to every page, including the first
    # one, so titles and dates always come from the same article list.
    title_xpath = '/html/body/section/div[2]/div/article[*]/h2/a/text()'
    date_xpath = '/html/body/section/div[2]/div/article[*]/footer/time/text()'

    # Pages 2..pages_num live under "page/<n>/"; page 1 is the bare URL.
    page_urls = [url + 'page/' + str(i) + '/' for i in range(pages_num, 1, -1)]
    page_urls.append(url)

    titles, dates = [], []
    for page_url in page_urls:
        r = requests.get(page_url, timeout=timeout)
        html = etree.HTML(r.content)
        titles += html.xpath(title_xpath)
        dates += html.xpath(date_xpath)
        time.sleep(delay)  # throttle the crawl between requests
    return titles, dates


# Run the scrape only when executed as a script, never on import.
if __name__ == '__main__':
    url = 'https://www.lookae.com/after-effects/aechajian/'
    data = get_data(get_page_num(url), url)
    print(data)