import requests
from lxml import etree

def get_list(pages):
    '''
    :param pages: 爬取多少页
    :return:所有页数的链接
    '''
    print('开始获取分页信息')
    url_list = []
    for i in range(pages):
        url_list.append('https://aescripts.com/after-effects/?p='+str(i+1))
    return url_list

def get_info(url_list,img_out_path):
    '''
    :param url_list: 所有分页的url列表
    :return: 单页所有商品的信息
    '''
    page_info = []
    lens = len(url_list)
    for i in range(lens):
        try:
            print('正在获取:' + url_list[i])
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36',
            }
            r = requests.get(url=url_list[i], headers=headers)
            html = etree.HTML(r.text)

            for j in range(12):
                print('提取分页信息...')
                try:
                    url = html.xpath('//*[@id="products-gallery"]/ul/li[' +str(j+1) +']/div/div/a/@href')[0]
                    author =html.xpath('//*[@class="author"]/text()')[j].strip()
                    price = html.xpath('//*[@class="price"]/text()')[j].strip()
                    page_info.append([url,author,price])
                    page_info[i*12 + j].extend(get_product_info(url,img_out_path))
                    # 链接,作者,价格,描述,名称,图片下载,视频介绍,版本范围,版本号
                except :
                    continue
        except :
            print('未抓取成功:' + url_list[i])
            continue

    return page_info

def get_product_info(url_product,img_out_path):
    print('开始获取单个插件信息:' + url_product)
    products_data = []
    if url_product == 'https://aescripts.com/handycam/':
        pass
    else:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36',
        }
        r = requests.get(url=url_product, headers=headers)
        html = etree.HTML(r.text)

        try:
            name = html.xpath('//*[@class="page-title product-name"]/h1/text()')[0]
            desc = html.xpath('//*[@class="short-description"]/div/p/text()')[0].strip()
            desc = "".join(desc.split()) # 去除不可见 \xa0

            img = html.xpath('//*[@id="image"]/@src')[0]
            imgDown(img_out_path,img,name)
            video = 'https://www.youtube.com/watch?v=' + html.xpath('//*[@id="product_tabs_description_contents"]/div//@id')[0]
            allow = html.xpath('//*[@id="product-attribute-specs-table"]/tbody/tr/td/text()')[0]

            vision = html.xpath('//*[@class="downloadplus-product-history-entry"]/h4/text()')[0].strip()
            vision = "".join(vision.split()) # 去除不可见 \xa0
            products_data.extend([name,desc,img,video,allow,vision])
        except Exception as e:
            print(e)
    return products_data


def imgDown(dir,url,name):
    print('下载图片')
    try:
        pic = requests.get(url=url, timeout=10)
    except requests.exceptions.ConnectionError:
        print('图片无法下载')

    outfile = dir + '/' + name + '.jpg'
    fp = open(outfile, 'wb')
    fp.write(pic.content)
    fp.close()

def write_in(dataset,data_path):
    import csv
    with open(data_path, "w+",encoding='utf-8') as fileOutput:
        writer = csv.writer(fileOutput,lineterminator='\n')
        # writer.writerow(header)
        print(dataset)
        writer.writerows(dataset)
        print("保存成功")

pages= 20 # 前多少页
img_path = r'G:\back\pyfile\爬虫与数据处理\img'  # 下载图片文件夹
data_path = r'G:\back\pyfile\爬虫与数据处理\data.csv' # 保存csv的地址

iff = get_info(get_list(pages),img_path)
write_in(iff)

 

您的电子邮箱地址不会被公开。 必填项已用*标注