Multi-threaded image scraping with Python

Submitted by 落花浮王杯 on 2020-03-09 01:44:14

Target site: https://www.2717.com/

The script:

import os
import threading
import time

import requests
from bs4 import BeautifulSoup

headers = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36',

}

def do_request(url, tries=3):
    # Simple retry mechanism: try up to `tries` times before giving up
    i = 0
    while i < tries:
        try:
            res = requests.get(url, headers=headers, timeout=5)
            return res
        except requests.exceptions.RequestException as e:
            i += 1
            print('request error.', e)

    return False

def download_local(url, img_file):
    '''
    Download an image to the local filesystem
    :param url: remote image URL
    :param img_file: local file path
    '''

    res = do_request(url)
    if res is False:
        print('request failed:', url)
        return

    with open(img_file, 'wb') as img:
        img.write(res.content)


def save_img(url, name, page):
    '''
    Save an image
    :param url: remote image URL
    :param name: image name
    :param page: page number
    '''
    print(url, name)
    # Create the directory if it does not exist
    # (makedirs with exist_ok=True also creates ./imgs/ and is safe across threads)
    img_path = f'./imgs/{page}/'
    os.makedirs(img_path, exist_ok=True)

    img_ext = os.path.splitext(url)[1]  # file extension, including the leading dot (e.g. '.jpg')
    img_file = f'{img_path}{name}{img_ext}'
    print(img_file)

    # The file already exists, no need to download it again
    if os.path.isfile(img_file):
        print(f'img already exists: {img_file}')
        return True

    download_local(url, img_file)



def crawl_images(page):
    '''
    Crawl the images on one listing page
    :param page: page number
    '''
    url = f'https://www.2717.com/ent/meinvtupian/list_11_{page}.html'
    print(url)
    response = requests.get(url, headers=headers)
    response.encoding = response.apparent_encoding  # avoid mojibake (requests defaults to iso-8859-1)
    # print(response.text)
    soup = BeautifulSoup(response.text, 'lxml')
    lists = soup.find('div', class_='MeinvTuPianBox').find_all('a', class_='MMPic')
    for item in lists:
        print('###' * 20)
        img_url = item.find('img')['src']
        img_name = item.find('img')['alt']
        # save_img(img_url, img_name, page)
        # Download on a separate thread to speed up crawling
        # (one thread per image, never joined; see the bounded-pool sketch after the script)
        t = threading.Thread(target=save_img, args=(img_url, img_name, page))
        t.start()



def main():

    for page in range(251, 1, -1):
    # for page in [34]:
        print(f'current page: {page}')
        crawl_images(page)
        time.sleep(0.01)



if __name__ == '__main__':
    print('=============== crawl images start ==============')
    start = time.time()
    main()
    print('=============== crawl images end ==============')
    print('Time usage:{0:.3f}'.format(time.time()-start))
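
The do_request helper above retries with a plain counter loop. A minimal alternative sketch, using only requests and its bundled urllib3, pushes retries (with exponential backoff and retry-on-5xx) down into the transport layer via urllib3's Retry and an HTTPAdapter. The session setup, the parameter values and the do_request_with_session name are illustrative, not part of the original script:

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Illustrative retry policy: 3 retries, exponential backoff, retry on common 5xx codes
retry_policy = Retry(total=3, backoff_factor=0.5, status_forcelist=(500, 502, 503, 504))

session = requests.Session()
session.mount('https://', HTTPAdapter(max_retries=retry_policy))
session.headers.update(headers)  # reuse the user-agent dict defined at the top of the script

def do_request_with_session(url, timeout=5):
    # requests raises RequestException only after the retries are exhausted
    try:
        return session.get(url, timeout=timeout)
    except requests.exceptions.RequestException as e:
        print('request error.', e)
        return False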

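Note that crawl_images starts a new Thread per image and never joins it, so the number of concurrent downloads is unbounded and the final "Time usage" line can print before the downloads actually finish. The sketch below bounds concurrency with concurrent.futures.ThreadPoolExecutor; it reuses headers, save_img and the parsing logic from the script above, and the max_workers value and the *_pooled names are illustrative choices, not part of the original:

from concurrent.futures import ThreadPoolExecutor

def crawl_images_pooled(page, executor):
    # Same parsing as crawl_images, but each download is queued on a shared pool
    url = f'https://www.2717.com/ent/meinvtupian/list_11_{page}.html'
    response = requests.get(url, headers=headers)
    response.encoding = response.apparent_encoding
    soup = BeautifulSoup(response.text, 'lxml')
    lists = soup.find('div', class_='MeinvTuPianBox').find_all('a', class_='MMPic')
    for item in lists:
        img = item.find('img')
        executor.submit(save_img, img['src'], img['alt'], page)

def main_pooled():
    # At most 8 images download concurrently (illustrative cap)
    with ThreadPoolExecutor(max_workers=8) as executor:
        for page in range(251, 1, -1):
            print(f'current page: {page}')
            crawl_images_pooled(page, executor)
    # Leaving the with-block returns only after every queued download has finished

With this variant the elapsed time printed at the end covers the full download, because main_pooled does not return until the pool has drained.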
 

Scraping results:

(screenshots omitted; the downloaded images end up under ./imgs/<page>/)
