Dependencies
Scraping Autohome (汽车之家) relies on two Python libraries:
- requests: sends the HTTP requests, standing in for a browser
- BeautifulSoup4: parses the HTML that comes back
Both libraries have to be installed manually:
```
pip install requests
pip install BeautifulSoup4
```
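If the install succeeded, both packages can be imported; a quick sanity check (the printed version numbers will vary by environment):

```python
import requests
import bs4

# Both imports succeeding means the dependencies are in place.
print(requests.__version__)
print(bs4.__version__)
```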
Simple scrape of the Autohome news front page
```python
import os

import requests
from bs4 import BeautifulSoup

base_dir = os.path.dirname(__file__)


def spider():
    """Basic version: scrape the Autohome news page."""
    response = requests.get(url='https://www.autohome.com.cn/news/')
    # print(response)              # Response object
    # print(response.status_code)  # status code
    # print(response.headers)      # response headers
    # print(response.text)         # Chinese text is garbled; the page declares charset=gb2312
    response.encoding = 'gbk'  # fix the garbled text
    # print(response.text)
    soup = BeautifulSoup(response.text, 'html.parser')
    # Grab the container that holds all the article entries
    result = soup.find(name='div', attrs={'id': 'auto-channel-lazyload-article'})
    # print(result)
    li_list = result.find_all(name='li')
    # print(li_list[0])
    for item in li_list:
        # Title
        title_tag = item.find(name='h3')
        if not title_tag:
            continue
        # print(title_tag, title_tag.text)  # <h3>奥迪R8 V10 Decennium特别版官图发布</h3> 奥迪R8 V10 Decennium特别版官图发布
        title = title_tag.text
        # Summary
        introduction = item.find(name='p').text
        # print(introduction)  # [汽车之家 新车官图] 日前,为纪念奥迪R8 V10车型诞生10周年,奥迪官方发布了R8 V10 Decennium(十年)特别版车型的官图。新车基...
        url = 'https:' + item.find(name='a').get('href')
        # print(url)  # https://www.autohome.com.cn/news/201902/930488.html#pvareaid=102624
        img = 'https:' + item.find(name='img').get('src')
        # print(img)
        # What we scraped is only the image URL; to save the image locally we have to
        # request that URL again and write the response body to a file.
        img_content = requests.get(url=img)
        img_name = img.rsplit('/', 1)[-1]
        os.makedirs(os.path.join(base_dir, 'img'), exist_ok=True)  # make sure the img directory exists
        file_path = os.path.join(base_dir, 'img', img_name)
        with open(file_path, 'wb') as f:
            f.write(img_content.content)


if __name__ == '__main__':
    spider()
```
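The request above goes out with the default requests User-Agent. If the site ever starts rejecting such requests, a browser-style User-Agent header can be supplied explicitly; this is a minimal sketch with a made-up header value, not something the original code needed:

```python
import requests

# Hypothetical browser-like User-Agent; replace with a real browser string if needed.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
response = requests.get('https://www.autohome.com.cn/news/', headers=headers)
response.encoding = 'gbk'
print(response.status_code)
```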
Scraping the first 100 pages of the news section
```python
import os
import time

import requests
from bs4 import BeautifulSoup

base_dir = os.path.dirname(__file__)


def spider(page):
    """Scrape one page of the Autohome news section."""
    response = requests.get(url='https://www.autohome.com.cn/news/%s/#liststart' % page)
    response.encoding = 'gbk'  # the page declares charset=gb2312; fix the garbled Chinese text
    soup = BeautifulSoup(response.text, 'html.parser')
    result = soup.find(name='div', attrs={'id': 'auto-channel-lazyload-article'})
    li_list = result.find_all(name='li')
    for item in li_list:
        # Title
        title_tag = item.find(name='h3')
        if not title_tag:
            continue
        print(title_tag, title_tag.text)
        title = title_tag.text
        # Summary
        introduction = item.find(name='p').text
        url = 'https:' + item.find(name='a').get('href')
        img = 'https:' + item.find(name='img').get('src')
        # Image download (as in the basic version) is left commented out
        # so the timing only measures fetching and parsing the list pages.
        # img_content = requests.get(url=img)
        # img_name = img.rsplit('/', 1)[-1]
        # file_path = os.path.join(base_dir, 'img', img_name)
        # with open(file_path, 'wb') as f:
        #     f.write(img_content.content)


if __name__ == '__main__':
    start_time = time.time()
    for i in range(1, 101):
        spider(i)
    print('Sequential scrape of 100 pages took', time.time() - start_time)  # 99.59376955032349
```
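When 100 pages are fetched back to back, a single connection error or bad status code raises an exception and aborts the whole loop. A small retry helper is one way to keep the run going; this is only a sketch (the fetch_with_retry name and its parameters are illustrative, not part of the original code):

```python
import time

import requests


def fetch_with_retry(url, retries=3, delay=1):
    """Try a GET request a few times before giving up (illustrative helper)."""
    for attempt in range(retries):
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()  # turn 4xx/5xx responses into exceptions
            return response
        except requests.RequestException:
            if attempt == retries - 1:
                raise  # out of retries, let the caller see the error
            time.sleep(delay)
```

spider() could call fetch_with_retry() in place of requests.get() with no other change.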
Scraping the first 100 news pages with multiple threads
```python
import os
import time
from threading import Thread

import requests
from bs4 import BeautifulSoup

base_dir = os.path.dirname(__file__)


def spider(page):
    """Scrape one page of the Autohome news section."""
    response = requests.get(url='https://www.autohome.com.cn/news/%s/#liststart' % page)
    response.encoding = 'gbk'  # the page declares charset=gb2312; fix the garbled Chinese text
    soup = BeautifulSoup(response.text, 'html.parser')
    result = soup.find(name='div', attrs={'id': 'auto-channel-lazyload-article'})
    li_list = result.find_all(name='li')
    for item in li_list:
        # Title
        title_tag = item.find(name='h3')
        if not title_tag:
            continue
        print(title_tag, title_tag.text)
        title = title_tag.text
        # Summary
        introduction = item.find(name='p').text
        url = 'https:' + item.find(name='a').get('href')
        img = 'https:' + item.find(name='img').get('src')
        # Image download left commented out, as in the sequential version.
        # img_content = requests.get(url=img)
        # img_name = img.rsplit('/', 1)[-1]
        # file_path = os.path.join(base_dir, 'img', img_name)
        # with open(file_path, 'wb') as f:
        #     f.write(img_content.content)


if __name__ == '__main__':
    # spider(1)
    start_time = time.time()
    threads = []
    for i in range(1, 101):
        t = Thread(target=spider, args=(i,))
        t.start()
        threads.append(t)
    for t in threads:
        t.join()  # wait for every thread to finish before measuring
    print('Multithreaded scrape of 100 pages took', time.time() - start_time)
    # The originally reported 0.17073273658752441 was measured without join(),
    # so it only reflected the time needed to start the 100 threads.
```
Scraping the first 100 news pages with a thread pool
```python
import os
import time
from concurrent.futures import ThreadPoolExecutor
from multiprocessing import cpu_count

import requests
from bs4 import BeautifulSoup

base_dir = os.path.dirname(__file__)


def spider(page):
    """Scrape one page of the Autohome news section."""
    response = requests.get(url='https://www.autohome.com.cn/news/%s/#liststart' % page)
    response.encoding = 'gbk'  # the page declares charset=gb2312; fix the garbled Chinese text
    soup = BeautifulSoup(response.text, 'html.parser')
    result = soup.find(name='div', attrs={'id': 'auto-channel-lazyload-article'})
    li_list = result.find_all(name='li')
    for item in li_list:
        # Title
        title_tag = item.find(name='h3')
        if not title_tag:
            continue
        print(title_tag, title_tag.text)
        title = title_tag.text
        # Summary
        introduction = item.find(name='p').text
        url = 'https:' + item.find(name='a').get('href')
        img = 'https:' + item.find(name='img').get('src')
        # Image download left commented out, as in the sequential version.
        # img_content = requests.get(url=img)
        # img_name = img.rsplit('/', 1)[-1]
        # file_path = os.path.join(base_dir, 'img', img_name)
        # with open(file_path, 'wb') as f:
        #     f.write(img_content.content)


if __name__ == '__main__':
    start_time = time.time()
    t = ThreadPoolExecutor(cpu_count() * 5)  # 5 worker threads per CPU core
    for i in range(1, 101):
        t.submit(spider, i)
    t.shutdown(wait=True)  # block until every submitted task has finished
    print('Thread pool scrape of 100 pages took', time.time() - start_time)  # 36.4789092540741
```
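Submitting the pages one by one works, but ThreadPoolExecutor.map expresses the same thing more compactly; a sketch assuming the spider(page) function defined above:

```python
from concurrent.futures import ThreadPoolExecutor
from multiprocessing import cpu_count

with ThreadPoolExecutor(cpu_count() * 5) as executor:
    # Consuming the iterator forces every page to be processed before the
    # with-block exits (the exit also calls shutdown(wait=True) implicitly).
    list(executor.map(spider, range(1, 101)))
```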
Scraping the first 100 news pages with a process pool
```python
import os
import time
from concurrent.futures import ProcessPoolExecutor
from multiprocessing import cpu_count

import requests
from bs4 import BeautifulSoup

base_dir = os.path.dirname(__file__)


def spider(page):
    """Scrape one page of the Autohome news section."""
    response = requests.get(url='https://www.autohome.com.cn/news/%s/#liststart' % page)
    response.encoding = 'gbk'  # the page declares charset=gb2312; fix the garbled Chinese text
    soup = BeautifulSoup(response.text, 'html.parser')
    result = soup.find(name='div', attrs={'id': 'auto-channel-lazyload-article'})
    li_list = result.find_all(name='li')
    for item in li_list:
        # Title
        title_tag = item.find(name='h3')
        if not title_tag:
            continue
        print(title_tag, title_tag.text)
        title = title_tag.text
        # Summary
        introduction = item.find(name='p').text
        url = 'https:' + item.find(name='a').get('href')
        img = 'https:' + item.find(name='img').get('src')
        # Image download left commented out, as in the sequential version.
        # img_content = requests.get(url=img)
        # img_name = img.rsplit('/', 1)[-1]
        # file_path = os.path.join(base_dir, 'img', img_name)
        # with open(file_path, 'wb') as f:
        #     f.write(img_content.content)


if __name__ == '__main__':
    start_time = time.time()
    p = ProcessPoolExecutor(cpu_count() * 2)  # 2 worker processes per CPU core
    for i in range(1, 101):
        p.submit(spider, i)
    p.shutdown(wait=True)  # block until every submitted task has finished
    print('Process pool scrape of 100 pages took', time.time() - start_time)  # 32.66965293884277
```
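One thing to be aware of with both pool versions: an exception raised inside spider() disappears silently unless result() is called on the returned future. A sketch that surfaces worker errors, again assuming the spider(page) defined above:

```python
from concurrent.futures import ProcessPoolExecutor, as_completed
from multiprocessing import cpu_count

if __name__ == '__main__':
    with ProcessPoolExecutor(cpu_count() * 2) as executor:
        futures = [executor.submit(spider, i) for i in range(1, 101)]
        for future in as_completed(futures):
            try:
                future.result()  # re-raises any exception from the worker
            except Exception as exc:
                print('page failed:', exc)
```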
With sensibly sized pools, the process pool and the thread pool scrape at roughly the same speed, and the thread pool can even be slightly faster. This workload is network I/O bound: the threads spend most of their time waiting on responses, so the GIL is not a bottleneck and extra processes buy little. The gap between the two timings printed above is negligible, and both numbers depend heavily on network speed.
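To compare the two pools under identical conditions, the same page range can be timed with each executor class; a rough sketch, assuming the spider(page) function from the examples above lives in the same module (the worker counts here are arbitrary examples, and the numbers will still swing with network conditions):

```python
import time
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor


def benchmark(executor_cls, workers, pages):
    """Time one full crawl of `pages` with the given executor class."""
    start = time.time()
    with executor_cls(workers) as executor:
        list(executor.map(spider, pages))
    return time.time() - start


if __name__ == '__main__':
    pages = range(1, 101)
    print('thread pool:', benchmark(ThreadPoolExecutor, 20, pages))
    print('process pool:', benchmark(ProcessPoolExecutor, 8, pages))
```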
Scraping many pages across multiple Autohome channels
```python
import os
import time
from concurrent.futures import ProcessPoolExecutor
from multiprocessing import cpu_count

import requests
from bs4 import BeautifulSoup

base_dir = os.path.dirname(__file__)


def spider(page):
    """Scrape one page of an Autohome channel; page is a (page_number, channel) tuple."""
    response = requests.get(url='https://www.autohome.com.cn/%s/%s/#liststart' % (page[1], page[0]))
    response.encoding = 'gbk'  # the pages declare charset=gb2312; fix the garbled Chinese text
    soup = BeautifulSoup(response.text, 'html.parser')
    result = soup.find(name='div', attrs={'id': 'auto-channel-lazyload-article'})
    if result is None:
        return  # the article container may be missing on pages that do not exist
    li_list = result.find_all(name='li')
    for item in li_list:
        # Title
        title_tag = item.find(name='h3')
        if not title_tag:
            continue
        print(title_tag, title_tag.text)
        title = title_tag.text
        # Summary
        introduction = item.find(name='p').text
        url = 'https:' + item.find(name='a').get('href')
        img = 'https:' + item.find(name='img').get('src')
        # Image download left commented out, as in the earlier versions.
        # img_content = requests.get(url=img)
        # img_name = img.rsplit('/', 1)[-1]
        # file_path = os.path.join(base_dir, 'img', img_name)
        # with open(file_path, 'wb') as f:
        #     f.write(img_content.content)


if __name__ == '__main__':
    start_time = time.time()
    p = ProcessPoolExecutor(cpu_count() * 2)
    for item in ['news', 'advice', 'drive', 'use', 'culture', 'travels', 'tech', 'tuning', 'ev']:
        for i in range(1, 101):
            p.submit(spider, (i, item))
    p.shutdown(wait=True)
    print('Total time for all channels', time.time() - start_time)
    # 418.42672753334045 -- a somewhat underwhelming result
```
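The nested loops above submit 900 tasks (9 channels x 100 pages). The same task list can be built with itertools.product, which keeps the submission code flat; a sketch assuming the tuple-taking spider((page, channel)) defined above:

```python
from itertools import product
from concurrent.futures import ProcessPoolExecutor
from multiprocessing import cpu_count

channels = ['news', 'advice', 'drive', 'use', 'culture', 'travels', 'tech', 'tuning', 'ev']

if __name__ == '__main__':
    with ProcessPoolExecutor(cpu_count() * 2) as executor:
        # product yields every (page, channel) combination exactly once
        for page, channel in product(range(1, 101), channels):
            executor.submit(spider, (page, channel))
```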