抓取猫眼部分信息

陌路散爱 提交于 2020-11-22 06:31:28

import json
import re
from multiprocessing import Pool

import requests

def get_one_page(url):
    """Fetch *url* and return the response body as text.

    Args:
        url: Address of the page to download.

    Returns:
        The decoded response text when the server answers with HTTP 200,
        otherwise None (non-200 status or a failed request).
    """
    headers = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
            '(KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
        ),
    }
    try:
        # Without a timeout requests may wait on a stalled connection forever.
        res = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException:
        # Report network failures the same way as a bad status code.
        return None
    if res.status_code == 200:
        return res.text
    return None

# One match per board entry; the seven groups are, in order:
# rank, poster URL, title, "star" paragraph, release time, and the
# integer and fraction parts of the score.
_BOARD_ITEM_PATTERN = re.compile(
    r'<i class="board-index.*?">(.*?)</i>'
    r'.*?<img data-src="(.*?)"'
    r'.*?</a>.*?<a href.*?>(.*?)</a>'
    r'.*?<p class="star">(.*?)</p>'
    r'.*?releasetime">(.*?)</p>'
    r'.*?integer">(.*?)</i>'
    r'.*?fraction">(.*?)</i>',
    re.S,
)


def parse_html(html):
    """Yield one dict per board entry found in *html*.

    Args:
        html: Page markup as a string. None or an empty string yields
            nothing.

    Yields:
        Dicts with the keys ``index``, ``img``, ``title``, ``direct``,
        ``time`` and ``pinfen``. ``direct`` holds the text of the
        ``<p class="star">`` element and ``pinfen`` is the integer and
        fraction parts of the score joined together. All values are
        stripped of surrounding whitespace.
    """
    if not html:
        return
    # str.replace returns a new string; the result has to be kept.
    html = html.replace('\n', '')
    for rank, img, title, star, released, whole, frac in (
            _BOARD_ITEM_PATTERN.findall(html)):
        yield {
            'index': rank.strip(),
            'img': img.strip(),
            'title': title.strip(),
            'direct': star.strip(),
            'time': released.strip(),
            'pinfen': whole.strip() + frac.strip(),
        }

def write_to_file(content, path='1.txt'):
    """Append *content* to a text file as one JSON document per line.

    Args:
        content: JSON-serialisable object to store.
        path: File to append to; defaults to ``1.txt`` in the current
            working directory.
    """
    # ensure_ascii=False keeps non-ASCII text readable instead of \uXXXX
    # escapes, so the file must be opened as UTF-8.
    line = json.dumps(content, ensure_ascii=False) + '\n'
    # The with-statement closes the file; no explicit close() is needed.
    with open(path, 'a', encoding='utf-8') as f:
        f.write(line)

def main(offset):
    """Download one board page and append every parsed entry to the output file.

    Args:
        offset: Value placed in the ``offset`` query parameter of the
            board URL.
    """
    url = 'http://maoyan.com/board/4?offset=' + str(offset)
    html = get_one_page(url)
    if html is None:
        # get_one_page returns None when the page could not be fetched;
        # there is nothing to parse in that case.
        return
    for item in parse_html(html):
        write_to_file(item)

if __name__ == '__main__':
    # One task per board page: offsets 0, 10, ..., 90.
    # map() must be called on a Pool instance, and the with-block
    # releases the worker processes once all pages are done.
    with Pool() as pool:
        pool.map(main, [i * 10 for i in range(10)])

易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!