Target URL: http://maoyan.com/board/4
Pagination rule of the URL: http://maoyan.com/board/4?offset=0
The offset parameter runs from 0 to 90 (in steps of 10, one page per step).
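As a quick sketch of this pagination rule, the ten page URLs can be generated as follows (the variable names here are only illustrative and are not part of the scraper below):

base_url = "http://maoyan.com/board/4?offset={}"
page_urls = [base_url.format(offset) for offset in range(0, 100, 10)]
print(page_urls[0])    # http://maoyan.com/board/4?offset=0
print(page_urls[-1])   # http://maoyan.com/board/4?offset=90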
Libraries used (illustrated briefly after this list):
PyQuery: an HTML parser with jQuery-like selector syntax
fake_useragent: forges the browser User-Agent header to avoid being blocked by anti-crawler checks
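A minimal, self-contained illustration of these two helpers (the HTML snippet is hypothetical, just to show the style of the APIs):

from pyquery import PyQuery as pq
from fake_useragent import UserAgent

# PyQuery: jQuery-style CSS selectors over an HTML fragment.
doc = pq('<dd><p class="name">Example Title</p></dd>')
print(doc('.name').text())   # -> Example Title

# fake_useragent: pick a random real-world browser User-Agent string.
ua = UserAgent()
print(ua.random)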
The code:
import json
from multiprocessing import Pool

import requests
from requests.exceptions import RequestException
from pyquery import PyQuery as pq
from fake_useragent import UserAgent


def gethtml(offset):
    """Fetch one page of the board; return the HTML text, or None on failure."""
    try:
        ua = UserAgent()
        # Send browser-like headers (including a random User-Agent) to get past
        # the site's anti-crawler checks.
        headerinfo = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Cookie": "uuid=1A6E888B4A4B29B16FBA1299108DBE9CE735E380ECAF25EF34C1CC12335D50D6; _csrf=e8acf6c957d5647db54a08ec5ec00849f7c0ef59dffcfa41e04822d7d8c5a730; _lxsdk_cuid=162a8e9c82bc8-0047562353d544-5e163117-1fa400-162a8e9c82bc8; _lxsdk=1A6E888B4A4B29B16FBA1299108DBE9CE735E380ECAF25EF34C1CC12335D50D6; __mta=56065556.1523252316455.1523252362387.1523252369506.6; _lxsdk_s=162a8e9c82b-699-947-0cd%7C%7C14",
            "Host": "maoyan.com",
            "Proxy-Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": ua.random,
        }
        response = requests.get("http://maoyan.com/board/4?offset=" + str(offset),
                                headers=headerinfo)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException as ex:
        print(ex)
        return None


def getContent(offset):
    """Parse one page and write every movie entry to the output file."""
    result = gethtml(offset)
    if result is None:
        return
    p = pq(result)
    for item in p(".board-wrapper dd").items():
        print(item('.name').text())
        write_to_file({"title": item('.name').text(),
                       "actor": item('.star').text()[3:],              # drop the leading "主演:" label
                       "releasetime": item('.releasetime').text()[5:],  # drop the leading "上映时间:" label
                       "score": item('.score').text()})


def write_to_file(content):
    """Append one record as a JSON line to maoyan.txt."""
    with open('maoyan.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + "\n")


if __name__ == '__main__':
    # Use a process pool (multiprocessing) to fetch the ten pages in parallel.
    p = Pool()
    p.map(getContent, [i * 10 for i in range(0, 10)])
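Each line appended to maoyan.txt is one standalone JSON object with the four keys collected above; a record looks roughly like the following (the values are placeholders, not real scrape results):

{"title": "Example Movie", "actor": "Actor A,Actor B", "releasetime": "1994-09-10", "score": "9.5"}

Pool() defaults to one worker process per CPU core, so the ten offsets are fetched concurrently rather than one after another.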
Source: oschina
Link: https://my.oschina.net/u/4411208/blog/4015799