The implementation details of this crawler were already covered in the earlier article "Crawling Douban Movie Review Data with a Crawler (Java Edition)". This post only presents the Python version, so let's go straight to the code.
Complete Code
Crawler Main Program
# Crawler entry point
from C02.data import processor
from C02.page import downloader, parser
from C02.url import manager


class Spider(object):

    def __init__(self):
        self.url_manager = manager.Manager()
        self.page_downloader = downloader.Downloader()
        self.page_parser = parser.Parser(base_url='https://movie.douban.com/subject/26752088/comments')
        self.data_processor = processor.Processor(host='192.168.0.105', collection='movie_26752088_comments')

    def start(self, root_url):
        """
        Start the crawler.
        :param root_url: the seed URL
        :return: the number of URLs crawled
        """
        nums = 0
        self.url_manager.append_new_urls([root_url])
        while self.url_manager.has_new_url():
            nums += 1
            new_url = self.url_manager.get_new_url()
            print('Downloading URL #{:03}: {}'.format(nums, new_url))
            html = self.page_downloader.download(new_url)
            if html is None:
                print('html is empty.')
                continue
            links, results = self.page_parser.parse(html)
            if len(links) > 0:
                self.url_manager.append_new_urls(links)
            if len(results) > 0:
                self.data_processor.process(results)
        return nums


if __name__ == "__main__":
    spider = Spider()
    nums = spider.start("https://movie.douban.com/subject/26752088/comments?start=0&limit=20&sort=new_score&status=P")
    print('Crawl finished; {} URLs fetched in total.'.format(nums))
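The imports at the top assume the four components live in a small C02 package. Judging from the module paths, the layout is presumably something like the following (the entry-point file name is a guess, not given in the original, and each directory would need an __init__.py if regular packages are used):

C02/
├── spider_main.py     # the entry point above (file name assumed)
├── url/
│   └── manager.py     # Manager (URL manager)
├── page/
│   ├── downloader.py  # Downloader (page downloader)
│   └── parser.py      # Parser (page parser)
└── data/
    └── processor.py   # Processor (data processor)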
URL Manager
def url_checker(fn):
    """
    URL validity checker (decorator).
    :param fn: the decorated function
    :return: the wrapped function
    """
    def wrapper(obj, urls):
        # URL format check: only URLs under
        #   https://movie.douban.com/subject/26752088/comments
        # are kept. This task only crawls comments for this one movie, so all other URLs are ignored.
        # Modifying the original urls list in place caused problems (the &percent_type= part
        # could not be removed reliably), so the filtered URLs are collected into a new list instead.
        lst = []
        for url in urls:
            # Drop URLs that do not match the comments path
            if '/subject/26752088/comments' not in url:
                continue
            # Strip the &percent_type= parameter to avoid crawling the same page twice
            # (some links carry this parameter and some do not)
            if '&percent_type=' in url:
                url = url.replace('&percent_type=', '')
            lst.append(url)
        return fn(obj, lst)
    return wrapper


class Manager(object):
    """
    Single-machine URL manager.
    """

    def __init__(self):
        self.new_urls = []
        self.old_urls = []

    @url_checker
    def append_new_urls(self, urls):
        if len(urls) == 0:
            return
        for url in urls:
            if url in self.new_urls or url in self.old_urls:
                continue
            self.new_urls.append(url)

    def has_new_url(self):
        return len(self.new_urls) > 0

    def get_new_url(self):
        """
        Pop a new URL. The URL is immediately moved to the crawled list,
        so failures during the actual download are not retried here.
        :return: the next URL to crawl
        """
        url = self.new_urls.pop()
        self.old_urls.append(url)
        return url
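As a quick illustration of how the url_checker decorator and the deduplication in append_new_urls interact, the snippet below (the sample URLs are made up for demonstration) feeds in three links: the non-comment URL is dropped, and the &percent_type= variant collapses into the same entry as its clean counterpart.

m = Manager()
m.append_new_urls([
    # kept, after stripping '&percent_type='
    'https://movie.douban.com/subject/26752088/comments?start=20&limit=20&percent_type=&sort=new_score&status=P',
    # dropped: not under /subject/26752088/comments
    'https://movie.douban.com/subject/26752088/',
    # duplicate of the first URL once '&percent_type=' is removed
    'https://movie.douban.com/subject/26752088/comments?start=20&limit=20&sort=new_score&status=P',
])
print(m.new_urls)
# ['https://movie.douban.com/subject/26752088/comments?start=20&limit=20&sort=new_score&status=P']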
Page Downloader
import requests


class Downloader(object):

    @staticmethod
    def download(url):
        try:
            # Without logging in, Douban only serves a limited number of comment pages.
            # The authentication logic is simplified here by copying my logged-in
            # session's cookie straight into the request headers.
            resp = requests.get(url,
                                headers={
                                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
                                    'Cookie': 'gr_user_id=b6c0778d-f8df-4963-b057-bd321593de1e; bid=T-M5aFmoLY0; __yadk_uid=WvMJfSHd1cjUFrFQTdN9KnkIOkR2AFZu; viewed="26311273_26877306_26340992_26649178_3199438_3015786_27038473_10793398_26754665"; ll="108296"; as="https://movie.douban.com/subject/26752088/comments?start=60&limit=20&sort=new_score&status=P"; ps=y; dbcl2="141556470:E4oz3is9RMY"; ck=OvCX; _vwo_uuid_v2=E57494AA9988242B62FB576F22211CE4|e95afc3b3a6c74f0b9d9106c6546e73e; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1531194535%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3D0saOVVzXJiEvkbYGxCXZ849EweAjA2om6cIvPZ7FxE35FrmKU8CfOHm1cC9Xs0JS%26wd%3D%26eqid%3De5307bbf0006c241000000045addc33f%22%5D; ap=1; _pk_id.100001.4cf6=cee42334e421195b.1522208966.4.1531200264.1531191315.; _pk_ses.100001.4cf6=*; __utma=30149280.1283677058.1481968276.1531190383.1531194536.34; __utmb=30149280.0.10.1531194536; __utmc=30149280; __utmz=30149280.1524482884.31.29.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utmv=30149280.14155; __utma=223695111.1691619874.1522208966.1531190383.1531194536.4; __utmb=223695111.0.10.1531194536; __utmc=223695111; __utmz=223695111.1524483025.2.2.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; push_noty_num=0; push_doumail_num=0'
                                },
                                timeout=3.0)
            resp.raise_for_status()
            return resp.text
        except requests.RequestException as e:
            print(e)
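Hardcoding a personal cookie into the source works, but it leaks easily when the code is shared. One possible tweak, assuming the cookie is exported beforehand in a DOUBAN_COOKIE environment variable (an assumed name, not part of the original code), is to build the headers like this:

import os

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
    # DOUBAN_COOKIE is a hypothetical environment variable; an empty value
    # simply degrades to an anonymous (and more limited) crawl.
    'Cookie': os.environ.get('DOUBAN_COOKIE', ''),
}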
Page Parser
from bs4 import BeautifulSoup
from urllib import parse


class Parser(object):

    def __init__(self, base_url=None):
        self.base_url = base_url

    def parse(self, html):
        soup = BeautifulSoup(html, 'html.parser')
        # Pagination links
        links = []
        for a in soup.select('#paginator > a'):
            if self.base_url is not None:
                links.append(parse.urljoin(self.base_url, a.get('href')))
            else:
                links.append(a.get('href'))
        # Extracted comment records
        results = []
        for div in soup.select('#comments > div.comment-item'):
            author = div.select_one('h3 > span.comment-info > a').get_text(strip=True)
            date = div.select_one('h3 > span.comment-info > span.comment-time').get_text(strip=True)
            rating = div.select_one('h3 > span.comment-info > span.rating')
            star = None
            if rating is not None:
                star = rating.get('class')[0].replace('allstar', '')
            vote = div.select_one('h3 > span.comment-vote > span.votes').get_text(strip=True)
            comment = div.select_one('div.comment > p').get_text(strip=True)
            results.append({
                'author': author,
                'date': date,
                'star': star,
                'vote': vote,
                'comment': comment
            })
        return links, results
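The CSS selectors above target the live Douban comment page, so a convenient way to sanity-check them offline is to run parse over a small hand-written fragment. The HTML below is a made-up sample that only imitates the relevant structure, not real page markup:

sample_html = '''
<div id="paginator">
  <a href="?start=20&limit=20&sort=new_score&status=P">后页 ></a>
</div>
<div id="comments">
  <div class="comment-item">
    <h3>
      <span class="comment-vote"><span class="votes">12</span></span>
      <span class="comment-info">
        <a href="https://www.douban.com/people/example/">example_user</a>
        <span class="allstar50 rating" title="力荐"></span>
        <span class="comment-time">2018-07-10</span>
      </span>
    </h3>
    <div class="comment"><p>好看!</p></div>
  </div>
</div>
'''

p = Parser(base_url='https://movie.douban.com/subject/26752088/comments')
links, results = p.parse(sample_html)
print(links)    # ['https://movie.douban.com/subject/26752088/comments?start=20&limit=20&sort=new_score&status=P']
print(results)  # [{'author': 'example_user', 'date': '2018-07-10', 'star': '50', 'vote': '12', 'comment': '好看!'}]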
Data Processor
import pymongo


class Processor(object):

    def __init__(self, host=None, port=27017, database='douban', collection='movie_26752088_comments'):
        self.client = pymongo.MongoClient(host=host, port=port)
        self.database = database
        self.collection = collection

    def __del__(self):
        self.client.close()

    def process(self, results):
        comments = self.client.get_database(self.database).get_collection(self.collection)
        # insert_many is the current PyMongo API; the older Collection.insert is deprecated
        return comments.insert_many(results)
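For completeness, a minimal standalone usage sketch; the localhost address and the sample record are placeholders, and the record shape matches what Parser.parse produces:

# Assumes a MongoDB instance reachable on localhost:27017
p = Processor(host='localhost')
p.process([{
    'author': 'example_user',
    'date': '2018-07-10',
    'star': '50',
    'vote': '12',
    'comment': '好看!'
}])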
Closing Remarks
This is purely a practice project, so please go easy on it.