- Incremental crawling
  - Concept: used to monitor a website for newly added or updated data.
  - Core mechanism: deduplication, implemented here with a Redis set.
- Summary of anti-crawl mechanisms:
  - robots
  - UA spoofing
  - CAPTCHAs
  - proxies
  - cookies
  - dynamically changing request parameters
  - JS encryption
  - JS obfuscation
  - image lazy loading
  - capture of dynamically loaded data
  - selenium: evading detection

Steps:

1. Create the project: `scrapy startproject <project name>`
2. Create the spider: `scrapy genspider -t crawl zjs www.xxx.com`
3. In the settings file, configure UA spoofing, the log level, and robots handling (a sketch follows this list)
4. In the spider, parse the movie name, detail-page URL, and description
5. Declare the fields in items (see the sketch after this list)
6. Import the item class into the spider
7. In pipelines, configure how the scraped data is written out (a sketch follows the spider code below)
8. Enable the pipeline in the settings file
9. Run the spider: `scrapy crawl zjs`
10. Redis: start the client with `redis-cli.exe`; list all keys: `keys *`; check the length: `llen moiveData`; query the data: `smembers movie_detail_urls`; clear everything: `flushall`
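Steps 3, 5, and 8 refer to `settings.py` and `items.py`, which are not reproduced in the post. Below is a minimal sketch of what they might look like: the user-agent string, the `ROBOTSTXT_OBEY`/`LOG_LEVEL` values, and the `Zjspro1Pipeline` class name are assumptions, while the `Zjspro1Item` fields come from the spider code.

```python
# settings.py -- the pieces steps 3 and 8 refer to (values here are assumptions)
BOT_NAME = 'zjsPro1'
SPIDER_MODULES = ['zjsPro1.spiders']
NEWSPIDER_MODULE = 'zjsPro1.spiders'

# UA spoofing: any current browser user-agent string will do
USER_AGENT = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
              'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0 Safari/537.36')

ROBOTSTXT_OBEY = False   # do not honor robots.txt
LOG_LEVEL = 'ERROR'      # keep console output to errors only

# step 8: enable the item pipeline (class name assumed to be the generated default)
ITEM_PIPELINES = {
    'zjsPro1.pipelines.Zjspro1Pipeline': 300,
}
```

```python
# items.py -- the two fields the spider populates (step 5)
import scrapy


class Zjspro1Item(scrapy.Item):
    name = scrapy.Field()   # movie title
    desc = scrapy.Field()   # movie description
```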
zjs.py
```python
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from redis import Redis
from zjsPro1.items import Zjspro1Item


class ZjsSpider(CrawlSpider):
    conn = Redis(host='127.0.0.1', port=6379)
    name = 'zjs'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://www.4567tv.tv/index.php/vod/show/class/%E7%88%B1%E6%83%85/id/1.html']

    rules = (
        Rule(LinkExtractor(allow=r'/page/\d+\.html'), callback='parse_item', follow=False),
    )

    def parse_item(self, response):
        # Parse the movie name and detail-page URL
        li_list = response.xpath('/html/body/div[1]/div/div/div/div[2]/ul/li')
        for li in li_list:
            name = li.xpath('./div/a/@title').extract_first()
            detail_url = 'https://www.4567kan.com/' + li.xpath('./div/a/@href').extract_first()
            item = Zjspro1Item()
            item['name'] = name
            # Record the detail-page URLs of movies that have already been crawled.
            # ex == 0: insert failed (URL already in the set)
            # ex == 1: insert succeeded (newly seen URL)
            ex = self.conn.sadd("movie_detail_urls", detail_url)
            if ex == 1:
                print('Captured newly updated data!')
                yield scrapy.Request(detail_url, callback=self.parse_detail, meta={'item': item})
            else:
                print('No new data to update!!!')

    def parse_detail(self, response):
        # Parse the description
        desc = response.xpath('/html/body/div[1]/div/div/div/div[2]/p[5]/span[3]/text()').extract_first()
        item = response.meta['item']
        item['desc'] = desc
        yield item
```
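The pipeline from steps 7 and 8 is not shown in the post. A minimal sketch of what it might look like, assuming it reuses the Redis connection created on the spider class and appends each item to a Redis list named `moiveData` (the key checked with `llen` in step 10); the class name and the JSON serialization are assumptions.

```python
# pipelines.py -- a sketch; class name, list key, and serialization are assumptions
import json


class Zjspro1Pipeline:
    def process_item(self, item, spider):
        # Reuse the spider's Redis connection and append the scraped movie
        # record to the `moiveData` list as a JSON string.
        spider.conn.lpush('moiveData', json.dumps(dict(item), ensure_ascii=False))
        return item
```

With this in place, `llen moiveData` returns how many records have been stored, and `smembers movie_detail_urls` shows the dedup set the spider maintains; on a second run, only detail pages that were not seen before pass the `sadd` check and reach the pipeline.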
Source: https://www.cnblogs.com/zhang-da/p/12444118.html