Requirement: scrape the complaint posts — each post's number, URL, title, and body content.
1. Rule-based crawler (CrawlSpider) -- scrapy genspider -t crawl Question wz.sun0769.com
**Question.py
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from Dongguan.items import DongguanItem


class QuestionSpider(CrawlSpider):
    name = 'Question'
    allowed_domains = ['wz.sun0769.com']
    start_urls = ['http://wz.sun0769.com/index.php/question/questionType?type=4&page=0']

    rules = (
        # If a Rule has no callback, follow defaults to True, so the crawl keeps going deeper.
        # Follow the list/pagination pages (their links contain type=4)
        Rule(LinkExtractor(allow=r'type=4'), follow=True),
        # Match the individual post pages and hand them to parse_item
        Rule(LinkExtractor(allow=r'question/\d+/\d+\.shtml'), process_links="handle_links",
             callback='parse_item', follow=True),
    )

    # Extracted links can be inspected and fixed up here before the requests are sent
    def handle_links(self, links):
        for link in links:
            print("link====", link)
        return links

    # Parse the detail page of a post
    def parse_item(self, response):
        item = DongguanItem()
        # Post URL
        url = response.url
        title_number = response.xpath('//div[@class="pagecenter p3"]/div/div/div/strong/text()').extract()
        if len(title_number) > 0:
            title_number = title_number[0]
            # e.g. 编号:191166
            # Post number
            number = title_number.split("\xa0\xa0")[1]
            number = number.split(":")[1]
            # Post title
            title = title_number.split("\xa0\xa0")[0]
            title = title.split(":")[1]
            item["title"] = title
            item["number"] = number
        content = response.xpath('//div[@class="c1 text14_2"]/text()|//div[@class="contentext"]/text()').extract()
        # Join the list of text nodes into a single string with ""
        content = "".join(content).strip()
        item["url"] = url
        item["content"] = content
        yield item
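The parsing in parse_item assumes the <strong> text on a detail page has the shape "title\xa0\xa0编号:NNNNNN" (two non-breaking spaces between the two halves). A minimal sketch of that split logic, run on a made-up sample string rather than a real response:

# Made-up sample of the <strong> text; the real value comes from response.xpath(...)
title_number = "提问:道路积水问题\xa0\xa0编号:191166"

number = title_number.split("\xa0\xa0")[1]   # "编号:191166"
number = number.split(":")[1]                # "191166"

title = title_number.split("\xa0\xa0")[0]    # "提问:道路积水问题"
title = title.split(":")[1]                  # "道路积水问题"

print(title, number)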
2. Plain Spider version -- scrapy genspider Question2 wz.sun0769.com
**Question2.py
import scrapy
from Dongguan.items import DongguanItem


class Question2Spider(scrapy.Spider):
    name = 'Question2'
    allowed_domains = ['wz.sun0769.com']
    # Page offset
    offset = 0
    url = "http://wz.sun0769.com/index.php/question/questionType?type=4&page="
    start_urls = [url + str(offset)]

    # Parse the detail page of a post
    def process_item(self, response):
        item = DongguanItem()
        # Post URL
        url = response.url
        title_number = response.xpath('//div[@class="pagecenter p3"]/div/div/div/strong/text()').extract()
        if len(title_number) > 0:
            title_number = title_number[0]
            # e.g. 编号:191166
            # Post number
            number = title_number.split("\xa0\xa0")[1]
            number = number.split(":")[1]
            # Post title
            title = title_number.split("\xa0\xa0")[0]
            title = title.split(":")[1]
            item["title"] = title
            item["number"] = number
        content = response.xpath('//div[@class="c1 text14_2"]/text()|//div[@class="contentext"]/text()').extract()
        # Join the list of text nodes into a single string with ""
        content = "".join(content).strip()
        item["url"] = url
        item["content"] = content
        yield item

    def parse(self, response):
        # All post links on the current list page
        current_page_link = response.xpath('//a[@class="news14"]/@href').extract()
        print(current_page_link)
        for link in current_page_link:
            # Request each post's detail page
            yield scrapy.Request(link, callback=self.process_item)
        # Build the next list page
        if self.offset < 93630:
            self.offset += 30
            # URL of the next list page
            new_url = self.url + str(self.offset)
            yield scrapy.Request(new_url, callback=self.parse)
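The plain Spider walks the list pages itself: the page query parameter grows by 30 per page until the hard-coded upper bound of 93630. A quick sketch of the first few URLs that parse() queues (only the pattern matters here):

base = "http://wz.sun0769.com/index.php/question/questionType?type=4&page="
offset = 0
while offset <= 60:          # just the first three pages, for illustration
    print(base + str(offset))
    offset += 30
# .../questionType?type=4&page=0
# .../questionType?type=4&page=30
# .../questionType?type=4&page=60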
3. pipelines.py -- shared by the CrawlSpider (rule-based) and the plain Spider versions
import json


class DongguanPipeline(object):
    def open_spider(self, spider):
        # Create the output file, named after the spider
        self.file = open(spider.name + ".json", "w", encoding="utf-8")

    def process_item(self, item, spider):
        # Convert the item to a plain Python dict
        python_dict = dict(item)
        # Serialize to a JSON string, one object per line
        python_str = json.dumps(python_dict, ensure_ascii=False) + "\n"
        self.file.write(python_str)
        return item

    def close_spider(self, spider):
        self.file.close()
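The pipeline writes one JSON object per line (JSON Lines), so the output can be read back line by line. A small sketch, assuming the CrawlSpider (spider name "Question", hence Question.json) has been run:

import json

with open("Question.json", encoding="utf-8") as f:
    for line in f:
        post = json.loads(line)
        print(post["number"], post["title"], post["url"])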
4. items.py -- shared by the CrawlSpider (rule-based) and the plain Spider versions
import scrapy


class DongguanItem(scrapy.Item):
    # define the fields for your item here like:
    # Title of each post
    title = scrapy.Field()
    # Number of each post
    number = scrapy.Field()
    # Content of each post
    content = scrapy.Field()
    # URL of each post
    url = scrapy.Field()
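A scrapy.Item behaves like a dict with a fixed set of declared fields; assigning a key that was not declared raises KeyError. A short illustration with made-up values:

from Dongguan.items import DongguanItem

item = DongguanItem()
item["title"] = "示例标题"        # hypothetical values, just to show the API
item["number"] = "191166"
item["url"] = "http://wz.sun0769.com/..."
item["content"] = "示例内容"
print(dict(item))
# item["author"] = "x"  # would raise KeyError: DongguanItem does not support field: author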
5. settings.py -- shared by the CrawlSpider (rule-based) and the plain Spider versions
# Whether to obey the site's robots.txt
ROBOTSTXT_OBEY = False
ITEM_PIPELINES = {
    'Dongguan.pipelines.DongguanPipeline': 300,
}
# Logging
LOG_FILE = "dongguan.log"
LOG_LEVEL = "DEBUG"
# User agent
USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"
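Either spider can then be started from the project directory with scrapy crawl Question or scrapy crawl Question2. Alternatively, a small launcher script can run them in-process; a sketch, assuming it lives at the project root next to scrapy.cfg:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Load settings.py (pipeline, log file, user agent) and run one spider by name
process = CrawlerProcess(get_project_settings())
process.crawl("Question")      # or "Question2" for the plain Spider version
process.start()                # blocks until the crawl finishes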
Source: oschina
Link: https://my.oschina.net/u/3892643/blog/1844850