Requirement: scrape the complaint posts — each post's number, URL, title, and body content.
1. Rule-based crawler (CrawlSpider) -- scrapy genspider -t crawl Question wz.sun0769.com
**Question.py
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from Dongguan.items import DongguanItem


class QuestionSpider(CrawlSpider):
    name = 'Question'
    allowed_domains = ['wz.sun0769.com']
    start_urls = ['http://wz.sun0769.com/index.php/question/questionType?type=4&page=0']

    rules = (
        # If a Rule has no callback, follow defaults to True, so the crawl keeps going deeper.
        # Follow the list/pagination pages (their links contain type=4)
        Rule(LinkExtractor(allow=r'type=4'), follow=True),
        # Match the individual post pages and hand them to parse_item
        Rule(LinkExtractor(allow=r'question/\d+/\d+\.shtml'), process_links="handle_links",
             callback='parse_item', follow=True),
    )

    # Extracted links can be inspected and fixed up here before the requests are sent
    def handle_links(self, links):
        for link in links:
            print("link====", link)
        return links

    # Parse the detail page of a post
    def parse_item(self, response):
        item = DongguanItem()
        # Post URL
        url = response.url
        title_number = response.xpath('//div[@class="pagecenter p3"]/div/div/div/strong/text()').extract()
        if len(title_number) > 0:
            title_number = title_number[0]
            # e.g. 编号:191166
            # Post number
            number = title_number.split("\xa0\xa0")[1]
            number = number.split(":")[1]
            # Post title
            title = title_number.split("\xa0\xa0")[0]
            title = title.split(":")[1]
            item["title"] = title
            item["number"] = number
        content = response.xpath('//div[@class="c1 text14_2"]/text()|//div[@class="contentext"]/text()').extract()
        # Join the list of text nodes into a single string with ""
        content = "".join(content).strip()
        item["url"] = url
        item["content"] = content
        yield item
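The parsing in parse_item assumes the <strong> text on a detail page has the shape "title\xa0\xa0编号:NNNNNN" (two non-breaking spaces between the two halves). A minimal sketch of that split logic, run on a made-up sample string rather than a real response:

# Made-up sample of the <strong> text; the real value comes from response.xpath(...)
title_number = "提问:道路积水问题\xa0\xa0编号:191166"

number = title_number.split("\xa0\xa0")[1]   # "编号:191166"
number = number.split(":")[1]                # "191166"

title = title_number.split("\xa0\xa0")[0]    # "提问:道路积水问题"
title = title.split(":")[1]                  # "道路积水问题"

print(title, number)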
2. Plain Spider version -- scrapy genspider Question2 wz.sun0769.com
**Question2.py
import scrapy
from Dongguan.items import DongguanItem


class Question2Spider(scrapy.Spider):
    name = 'Question2'
    allowed_domains = ['wz.sun0769.com']
    # Page offset
    offset = 0
    url = "http://wz.sun0769.com/index.php/question/questionType?type=4&page="
    start_urls = [url + str(offset)]

    # Parse the detail page of a post
    def process_item(self, response):
        item = DongguanItem()
        # Post URL
        url = response.url
        title_number = response.xpath('//div[@class="pagecenter p3"]/div/div/div/strong/text()').extract()
        if len(title_number) > 0:
            title_number = title_number[0]
            # e.g. 编号:191166
            # Post number
            number = title_number.split("\xa0\xa0")[1]
            number = number.split(":")[1]
            # Post title
            title = title_number.split("\xa0\xa0")[0]
            title = title.split(":")[1]
            item["title"] = title
            item["number"] = number
        content = response.xpath('//div[@class="c1 text14_2"]/text()|//div[@class="contentext"]/text()').extract()
        # Join the list of text nodes into a single string with ""
        content = "".join(content).strip()
        item["url"] = url
        item["content"] = content
        yield item

    def parse(self, response):
        # All post links on the current list page
        current_page_link = response.xpath('//a[@class="news14"]/@href').extract()
        print(current_page_link)
        for link in current_page_link:
            # Request each post's detail page
            yield scrapy.Request(link, callback=self.process_item)
        # Build the next list page
        if self.offset < 93630:
            self.offset += 30
            # URL of the next list page
            new_url = self.url + str(self.offset)
            yield scrapy.Request(new_url, callback=self.parse)
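The plain Spider walks the list pages itself: the page query parameter grows by 30 per page until the hard-coded upper bound of 93630. A quick sketch of the first few URLs that parse() queues (only the pattern matters here):

base = "http://wz.sun0769.com/index.php/question/questionType?type=4&page="
offset = 0
while offset <= 60:          # just the first three pages, for illustration
    print(base + str(offset))
    offset += 30
# .../questionType?type=4&page=0
# .../questionType?type=4&page=30
# .../questionType?type=4&page=60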
3. pipelines.py -- shared by the CrawlSpider (rule-based) and the plain Spider versions
import json


class DongguanPipeline(object):
    def open_spider(self, spider):
        # Create the output file, named after the spider
        self.file = open(spider.name + ".json", "w", encoding="utf-8")

    def process_item(self, item, spider):
        # Convert the item to a plain Python dict
        python_dict = dict(item)
        # Serialize to a JSON string, one object per line
        python_str = json.dumps(python_dict, ensure_ascii=False) + "\n"
        self.file.write(python_str)
        return item

    def close_spider(self, spider):
        self.file.close()
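The pipeline writes one JSON object per line (JSON Lines), so the output can be read back line by line. A small sketch, assuming the CrawlSpider (spider name "Question", hence Question.json) has been run:

import json

with open("Question.json", encoding="utf-8") as f:
    for line in f:
        post = json.loads(line)
        print(post["number"], post["title"], post["url"])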
4. items.py -- shared by the CrawlSpider (rule-based) and the plain Spider versions
import scrapy


class DongguanItem(scrapy.Item):
    # define the fields for your item here like:
    # Title of each post
    title = scrapy.Field()
    # Number of each post
    number = scrapy.Field()
    # Content of each post
    content = scrapy.Field()
    # URL of each post
    url = scrapy.Field()
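A scrapy.Item behaves like a dict with a fixed set of declared fields; assigning a key that was not declared raises KeyError. A short illustration with made-up values:

from Dongguan.items import DongguanItem

item = DongguanItem()
item["title"] = "示例标题"        # hypothetical values, just to show the API
item["number"] = "191166"
item["url"] = "http://wz.sun0769.com/..."
item["content"] = "示例内容"
print(dict(item))
# item["author"] = "x"  # would raise KeyError: DongguanItem does not support field: author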
5. settings.py -- shared by the CrawlSpider (rule-based) and the plain Spider versions
# Whether to obey the site's robots.txt
ROBOTSTXT_OBEY = False
ITEM_PIPELINES = {
    'Dongguan.pipelines.DongguanPipeline': 300,
}
# Logging
LOG_FILE = "dongguan.log"
LOG_LEVEL = "DEBUG"
# User agent
USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"
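Either spider can then be started from the project directory with scrapy crawl Question or scrapy crawl Question2. Alternatively, a small launcher script can run them in-process; a sketch, assuming it lives at the project root next to scrapy.cfg:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Load settings.py (pipeline, log file, user agent) and run one spider by name
process = CrawlerProcess(get_project_settings())
process.crawl("Question")      # or "Question2" for the plain Spider version
process.start()                # blocks until the crawl finishes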
Source: oschina
Link: https://my.oschina.net/u/3892643/blog/1844850