How to stop scrapy spider after certain number of requests?

Backend · open · 5 answers · 643 views
情书的邮戳 · asked 2021-02-04 20:01

I am developing a simple scraper to get 9GAG posts and their images, but due to some technical difficulties I am unable to stop the scraper and it keeps on scraping. How can I make it stop after a certain number of requests?

5 Answers
  •  渐次进展
    2021-02-04 20:47

    You can use custom_settings with CLOSESPIDER_PAGECOUNT, as shown below. Once the spider has crawled that many responses, the CloseSpider extension (enabled by default) shuts it down.

    # -*- coding: utf-8 -*-
    import scrapy
    from scrapy import Item, Field
    
    
    class GagItem(Item):
        entry_id = Field()
        url = Field()
        votes = Field()
        comments = Field()
        title = Field()
        img_url = Field()
    
    
    class FirstSpider(scrapy.Spider):
    
        name = "first"
        allowed_domains = ["9gag.com"]
        start_urls = ('http://www.9gag.com/', )
        last_gag_id = None
    
        COUNT_MAX = 30
    
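        # close the spider after this many responses have been crawled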
        custom_settings = {
            'CLOSESPIDER_PAGECOUNT': COUNT_MAX
        }
    
        def parse(self, response):
    
            for article in response.xpath('//article'):
                # extract_first() returns None instead of raising
                # IndexError when an attribute is missing
                gag_id = article.xpath('@data-entry-id').extract_first()
                if gag_id is None:
                    continue
                ninegag_item = GagItem()
                ninegag_item['entry_id'] = gag_id
                ninegag_item['url'] = article.xpath('@data-entry-url').extract_first()
                ninegag_item['votes'] = article.xpath('@data-entry-votes').extract_first()
                ninegag_item['img_url'] = article.xpath('.//div[1]/a/img/@src').extract_first()
                self.last_gag_id = gag_id
                yield ninegag_item
    
            # yield the next-page request once per response (outside the
            # loop), not once per article, otherwise the crawl branches
            # into ever more requests and never stops
            next_url = 'http://9gag.com/?id=%s&c=10' % self.last_gag_id
            yield scrapy.Request(url=next_url, callback=self.parse)
    
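    Note that CLOSESPIDER_PAGECOUNT counts downloaded responses, not scraped items; with roughly ten posts per response (the c=10 parameter above), the spider stops after about COUNT_MAX pages. If the limit should apply to items instead, there is an analogous CLOSESPIDER_ITEMCOUNT setting. For conditions the built-in settings don't cover, you can also raise scrapy.exceptions.CloseSpider from a callback. Here is a minimal sketch of that approach, where ITEM_MAX and the item_count counter are illustrative names, not part of the code above:

    import scrapy
    from scrapy.exceptions import CloseSpider


    class StoppingSpider(scrapy.Spider):
        name = "stopping"
        start_urls = ('http://www.9gag.com/', )

        ITEM_MAX = 30    # illustrative threshold: stop after this many posts
        item_count = 0   # illustrative running counter

        def parse(self, response):
            for article in response.xpath('//article'):
                self.item_count += 1
                if self.item_count > self.ITEM_MAX:
                    # tell Scrapy to close the spider gracefully
                    raise CloseSpider('reached %d posts' % self.ITEM_MAX)
                yield {'entry_id': article.xpath('@data-entry-id').extract_first()}

    With either mechanism the shutdown is graceful, so a few in-flight requests may still complete before the spider actually closes.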
