I am developing a simple scraper to collect 9GAG posts and their images, but due to some technical difficulties I am unable to stop the scraper, and it keeps on scraping indefinitely.
One can use `custom_settings` with `CLOSESPIDER_PAGECOUNT` to stop the spider after a fixed number of pages, as shown below.
# -*- coding: utf-8 -*-
import scrapy
from scrapy import Item, Field
class GagItem(Item):
    """Container for the data scraped from a single 9GAG post."""

    # Value of the article's ``data-entry-id`` attribute.
    entry_id = Field()
    # Value of the article's ``data-entry-url`` attribute.
    url = Field()
    # Value of the article's ``data-entry-votes`` attribute.
    votes = Field()
    # Declared but not populated by FirstSpider.parse in this file.
    comments = Field()
    # Declared but not populated by FirstSpider.parse in this file.
    title = Field()
    # List of ``src`` values of the images found inside the article.
    img_url = Field()
class FirstSpider(scrapy.Spider):
    """Scrape 9GAG posts, following the site's id-based pagination.

    The crawl is bounded by the ``CLOSESPIDER_PAGECOUNT`` setting (see
    ``custom_settings``); it also stops by itself as soon as a page
    yields no usable article, since there is then no cursor to follow.
    """

    name = "first"
    allowed_domains = ["9gag.com"]
    start_urls = ('http://www.9gag.com/', )

    # Entry id of the most recently scraped post. Kept as an attribute for
    # backward compatibility; parse() relies on a local cursor instead.
    last_gag_id = None

    # Maximum number of responses to crawl before the spider is closed.
    COUNT_MAX = 30

    custom_settings = {
        'CLOSESPIDER_PAGECOUNT': COUNT_MAX
    }

    def parse(self, response):
        """Yield one GagItem per article, then a request for the next page.

        Articles without a ``data-entry-id`` attribute are skipped because
        they can neither be identified nor used as a pagination cursor.
        A missing ``data-entry-url`` or ``data-entry-votes`` attribute is
        stored as ``None`` instead of raising ``IndexError``.
        """
        # Local cursor: the spider instance is shared by every in-flight
        # callback, so the attribute alone may be overwritten by another page.
        last_id_on_page = None

        for article in response.xpath('//article'):
            entry_id = article.xpath('@data-entry-id').extract_first()
            if entry_id is None:
                continue

            ninegag_item = GagItem()
            ninegag_item['entry_id'] = entry_id
            ninegag_item['url'] = article.xpath(
                '@data-entry-url').extract_first()
            ninegag_item['votes'] = article.xpath(
                '@data-entry-votes').extract_first()
            # Kept as a list, as before: an article may hold several images.
            ninegag_item['img_url'] = article.xpath(
                './/div[1]/a/img/@src').extract()

            self.last_gag_id = last_id_on_page = entry_id
            yield ninegag_item

        # Nothing scraped on this page: requesting "?id=None" (or a stale id)
        # would only keep the crawl going without producing new posts.
        if last_id_on_page is None:
            return

        next_url = 'http://9gag.com/?id=%s&c=10' % last_id_on_page
        yield scrapy.Request(url=next_url, callback=self.parse)