I am trying to use Scrapy to scrape a website that has several pages of information. The idea is to increment pageNumber until no titles are found; if there are no titles on a page, raise a CloseSpider exception to stop the spider:
from scrapy.spider import BaseSpider
from scrapy.selector import Selector
from scrapy.exceptions import CloseSpider
from scrapy.http import Request
from tcgplayer1.items import Tcgplayer1Item
# URL template for the paginated card listing; %d is the 1-based page number.
URL = "http://store.tcgplayer.com/magic/journey-into-nyx?pageNumber=%d"
class MySpider(BaseSpider):
    """Walk the paginated card listing one page at a time.

    Each ``parse`` call scrapes every ``div.magicCard`` on the page and then
    schedules the next page; when a page contains no cards, the spider stops
    by raising :class:`CloseSpider`.
    """

    name = "tcg"
    allowed_domains = ["tcgplayer.com"]
    start_urls = [URL % 1]

    def __init__(self, *args, **kwargs):
        # Forward to BaseSpider so Scrapy's own spider initialization
        # (name/start_urls handling) still runs; the original override
        # silently skipped it.
        super(MySpider, self).__init__(*args, **kwargs)
        self.page_number = 1

    def parse(self, response):
        # Use the spider's logger instead of bare print statements.
        self.log("scraping page %d" % self.page_number)
        sel = Selector(response)
        titles = sel.xpath("//div[@class='magicCard']")
        if not titles:
            # An empty page means we have walked past the last page.
            raise CloseSpider('No more pages')
        for title in titles:
            # Skip malformed card blocks instead of crashing the whole
            # crawl with an IndexError on extract()[0].
            names = title.xpath(".//li[@class='cardName']/a/text()").extract()
            if not names:
                continue
            item = Tcgplayer1Item()
            item["cardname"] = names[0]
            # NOTE: the trailing space in 'vendor ' matches the site's
            # actual class attribute — do not "fix" it.
            vendor = title.xpath(".//tr[@class='vendor ']")
            item["price"] = vendor.xpath("normalize-space(.//td[@class='price']/text())").extract()
            item["quantity"] = vendor.xpath("normalize-space(.//td[@class='quantity']/text())").extract()
            item["shipping"] = vendor.xpath("normalize-space(.//span[@class='shippingAmount']/text())").extract()
            item["condition"] = vendor.xpath("normalize-space(.//td[@class='condition']/a/text())").extract()
            item["vendors"] = vendor.xpath("normalize-space(.//td[@class='seller']/a/text())").extract()
            yield item
        # Pages are fetched strictly one after another (each parse schedules
        # only the next page), so a plain instance counter is safe here.
        self.page_number += 1
        yield Request(URL % self.page_number)
This particular spider will go through all 8 pages of the data, then stop.
Hope that helps.