Scrapy crawl with next page

前端 未结 2 1608
失恋的感觉
失恋的感觉 2020-12-18 03:49

I have this code for the Scrapy framework:

# -*- coding: utf-8 -*-
import scrapy
from scrapy.contrib.spiders import Rule
from scrapy.linkextractors import LinkExtractor


        
相关标签:
2条回答
  • 2020-12-18 04:28

    Your rule is not used because you don't use a CrawlSpider.

    So you have to create the next page requests manually like so:

    # -*- coding: utf-8 -*-
    import scrapy
    from scrapy.contrib.spiders import Rule
    from scrapy.linkextractors import LinkExtractor
    from lxml import html
    
    class Scrapy1Spider(scrapy.Spider):
        """Crawl Craigslist SF nonprofit listings, following "next" links manually.

        A plain ``scrapy.Spider`` ignores CrawlSpider ``rules``, so the unused
        ``Rules`` attribute from the question is dropped and pagination is
        handled explicitly inside :meth:`parse`.
        """
        name = "craiglist"
        allowed_domains = ["sfbay.craigslist.org"]
        start_urls = (
            'http://sfbay.craigslist.org/search/npo',
        )
    
        def parse(self, response):
            """Count listing rows on the page and follow the next-page link.

            Yields a new ``scrapy.Request`` for the next results page when one
            exists; with no explicit callback it is dispatched back to parse().
            """
            site = html.fromstring(response.body_as_unicode())
            titles = site.xpath('//div[@class="content"]/p[@class="row"]')
            # Python 3 print function; the original 'print len(...)' is Python-2-only.
            print(len(titles), 'AAAA')
    
            # Follow next page link, if present.
            next_page = response.xpath('.//a[@class="button next"]/@href').extract()
            if next_page:
                # urljoin resolves relative hrefs against the response URL
                # instead of hard-coding the host.
                yield scrapy.Request(url=response.urljoin(next_page[0]))
    

    Or use the CrawlSpider like so:

    # -*- coding: utf-8 -*-
    import scrapy
    from scrapy.spiders import CrawlSpider, Rule
    from scrapy.linkextractors import LinkExtractor
    from lxml import html
    
    class Scrapy1Spider(CrawlSpider):
        """Crawl Craigslist SF nonprofit listings using CrawlSpider rules.

        The ``rules`` tuple (lowercase — CrawlSpider only honors that exact
        attribute name) follows every "next" pagination link and routes each
        page to :meth:`parse_page`. Note: CrawlSpider reserves ``parse`` for
        its own machinery, hence the custom callback name.
        """
        name = "craiglist"
        allowed_domains = ["sfbay.craigslist.org"]
        start_urls = (
            'http://sfbay.craigslist.org/search/npo',
        )
    
        rules = (Rule(LinkExtractor(allow=(), restrict_xpaths=('//a[@class="button next"]',)), callback="parse_page", follow= True),)
    
        def parse_page(self, response):
            """Count listing rows on one results page."""
            site = html.fromstring(response.body_as_unicode())
            titles = site.xpath('//div[@class="content"]/p[@class="row"]')
            # Python 3 print function; the original 'print len(...)' is Python-2-only.
            print(len(titles), 'AAAA')
    
    0 讨论(0)
  • 2020-12-18 04:32

    try this

      next_page_url = response.xpath('//a[@class="button next"]').extract_first()
    
      if next_page_url is not None:
            yield scrapy.Request(response.urljoin(next_page_url))
    
    0 讨论(0)
提交回复
热议问题