Scrapy crawl with next page

前端 未结 2 1608
失恋的感觉
失恋的感觉 2020-12-18 03:49

I have this code for the Scrapy framework:

# -*- coding: utf-8 -*-
import scrapy
from scrapy.contrib.spiders import Rule
from scrapy.linkextractors import LinkExtractor


        
相关标签:
2条回答
  • 2020-12-18 04:28

    Your rule is not used because you don't use a CrawlSpider.

    So you have to create the next page requests manually like so:

    # -*- coding: utf-8 -*-
    import scrapy
    from scrapy.contrib.spiders import Rule
    from scrapy.linkextractors import LinkExtractor
    from lxml import html
    
    class Scrapy1Spider(scrapy.Spider):
        """Crawl Craigslist SF nonprofit listings, following "next" links manually.

        A plain ``scrapy.Spider`` ignores CrawlSpider ``rules``, so the unused
        ``Rules`` attribute from the question is dropped and pagination is
        handled explicitly inside :meth:`parse`.
        """
        name = "craiglist"
        allowed_domains = ["sfbay.craigslist.org"]
        start_urls = (
            'http://sfbay.craigslist.org/search/npo',
        )
    
        def parse(self, response):
            """Count listing rows on the page and follow the next-page link.

            Yields a new ``scrapy.Request`` for the next results page when one
            exists; with no explicit callback it is dispatched back to parse().
            """
            site = html.fromstring(response.body_as_unicode())
            titles = site.xpath('//div[@class="content"]/p[@class="row"]')
            # Python 3 print function; the original 'print len(...)' is Python-2-only.
            print(len(titles), 'AAAA')
    
            # Follow next page link, if present.
            next_page = response.xpath('.//a[@class="button next"]/@href').extract()
            if next_page:
                # urljoin resolves relative hrefs against the response URL
                # instead of hard-coding the host.
                yield scrapy.Request(url=response.urljoin(next_page[0]))
    

    Or use the CrawlSpider like so:

    # -*- coding: utf-8 -*-
    import scrapy
    from scrapy.spiders import CrawlSpider, Rule
    from scrapy.linkextractors import LinkExtractor
    from lxml import html
    
    class Scrapy1Spider(CrawlSpider):
        """Crawl Craigslist SF nonprofit listings using CrawlSpider rules.

        The ``rules`` tuple (lowercase — CrawlSpider only honors that exact
        attribute name) follows every "next" pagination link and routes each
        page to :meth:`parse_page`. Note: CrawlSpider reserves ``parse`` for
        its own machinery, hence the custom callback name.
        """
        name = "craiglist"
        allowed_domains = ["sfbay.craigslist.org"]
        start_urls = (
            'http://sfbay.craigslist.org/search/npo',
        )
    
        rules = (Rule(LinkExtractor(allow=(), restrict_xpaths=('//a[@class="button next"]',)), callback="parse_page", follow= True),)
    
        def parse_page(self, response):
            """Count listing rows on one results page."""
            site = html.fromstring(response.body_as_unicode())
            titles = site.xpath('//div[@class="content"]/p[@class="row"]')
            # Python 3 print function; the original 'print len(...)' is Python-2-only.
            print(len(titles), 'AAAA')
    
    0 讨论(0)
  • 2020-12-18 04:32

    try this

      next_page_url = response.xpath('//a[@class="button next"]').extract_first()
    
      if next_page_url is not None:
            yield scrapy.Request(response.urljoin(next_page_url))
    
    0 讨论(0)
提交回复
热议问题