Scrapy put two spiders in single file

前端 未结 2 1731
滥情空心
滥情空心 2021-01-14 05:51

I have written two spiders in a single file. When I ran scrapy runspider two_spiders.py, only the first spider was executed. How can I run both of them without splitting them into separate files?

2条回答
  •  星月不相逢
    2021-01-14 06:18

    Here is a full Scrapy project with 2 spiders in one file.

    # quote_spiders.py
    import json
    import string
    
    import scrapy
    from scrapy.crawler import CrawlerProcess
    from scrapy.item import Item, Field
    
    
    class TextCleaningPipeline(object):
        """Item pipeline that normalizes quote text.

        Strips ASCII punctuation and the curly quotation marks that
        quotes.toscrape.com wraps each quote in, then lower-cases the text.
        """

        # Build the deletion table once at class-definition time instead of on
        # every item (the original rebuilt it per call). Mapping a character to
        # None in str.maketrans deletes it; curly quotes are not in
        # string.punctuation (ASCII only), so they are added explicitly.
        _DELETE_TABLE = str.maketrans({ch: None for ch in string.punctuation + '“”'})

        def _clean_text(self, text):
            # One C-level pass removes punctuation and curly quotes together.
            return text.translate(self._DELETE_TABLE).lower()

        def process_item(self, item, spider):
            # Mutates and returns the item, per the Scrapy pipeline contract.
            item['text'] = self._clean_text(item['text'])
            return item
    
    
    class JsonWriterPipeline(object):
        """Item pipeline that appends each item as one JSON line (JSON Lines).

        The output path is read from the spider's settings ('JSON_FILE'), so
        both spiders in this file write into the same shared file.
        """

        def open_spider(self, spider):
            # Append mode: successive spiders (and runs) accumulate lines.
            self.file = open(spider.settings['JSON_FILE'], 'a')

        def close_spider(self, spider):
            self.file.close()

        def process_item(self, item, spider):
            # Serialize a plain-dict copy of the item, one object per line.
            self.file.write(json.dumps(dict(item)) + "\n")
            return item
    
    
    class QuoteItem(Item):
        """One scraped quotation from quotes.toscrape.com."""
        # 'spider' records which spider produced the item, so rows from the
        # two spiders can be told apart in the shared JSON-lines output file.
        text = Field()
        author = Field()
        tags = Field()
        spider = Field()
    
    
    class QuotesSpiderOne(scrapy.Spider):
        """Scrapes quotes from page 1 of quotes.toscrape.com."""
        name = "quotes1"

        def start_requests(self):
            # Single seed page; kept as a sequence so more pages can be added.
            for page_url in ('http://quotes.toscrape.com/page/1/',):
                yield scrapy.Request(url=page_url, callback=self.parse)

        def parse(self, response):
            # One QuoteItem per quote box; 'spider' tags the item's origin.
            for quote_sel in response.css('div.quote'):
                yield QuoteItem(
                    text=quote_sel.css('span.text::text').get(),
                    author=quote_sel.css('small.author::text').get(),
                    tags=quote_sel.css('div.tags a.tag::text').getall(),
                    spider=self.name,
                )
    
    
    class QuotesSpiderTwo(scrapy.Spider):
        """Scrapes quotes from page 2 of quotes.toscrape.com."""
        name = "quotes2"

        def start_requests(self):
            # Single seed page; kept as a sequence so more pages can be added.
            for page_url in ('http://quotes.toscrape.com/page/2/',):
                yield scrapy.Request(url=page_url, callback=self.parse)

        def parse(self, response):
            # One QuoteItem per quote box; 'spider' tags the item's origin.
            for quote_sel in response.css('div.quote'):
                yield QuoteItem(
                    text=quote_sel.css('span.text::text').get(),
                    author=quote_sel.css('small.author::text').get(),
                    tags=quote_sel.css('div.tags a.tag::text').getall(),
                    spider=self.name,
                )
    
    
    if __name__ == '__main__':
        # Project-less Scrapy: all settings are supplied inline. Pipelines are
        # addressed via '__main__.' because they live in this very script.
        settings = {
            'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
            'HTTPCACHE_ENABLED': True,
            'JSON_FILE': 'items.jl',
            'ITEM_PIPELINES': {
                '__main__.TextCleaningPipeline': 800,
                '__main__.JsonWriterPipeline': 801,
            },
        }

        # Register both spiders on one process, then block until both finish.
        process = CrawlerProcess(settings=settings)
        process.crawl(QuotesSpiderOne)
        process.crawl(QuotesSpiderTwo)
        process.start()
    
    

    Install Scrapy and run the script

    $ pip install Scrapy
    $ python quote_spiders.py 
    

    No other file is needed.

    This example, combined with the graphical debugger in PyCharm or VS Code, can help you understand the Scrapy workflow and makes debugging easier.

提交回复
热议问题