scrapy passing custom_settings to spider from script using CrawlerProcess.crawl()

予麋鹿 2021-01-05 09:19

I am trying to programmatically call a spider from a script. I am unable to override the settings through the constructor using CrawlerProcess. Let me illustrate this with an example.

4 Answers
  •  借酒劲吻你
    2021-01-05 10:07

    Scrapy's Settings object behaves much like a Python dict, so you can update the settings before passing them to CrawlerProcess:

    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    def main():
        s = get_project_settings()
        # update() merges these values into the project settings
        s.update({
            'FEED_URI': 'quotes.csv',
            'LOG_FILE': 'quotes.log'
        })
        proc = CrawlerProcess(s)

        # extra positional/keyword arguments to crawl() are forwarded
        # to the spider's __init__, not merged into its settings
        proc.crawl('quotes', 'dummyinput')
        proc.start()

    if __name__ == '__main__':
        main()
    

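    If you only need per-spider overrides, Scrapy also supports a custom_settings class attribute on the spider itself. Here is a minimal sketch, reusing the 'quotes' spider name from the example above; note that because it is a class attribute, it cannot be computed from constructor arguments (which is exactly the limitation the question runs into):

    import scrapy

    class QuotesSpider(scrapy.Spider):
        name = 'quotes'

        # overrides project-level settings for this spider only;
        # evaluated at class definition time, so it cannot depend
        # on arguments passed to __init__
        custom_settings = {
            'FEED_URI': 'quotes.csv',
            'LOG_FILE': 'quotes.log',
        }
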
    Edit following OP's comments:

    Here's a variation using CrawlerRunner, creating a new runner for each crawl and re-configuring logging at each iteration so that each run writes to its own log file:

    import logging
    from twisted.internet import reactor, defer
    
    import scrapy
    from scrapy.crawler import CrawlerRunner
    # note: _get_handler is a private Scrapy helper and may change between versions
    from scrapy.utils.log import configure_logging, _get_handler
    from scrapy.utils.project import get_project_settings
    
    
    class QuotesSpider(scrapy.Spider):
        name = "quotes"
    
        def start_requests(self):
            page = getattr(self, 'page', 1)
            yield scrapy.Request('http://quotes.toscrape.com/page/{}/'.format(page),
                                 self.parse)
    
        def parse(self, response):
            for quote in response.css('div.quote'):
                yield {
                    'text': quote.css('span.text::text').extract_first(),
                    'author': quote.css('small.author::text').extract_first(),
                    'tags': quote.css('div.tags a.tag::text').extract(),
                }
    
    
    @defer.inlineCallbacks
    def crawl():
        s = get_project_settings()
        for i in range(1, 4):
            s.update({
                'FEED_URI': 'quotes%03d.csv' % i,
                'LOG_FILE': 'quotes%03d.log' % i
            })
    
            # manually configure logging for LOG_FILE
            configure_logging(settings=s, install_root_handler=False)
            logging.root.setLevel(logging.NOTSET)
            handler = _get_handler(s)
            logging.root.addHandler(handler)
    
            runner = CrawlerRunner(s)
            yield runner.crawl(QuotesSpider, page=i)
    
            # reset root handler
            logging.root.removeHandler(handler)
        reactor.stop()
    
    crawl()
    reactor.run() # the script will block here until the last crawl call is finished
    
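    Side note: on newer Scrapy releases (2.1+), FEED_URI is deprecated in favour of the FEEDS setting. If you are on such a version, the equivalent settings update would look roughly like this:

    from scrapy.utils.project import get_project_settings

    s = get_project_settings()
    # FEEDS maps each output URI to its export options,
    # replacing the older FEED_URI / FEED_FORMAT pair
    s.update({
        'FEEDS': {
            'quotes.csv': {'format': 'csv'},
        },
        'LOG_FILE': 'quotes.log',
    })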
