Is there a way to trigger a method in a Spider class just before it terminates?
I can terminate the spider myself, like this:
class MySpider(CrawlSpider):
    ...
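For illustration, manual termination is usually done by raising Scrapy's CloseSpider exception from a callback; a minimal sketch (the callback and the stop condition below are just placeholders):

from scrapy.exceptions import CloseSpider
from scrapy.spiders import CrawlSpider


class MySpider(CrawlSpider):
    name = 'myspider'

    def parse_item(self, response):
        # stop the whole crawl when some condition is met
        if not response.xpath('//a'):
            raise CloseSpider('termination condition met')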
For Scrapy version 1.0.0+ (it may also work for older versions).
from scrapy import signals
from scrapy.spiders import CrawlSpider


class MySpider(CrawlSpider):
    name = 'myspider'

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(MySpider, cls).from_crawler(crawler, *args, **kwargs)
        crawler.signals.connect(spider.spider_opened, signals.spider_opened)
        crawler.signals.connect(spider.spider_closed, signals.spider_closed)
        return spider

    def spider_opened(self, spider):
        print('Opening {} spider'.format(spider.name))

    def spider_closed(self, spider):
        print('Closing {} spider'.format(spider.name))
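The spider_closed signal is also sent with the close reason, so the handler can accept it as an extra argument if you need to know why the spider stopped:

    def spider_closed(self, spider, reason):
        print('Closing {} spider ({})'.format(spider.name, reason))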
One good use case is adding a tqdm progress bar to a Scrapy spider.
# -*- coding: utf-8 -*-
from scrapy import signals
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from tqdm import tqdm

from myproject.items import MyItem  # hypothetical path; adjust to your project's items module


class MySpider(CrawlSpider):
    name = 'myspider'
    allowed_domains = ['somedomain.comm']
    start_urls = ['http://www.somedomain.comm/ccid.php']

    rules = (
        Rule(LinkExtractor(allow=r'^http://www.somedomain.comm/ccds.php\?id=.*'),
             callback='parse_item'),
        Rule(LinkExtractor(allow=r'^http://www.somedomain.comm/ccid.php$',
                           restrict_xpaths='//table/tr[contains(., "SMTH")]'),
             follow=True),
    )

    def parse_item(self, response):
        self.pbar.update()  # update progress bar by 1
        item = MyItem()
        # parse response into the item here
        return item

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(MySpider, cls).from_crawler(crawler, *args, **kwargs)
        crawler.signals.connect(spider.spider_opened, signals.spider_opened)
        crawler.signals.connect(spider.spider_closed, signals.spider_closed)
        return spider

    def spider_opened(self, spider):
        self.pbar = tqdm()  # initialize progress bar
        self.pbar.clear()
        self.pbar.write('Opening {} spider'.format(spider.name))

    def spider_closed(self, spider):
        self.pbar.clear()
        self.pbar.write('Closing {} spider'.format(spider.name))
        self.pbar.close()  # close progress bar
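If you know roughly how many items to expect, you can also give tqdm a total so it renders a percentage bar instead of a plain counter (the number below is just an example):

    self.pbar = tqdm(total=1000)  # rough number of expected items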
It looks like you can register a signal listener through dispatcher.
I would try something like:
from scrapy import signals
from scrapy.spiders import CrawlSpider
from scrapy.xlib.pydispatch import dispatcher


class MySpider(CrawlSpider):
    def __init__(self, *args, **kwargs):
        super(MySpider, self).__init__(*args, **kwargs)
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def spider_closed(self, spider):
        # second param is the instance of the spider about to be closed.
        pass
In newer versions of Scrapy, scrapy.xlib.pydispatch is deprecated. Instead you can use from pydispatch import dispatcher.
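With that change, the same pattern looks like this (assuming your Scrapy version still dispatches its signals through PyDispatcher, which is what makes connecting this way possible):

from pydispatch import dispatcher
from scrapy import signals
from scrapy.spiders import CrawlSpider


class MySpider(CrawlSpider):
    def __init__(self, *args, **kwargs):
        super(MySpider, self).__init__(*args, **kwargs)
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def spider_closed(self, spider):
        # called just before the spider is closed
        pass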
For me the accepted answer did not work / is outdated, at least for Scrapy 0.19. I got it to work with the following though:
from scrapy import signals
from scrapy.signalmanager import SignalManager
from scrapy.xlib.pydispatch import dispatcher
from scrapy.contrib.spiders import CrawlSpider  # scrapy.spiders in newer versions


class MySpider(CrawlSpider):
    def __init__(self, *args, **kwargs):
        super(MySpider, self).__init__(*args, **kwargs)
        SignalManager(dispatcher.Any).connect(
            self.closed_handler, signal=signals.spider_closed)

    def closed_handler(self, spider):
        # do stuff here
        pass
Just to update, you can simply define a closed method, like this:
class MySpider(CrawlSpider):
    def closed(self, reason):
        do_something()  # your cleanup code goes here
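The reason argument tells you why the spider is closing ('finished' for a normal run, or whatever reason was passed to CloseSpider), so you can branch on it if needed:

class MySpider(CrawlSpider):
    def closed(self, reason):
        if reason == 'finished':
            self.logger.info('Crawl completed normally')
        else:
            self.logger.warning('Crawl stopped early: %s', reason)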
If you have many spiders and want to do something before each of them closes, it may be convenient to add a stats collector to your project.
In settings:
STATS_CLASS = 'scraper.stats.MyStatsCollector'
And the collector:
from scrapy.statscollectors import StatsCollector


class MyStatsCollector(StatsCollector):
    def _persist_stats(self, stats, spider):
        # do something here
        pass
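For example (the file name is just an illustration), the collector could dump each spider's stats to JSON when that spider closes, since _persist_stats is called at spider close time:

import json

from scrapy.statscollectors import StatsCollector


class MyStatsCollector(StatsCollector):
    def _persist_stats(self, stats, spider):
        # write the final stats for this spider to a file
        with open('stats-{}.json'.format(spider.name), 'w') as f:
            json.dump(stats, f, default=str, indent=2)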
For the latest version (v1.7), just define a closed(reason) method in your spider class.
closed(reason)
Called when the spider closes. This method provides a shortcut to signals.connect() for the spider_closed signal.
Scrapy Docs: scrapy.spiders.Spider.closed