Export scrapy items to different files

前端 未结 1 375
执念已碎
执念已碎 2021-01-06 09:41

I'm scraping reviews from MOOCs like this one.

From there I'm getting all the course details (5 items), plus another 6 items from each review itself.

This is

1条回答
  •  醉梦人生
    2021-01-06 10:35

    The issue is that you are mixing everything up into a single item, which is not the right way to do it. You should create two items, MoocsItem and MoocsReviewItem.

    And then update the code like below

    def parse_reviews(self, response):
        """Yield one MoocsReviewItem per review on the page, then the course MoocsItem.

        The course-level fields are loaded once from the whole response. Each
        review is loaded from its own ``review-body`` selector and tagged with
        the course title so it can be related back to the course.

        :param response: the response for a course page that lists reviews.
        :returns: a generator of review items followed by the course item.
        """
        course_loader = ItemLoader(item=MoocsItem(), response=response)
        course_loader.add_xpath('course_title', '//*[@class="course-header-ng__main-info__name__title"]//text()')
        course_loader.add_xpath('course_description', '//*[@class="course-info__description"]//p/text()')
        course_loader.add_xpath('course_instructors', '//*[@class="course-info__instructors__names"]//text()')
        course_loader.add_xpath('course_key_concepts', '//*[@class="key-concepts__labels"]//text()')
        course_loader.add_value('course_link', response.url)

        item = course_loader.load_item()

        # The loader leaves a field unset when nothing was collected for it, so
        # item['course_title'] would raise KeyError on a page without a title
        # match. Read it once with .get() so the reviews are still emitted.
        course_title = item.get('course_title')

        for review in response.xpath('//*[@class="review-body"]'):
            # XPaths below are relative ('.//') to this review's selector.
            review_loader = ItemLoader(item=MoocsReviewItem(), response=response, selector=review)
            review_loader.add_value('course_title', course_title)
            review_loader.add_xpath('review_body', './/div[@class="review-body__content"]//text()')
            review_loader.add_xpath('course_stage', './/*[@class="review-body-info__course-stage--completed"]//text()')
            review_loader.add_xpath('user_name', './/*[@class="review-body__username"]//text()')
            review_loader.add_xpath('review_date', './/*[@itemprop="datePublished"]/@datetime')
            review_loader.add_xpath('score', './/*[@class="sr-only"]//text()')

            yield review_loader.load_item()

        yield item
    

    Now what you want is for each item type to go into its own CSV file, which is what the SO thread below answers:

    How can scrapy export items to separate csv files per item

    I have not tested the code below, but it will look something like this:

    from scrapy.exporters import CsvItemExporter
    from scrapy import signals
    from scrapy.xlib.pydispatch import dispatcher
    
    
    def item_type(item):
        """Return the lowercased class name of *item* with every 'Item' substring removed.

        For example, a ``TeamItem`` instance maps to ``'team'``.
        """
        class_name = type(item).__name__
        stripped = class_name.replace('Item', '')
        return stripped.lower()
    
    class MultiCSVItemPipeline(object):
        """Item pipeline that writes each configured item type to its own CSV file.

        One ``<CSVDir><name>.csv`` file and one ``CsvItemExporter`` are created
        per entry in ``SaveTypes`` when the spider opens. Items whose
        ``item_type()`` is not listed are passed through without being exported.
        """

        # item_type() names that get a dedicated CSV file.
        SaveTypes = ['moocs','moocsreview']

        def __init__(self):
            # NOTE(review): scrapy.xlib.pydispatch is a legacy import path and may
            # be unavailable in newer Scrapy releases; the documented alternative
            # is a from_crawler() classmethod using crawler.signals.connect().
            # Confirm against the installed Scrapy version.
            dispatcher.connect(self.spider_opened, signal=signals.spider_opened)
            dispatcher.connect(self.spider_closed, signal=signals.spider_closed)

        def spider_opened(self, spider):
            """Open one binary CSV file and start one exporter per saved type."""
            self.files = {
                name: open(CSVDir+name+'.csv','w+b') for name in self.SaveTypes
            }
            self.exporters = {
                name: CsvItemExporter(self.files[name]) for name in self.SaveTypes
            }
            for exporter in self.exporters.values():
                exporter.start_exporting()

        def spider_closed(self, spider):
            """Finish every exporter and close every file.

            The files are closed in ``finally`` so that an error raised by one
            exporter does not leave the remaining file handles open; the error
            itself still propagates to the caller.
            """
            try:
                for exporter in self.exporters.values():
                    exporter.finish_exporting()
            finally:
                for handle in self.files.values():
                    handle.close()

        def process_item(self, item, spider):
            """Export *item* to the CSV matching its type, then return it unchanged."""
            what = item_type(item)
            if what in self.SaveTypes:
                self.exporters[what].export_item(item)
            return item
    

    You need to make sure ITEM_PIPELINES is updated to use this MultiCSVItemPipeline class:

    # Register MultiCSVItemPipeline (looked up by its dotted path) as an item pipeline.
    # NOTE(review): 300 is presumably the pipeline's ordering value — confirm the
    # meaning and valid range against the Scrapy ITEM_PIPELINES documentation.
    ITEM_PIPELINES = {
        'mybot.pipelines.MultiCSVItemPipeline': 300,
    }
    

    0 讨论(0)
提交回复
热议问题