Why does my Scrapy spider only scrape some of my data?

我们两清 提交于 2019-12-11 14:37:35

问题


I'm trying to use Scrapy to scrape IMDb data (episode information and cast list) for each episode of Law & Order: SVU. After I run the code below, I export it to CSV via the command line with "scrapy crawl svu -o svu.csv".

The code below successfully pulls episode information, but the CSV does not contain the cast list. How do I fix the code to extract and export both the episode information and the cast list?

My thoughts & attempts:

  • I believe that the cast list is extracted because it is visible in the terminal when the spider runs, so it may be an export issue.
  • If I comment out my first yield statement (the episode information), the cast list is successfully exported. This makes me think it isn't just an export issue.

Thanks for the help!

import scrapy

class SvuSpider(scrapy.Spider):
    """Scrape episode metadata and cast lists for Law & Order: SVU from IMDb.

    NOTE(review): this is the code as posted in the question — it exhibits the
    bug being asked about. With the default feed export (``-o svu.csv``) all
    yielded dicts go to one CSV whose columns come from mixed schemas, which is
    why the cast data appears lost.
    """
    name = "svu"
    # Seed URL: episode 1 of the series; parse() follows "next episode" links.
    start_urls = [
        'https://www.imdb.com/title/tt0629700/?ref_=ttep_ep1'
    ]

    def parse(self, response):
        """Yield one episode-info dict, then follow cast-list and next-episode links."""
        # Gather episode information.
        # NOTE(review): indexes [0]/[1] assume the page always has at least two
        # 'bp_heading' divs (season and episode) — verify against the live page.
        yield {
            'season': response.xpath("//div[@class='bp_heading']/text()")[0].extract(),
            'episode': response.xpath("//div[@class='bp_heading']/text()")[1].extract(),
            'episode_name': response.xpath("//h1[@itemprop='name']/text()").extract_first().strip(),
            'date_published': response.xpath("//div[@class='subtext']/a/meta[@itemprop='datePublished']/@content").extract(),
            'rating_value': response.xpath("//span[@itemprop='ratingValue']/text()").extract(),
            'rating_count': response.xpath("//span[@itemprop='ratingCount']/text()").extract()
        }

        # Follow link to full cast list; cast rows are parsed in parse_cast().
        for a in response.xpath("//div[@class='see-more']/a"):
            yield response.follow(a, callback=self.parse_cast)

        # Follow link to next episode (recurses into this same callback).
        for a in response.xpath("//a[@class='bp_item np_next']"):
            yield response.follow(a, callback=self.parse)

    def parse_cast(self,response):
        # Gather cast list data.
        # NOTE(review): the loop variable `actor` (one <table> element) is never
        # used — both xpaths below are absolute (`response.xpath`), so this
        # yields a single dict whose fields are lists of EVERY actor/character
        # on the page, not one dict per cast member.
        for actor in response.xpath("//table[@class='cast_list']"):
            yield {
                'actor': response.xpath("//td[@itemprop='actor']/a/span[@itemprop='name']/text()").extract(),
                'character': response.xpath("//td[@class='character']/a/text()").extract()
            }

回答1:


I made some changes to your code. In addition, I show you how to use Items and Pipelines.

spiders/svu.py

# -*- coding: utf-8 -*-
import scrapy
from ..items import EpisodeItem, CastItem

class SvuSpider(scrapy.Spider):
    """Scrape per-episode metadata and per-actor cast entries for SVU from IMDb.

    Yields :class:`EpisodeItem` from episode pages and :class:`CastItem`
    (one per cast-table row) from full-credits pages; the pipeline routes
    each item type to its own CSV file.
    """
    name = "svu"
    start_urls = [
        'https://www.imdb.com/title/tt0629700/?ref_=ttep_ep1'
    ]

    def parse(self, response):
        """Emit episode information, then follow cast and next-episode links."""
        # Gather episode information
        item = EpisodeItem(
            season=response.xpath("//div[@class='bp_heading']/text()")[0].extract(),
            episode=response.xpath("//div[@class='bp_heading']/text()")[1].extract(),
            episode_name=response.xpath("//h1[@itemprop='name']/text()").extract_first().strip(),
            date_published=response.xpath("//div[@class='subtext']/a/meta[@itemprop='datePublished']/@content").extract(),
            rating_value=response.xpath("//span[@itemprop='ratingValue']/text()").extract(),
            rating_count=response.xpath("//span[@itemprop='ratingCount']/text()").extract()
        )
        yield item

        # Follow link to full cast list
        for a in response.xpath("//div[@class='see-more']/a"):
            yield response.follow(a, callback=self.parse_cast)

        # Follow link to next episode
        for a in response.xpath("//a[@class='bp_item np_next']"):
            yield response.follow(a, callback=self.parse)

    def parse_cast(self, response):
        """Yield one CastItem per actor row of the full cast table.

        The original version iterated over the whole <table> element (a single
        iteration) and used absolute xpaths, so it produced ONE item whose
        fields were lists of every actor/character on the page. Scoping the
        xpaths to each row (relative "./" paths) yields one item per actor.
        """
        # Only rows that actually contain an actor cell (skips section headers).
        for row in response.xpath("//table[@class='cast_list']//tr[td[@itemprop='actor']]"):
            character = row.xpath(".//td[@class='character']/a/text()").extract()
            # Uncredited/unlinked character names are bare text nodes.
            character.extend(row.xpath(".//td[@class='character']/text()").extract())
            character = [c.strip().replace('\n ', '') for c in character if c.strip()]
            item = CastItem(
                actor=row.xpath(".//td[@itemprop='actor']/a/span[@itemprop='name']/text()").extract(),
                character=character
            )
            yield item

items.py

from scrapy import Item, Field


# NOTE: in the original answer both classes were indented four spaces under the
# import, which is an IndentationError — class definitions must be at top level.

class EpisodeItem(Item):
    """Per-episode metadata scraped from an IMDb episode page."""
    season = Field()
    episode = Field()
    episode_name = Field()
    date_published = Field()
    rating_value = Field()
    rating_count = Field()


class CastItem(Item):
    """A single cast-list entry: the actor and the character(s) they play."""
    actor = Field()
    character = Field()

pipelines.py

from scrapy import signals
from scrapy.exporters import CsvItemExporter

from .items import CastItem, EpisodeItem

class IMDBPipeline(object):
    """Route EpisodeItem and CastItem to separate CSV files.

    Opens one CsvItemExporter per item family on spider start and routes each
    item to the matching exporter, producing ``episode.csv`` and ``cast.csv``.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Create the pipeline and hook it to spider open/close signals."""
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        """Open one CSV file + exporter per item family."""
        item_names = ['episode', 'cast']
        # Fixed duplicated assignment from the original ("self.files = self.files = ...").
        # Files are opened in binary mode because CsvItemExporter writes bytes.
        self.files = {n: open('%s.csv' % n, 'w+b') for n in item_names}
        self.exporters = {n: CsvItemExporter(f) for n, f in self.files.items()}
        for exporter in self.exporters.values():
            exporter.start_exporting()

    def spider_closed(self, spider):
        """Flush the exporters and close the underlying files."""
        for exporter in self.exporters.values():
            exporter.finish_exporting()

        for file in self.files.values():
            file.close()

    def process_item(self, item, spider):
        """Dispatch each item to the exporter for its type; pass the item on."""
        if isinstance(item, EpisodeItem):
            self.exporters['episode'].export_item(item)

        if isinstance(item, CastItem):
            self.exporters['cast'].export_item(item)

        return item

Add to settings file:

# Enable the pipeline; 300 is its priority (0-1000, lower runs first).
ITEM_PIPELINES = {
    'PROJECT_NAME.pipelines.IMDBPipeline': 300,
}

Be careful: you need to replace PROJECT_NAME with your own project's name.



来源:https://stackoverflow.com/questions/50668209/why-does-my-scrapy-spider-only-scrape-some-of-my-data

易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!