Question
I'm trying to use Scrapy to scrape IMDb data (episode information and cast list) for each episode of Law & Order: SVU. I run the spider below and export the results to CSV from the command line with "scrapy crawl svu -o svu.csv".
The code below successfully pulls episode information, but the CSV does not contain the cast list. How do I fix the code to extract and export both the episode information and the cast list?
My thoughts & attempts:
- I believe that the cast list is extracted because it is visible in the terminal when the spider runs, so it may be an export issue.
- If I comment out my first yield statement (episode information), the cast list is successfully exported. This makes me think it isn't just an export issue.
Thanks for the help!
import scrapy

class SvuSpider(scrapy.Spider):
    name = "svu"
    start_urls = [
        'https://www.imdb.com/title/tt0629700/?ref_=ttep_ep1'
    ]

    def parse(self, response):
        # Gather episode information
        yield {
            'season': response.xpath("//div[@class='bp_heading']/text()")[0].extract(),
            'episode': response.xpath("//div[@class='bp_heading']/text()")[1].extract(),
            'episode_name': response.xpath("//h1[@itemprop='name']/text()").extract_first().strip(),
            'date_published': response.xpath("//div[@class='subtext']/a/meta[@itemprop='datePublished']/@content").extract(),
            'rating_value': response.xpath("//span[@itemprop='ratingValue']/text()").extract(),
            'rating_count': response.xpath("//span[@itemprop='ratingCount']/text()").extract()
        }

        # Follow link to full cast list
        for a in response.xpath("//div[@class='see-more']/a"):
            yield response.follow(a, callback=self.parse_cast)

        # Follow link to next episode
        for a in response.xpath("//a[@class='bp_item np_next']"):
            yield response.follow(a, callback=self.parse)

    def parse_cast(self, response):
        # Gather cast list data
        for actor in response.xpath("//table[@class='cast_list']"):
            yield {
                'actor': response.xpath("//td[@itemprop='actor']/a/span[@itemprop='name']/text()").extract(),
                'character': response.xpath("//td[@class='character']/a/text()").extract()
            }
Answer 1:
I made some changes to your code. In addition, I show you how to use Items and Pipelines.
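The likely culprit for the missing rows: with a single -o feed, CsvItemExporter takes its column header from the first item it sees. Your episode item arrives first, so the header contains only the episode fields, and every later cast dict (whose keys match none of those columns) comes out as empty cells. Comment out the episode yield and a cast dict becomes the first item, which is exactly why the cast list then exports fine. A minimal sketch of that behavior (demo.csv is just a throwaway file name):

from scrapy.exporters import CsvItemExporter

with open('demo.csv', 'wb') as f:  # CsvItemExporter needs a binary file
    exporter = CsvItemExporter(f)
    exporter.start_exporting()
    # The first item fixes the CSV columns: season, episode
    exporter.export_item({'season': '19', 'episode': '1'})
    # This item shares no keys with the header, so it is written as empty cells
    exporter.export_item({'actor': 'Mariska Hargitay', 'character': 'Olivia Benson'})
    exporter.finish_exporting()

The per-type pipeline below sidesteps this by giving each item shape its own file.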
spiders/svu.py
# -*- coding: utf-8 -*-
import scrapy

from ..items import EpisodeItem, CastItem

class SvuSpider(scrapy.Spider):
    name = "svu"
    start_urls = [
        'https://www.imdb.com/title/tt0629700/?ref_=ttep_ep1'
    ]

    def parse(self, response):
        # Gather episode information
        item = EpisodeItem(
            season=response.xpath("//div[@class='bp_heading']/text()")[0].extract(),
            episode=response.xpath("//div[@class='bp_heading']/text()")[1].extract(),
            episode_name=response.xpath("//h1[@itemprop='name']/text()").extract_first().strip(),
            date_published=response.xpath("//div[@class='subtext']/a/meta[@itemprop='datePublished']/@content").extract(),
            rating_value=response.xpath("//span[@itemprop='ratingValue']/text()").extract(),
            rating_count=response.xpath("//span[@itemprop='ratingCount']/text()").extract()
        )
        yield item

        # Follow link to full cast list
        for a in response.xpath("//div[@class='see-more']/a"):
            yield response.follow(a, callback=self.parse_cast)

        # Follow link to next episode
        for a in response.xpath("//a[@class='bp_item np_next']"):
            yield response.follow(a, callback=self.parse)

    def parse_cast(self, response):
        # Gather cast list data
        for actor in response.xpath("//table[@class='cast_list']"):
            character = response.xpath("//td[@class='character']/a/text()").extract()
            character.extend(response.xpath("//td[@class='character']/text()").extract())
            character = [c.strip().replace('\n ', '') for c in character if c.strip()]
            item = CastItem(
                actor=response.xpath("//td[@itemprop='actor']/a/span[@itemprop='name']/text()").extract(),
                character=character
            )
            yield item
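Note that parse_cast still runs page-wide XPaths inside the loop, so each CastItem carries the full actor and character lists for the page. If you would rather have one item per cast member, a sketch along these lines (relative XPaths scoped with ".//"; the row selector assumes IMDb's cast_list markup at the time) would replace parse_cast:

def parse_cast(self, response):
    # One CastItem per table row instead of one list-valued item per page
    for row in response.xpath("//table[@class='cast_list']//tr[td[@itemprop='actor']]"):
        character = row.xpath(".//td[@class='character']//text()").extract()
        yield CastItem(
            actor=row.xpath(".//td[@itemprop='actor']//span[@itemprop='name']/text()").extract_first(),
            character=' '.join(c.strip() for c in character if c.strip())
        )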
items.py
from scrapy import Item, Field

class EpisodeItem(Item):
    season = Field()
    episode = Field()
    episode_name = Field()
    date_published = Field()
    rating_value = Field()
    rating_count = Field()

class CastItem(Item):
    actor = Field()
    character = Field()
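Declaring two separate Item subclasses is what lets the pipeline below route each item to its own file with isinstance. As a side benefit, Items reject fields you never declared, which plain dicts would silently accept:

item = CastItem(actor='Mariska Hargitay', character='Olivia Benson')
item['actr'] = 'oops'  # KeyError: CastItem does not support field: actr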
pipelines.py
from scrapy import signals
from scrapy.exporters import CsvItemExporter

from .items import CastItem, EpisodeItem

class IMDBPipeline(object):

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        # One CSV file and one exporter per item type
        item_names = ['episode', 'cast']
        self.files = {n: open('%s.csv' % n, 'w+b') for n in item_names}
        self.exporters = {n: CsvItemExporter(f) for n, f in self.files.items()}
        for exporter in self.exporters.values():
            exporter.start_exporting()

    def spider_closed(self, spider):
        for exporter in self.exporters.values():
            exporter.finish_exporting()
        for file in self.files.values():
            file.close()

    def process_item(self, item, spider):
        # Route each item to the exporter for its type
        if isinstance(item, EpisodeItem):
            self.exporters['episode'].export_item(item)
        if isinstance(item, CastItem):
            self.exporters['cast'].export_item(item)
        return item
Add this to your settings.py:
ITEM_PIPELINES = {
    'PROJECT_NAME.pipelines.IMDBPipeline': 300,
}
Be careful: you need to replace PROJECT_NAME with your own project's name.
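Once the pipeline is registered, run the spider without the -o flag: scrapy crawl svu will write episode.csv and cast.csv itself, each with the right header for its item type, instead of forcing both shapes into a single svu.csv.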
Source: https://stackoverflow.com/questions/50668209/why-does-my-scrapy-spider-only-scrape-some-of-my-data