问题
I'm new to Scrapy and I'm really just lost on how I can return multiple items in one block.
Basically, I'm getting one HTML tag which has a quote that contains nested tags of text, author name, and some tags about that quote.
The code here only returns one quote and that's it. It doesn't use the loop to return the rest. I've been searching the web for hours and I'm just hopeless; I don't get it. Here's my code so far:
Spider.py
import scrapy
from scrapy.loader import ItemLoader
from first_spider.items import FirstSpiderItem
class QuotesSpider(scrapy.Spider):
# Spider from the question. NOTE(review): indentation was lost when this
# snippet was pasted, so the nesting of the statements below cannot be read
# off the text; the notes that depend on nesting are marked as assumptions.
name = 'quotes'
allowed_domains = ['quotes.toscrape.com']
start_urls = ['http://quotes.toscrape.com/']
def parse(self, response):
# One ItemLoader is built from the whole response here, before the loop over
# the quotes starts, so every add_value() call below feeds this same loader.
l = ItemLoader(item = FirstSpiderItem(), response=response)
quotes = response.xpath("//*[@class='quote']")
for quote in quotes:
# Each query starts with "." so it is evaluated relative to the current quote.
text = quote.xpath(".//span[@class='text']/text()").extract_first()
author = quote.xpath(".//small[@class='author']/text()").extract_first()
tags = quote.xpath(".//meta[@class='keywords']/@content").extract_first()
# removes quotation marks from the text
for c in ['“', '”']:
if c in text:
text = text.replace(c, "")
l.add_value('text', text)
l.add_value('author', author)
l.add_value('tags', tags)
# NOTE(review): presumably this return sits inside the `for quote` loop; if
# so, the loop body runs only once and the remaining quotes are never loaded.
return l.load_item()
# NOTE(review): the assignment below is split across two lines in this paste.
next_page_path =
response.xpath(".//li[@class='next']/a/@href").extract_first()
next_page_url = response.urljoin(next_page_path)
yield scrapy.Request(next_page_url)
Items.py
import scrapy
class FirstSpiderItem(scrapy.Item):
    """Item with the three values the spider collects for each quote."""

    # Body of the quote.
    text = scrapy.Field()
    # Name of the person the quote is attributed to.
    author = scrapy.Field()
    # Keywords attached to the quote.
    tags = scrapy.Field()
Here's the page I'm trying to scrape:
Link
回答1:
I was also searching for a solution for the same problem. And here is the solution that I have found:
def parse(self, response):
    """Yield one loaded item per quote on the page, then follow the "next" link.

    A fresh ItemLoader is created for every quote element and bound to that
    element through ``selector=``, so the relative XPath expressions below
    are evaluated against a single quote rather than the whole page.

    Args:
        response: The page response handed to this callback by Scrapy.

    Yields:
        One loaded ``FirstSpiderItem`` per quote, followed by a request for
        the next page when a "next" link is present.
    """
    for selector in response.xpath("//*[@class='quote']"):
        loader = ItemLoader(item=FirstSpiderItem(), selector=selector)
        loader.add_xpath('text', './/span[@class="text"]/text()')
        # The leading "." keeps the query relative to this quote; an absolute
        # "//small" would match every author on the page for each item.
        loader.add_xpath('author', './/small[@class="author"]/text()')
        loader.add_xpath('tags', './/meta[@class="keywords"]/@content')
        yield loader.load_item()

    # Pagination is handled once per page, after all quotes were emitted.
    next_page = response.xpath(".//li[@class='next']/a/@href").extract_first()
    if next_page is not None:
        yield response.follow(next_page, callback=self.parse)
To remove quotation marks from the text, you can use an output processor in items.py.
from scrapy.loader.processors import MapCompose
def replace_quotes(text):
    """Return ``text`` with the curly quotation marks “ and ” removed.

    Args:
        text: The string to clean.

    Returns:
        A copy of ``text`` without any “ or ” characters; a string that
        contains neither is returned unchanged.
    """
    # str.replace is already a no-op when the mark is absent, so no
    # membership test is needed beforehand.
    for mark in ('“', '”'):
        text = text.replace(mark, "")
    return text
class FirstSpiderItem(scrapy.Item):
    """Item holding the text, author and tags of one scraped quote."""

    # The curly quotation marks to strip surround the quote text, so the
    # cleanup processor is attached to this field rather than to the tags.
    text = scrapy.Field(output_processor=MapCompose(replace_quotes))
    author = scrapy.Field()
    tags = scrapy.Field()
Please let me know whether it was helpful.
回答2:
Give this a try. It will give you all the data that you wanted to scrape.
import scrapy
class QuotesSpider(scrapy.Spider):
    """Spider that emits one plain dict per quote and walks the pagination."""

    name = 'quotes'
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        """Yield a dict for each quote on the page, then request the next page.

        Each dict has the keys "Text", "Author" and "Tags". When the page has
        no "next" link, nothing further is requested.
        """
        for block in response.xpath("//*[@class='quote']"):
            # Queries start with "." so they only look inside this quote.
            yield {
                "Text": block.xpath(".//span[@class='text']/text()").extract_first(),
                "Author": block.xpath(".//small[@class='author']/text()").extract_first(),
                "Tags": block.xpath(".//meta[@class='keywords']/@content").extract_first(),
            }

        next_href = response.xpath(".//li[@class='next']/a/@href").extract_first()
        if not next_href:
            return
        # The href is relative, so resolve it against the current page URL.
        yield scrapy.Request(response.urljoin(next_href))
来源:https://stackoverflow.com/questions/46571543/scrapy-return-multiple-items