Question
I am scraping listings with Scrapy. My script first parses the listing URLs with parse_node, then parses each listing with parse_listing, and for each listing it parses that listing's agents with parse_agent. I would like to build up an array as Scrapy works through a listing and its agents, and reset it for each new listing.
Here is my parsing script:
def parse_node(self, response, node):
    yield Request('LISTING LINK', callback=self.parse_listing)

def parse_listing(self, response):
    yield response.xpath('//node[@id="ListingId"]/text()').extract_first()
    yield response.xpath('//node[@id="ListingTitle"]/text()').extract_first()
    for agent in (response.xpath('//node[@id="Agents"]/text()').extract_first() or "").split('^'):
        yield Request('AGENT LINK', callback=self.parse_agent)

def parse_agent(self, response):
    yield response.xpath('//node[@id="AgentName"]/text()').extract_first()
    yield response.xpath('//node[@id="AgentEmail"]/text()').extract_first()
I would like parse_listing to result in:
{
    'id': 123,
    'title': 'Amazing Listing'
}
and then parse_agent to add each agent to the listing:
{
    'id': 123,
    'title': 'Amazing Listing',
    'agent': [
        {
            'name': 'jon doe',
            'email': 'jon.doe@email.com'
        },
        {
            'name': 'jane doe',
            'email': 'jane.doe@email.com'
        }
    ]
}
How do I get the results from each level and build up an array?
Answer 1:
This is a somewhat complicated issue: you need to form a single item from multiple different URLs.
Scrapy allows you to carry data over in a request's meta attribute, so you can do something like:
from collections import defaultdict
from scrapy import Request

def parse_node(self, response, node):
    yield Request('LISTING LINK', callback=self.parse_listing)

def parse_listing(self, response):
    # defaultdict(list) lets us append to item['agents'] without initializing it
    item = defaultdict(list)
    item['id'] = response.xpath('//node[@id="ListingId"]/text()').extract_first()
    item['title'] = response.xpath('//node[@id="ListingTitle"]/text()').extract_first()
    # find all agent urls and start with the first one
    agent_urls = (response.xpath('//node[@id="Agents"]/text()').extract_first() or "").split('^')
    url = agent_urls.pop(0)
    # we want to go through agent urls one-by-one and update a single item with agent data
    yield Request(url, callback=self.parse_agent,
                  meta={'item': item, 'agent_urls': agent_urls})

def parse_agent(self, response):
    item = response.meta['item']  # retrieve the item generated in the previous request
    agent = dict()
    agent['name'] = response.xpath('//node[@id="AgentName"]/text()').extract_first()
    agent['email'] = response.xpath('//node[@id="AgentEmail"]/text()').extract_first()
    item['agents'].append(agent)
    # check whether we have any more agent urls left
    agent_urls = response.meta['agent_urls']
    if not agent_urls:  # we crawled all of the agents!
        # this callback is a generator, so the finished item must be
        # yielded rather than returned
        yield item
        return
    # if we do - crawl the next agent and carry over our current item
    url = agent_urls.pop(0)
    yield Request(url, callback=self.parse_agent,
                  meta={'item': item, 'agent_urls': agent_urls})
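
On Scrapy 1.7 or newer, the same chaining can also be expressed with cb_kwargs, which delivers the carried data to the callback as plain keyword arguments instead of going through meta. A minimal sketch of that variant, reusing the selectors from above (not tested against the original site):

def parse_listing(self, response):
    item = {
        'id': response.xpath('//node[@id="ListingId"]/text()').extract_first(),
        'title': response.xpath('//node[@id="ListingTitle"]/text()').extract_first(),
        'agents': [],
    }
    agent_urls = (response.xpath('//node[@id="Agents"]/text()').extract_first() or "").split('^')
    yield Request(agent_urls.pop(0), callback=self.parse_agent,
                  cb_kwargs={'item': item, 'agent_urls': agent_urls})

def parse_agent(self, response, item, agent_urls):
    # cb_kwargs arrive as keyword arguments on the callback
    item['agents'].append({
        'name': response.xpath('//node[@id="AgentName"]/text()').extract_first(),
        'email': response.xpath('//node[@id="AgentEmail"]/text()').extract_first(),
    })
    if agent_urls:
        # more agents to crawl - keep carrying the same item along
        yield Request(agent_urls.pop(0), callback=self.parse_agent,
                      cb_kwargs={'item': item, 'agent_urls': agent_urls})
    else:
        yield item  # all agents collected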
Answer 2:
You can fetch each agent page with the requests library, parse it with lxml, and append the extracted data to an agents list inside the listing dict.
import requests
from lxml import html

listing = {"title": "amazing listing", "agents": []}

agentUrls = ["list", "of", "urls", "from", "scraped", "page"]

for agentUrl in agentUrls:
    agentPage = requests.get(agentUrl)
    agentTree = html.fromstring(agentPage.content)
    # lxml's xpath() returns a list, so take the first match if there is one
    names = agentTree.xpath('//node[@id="AgentName"]/text()')
    emails = agentTree.xpath('//node[@id="AgentEmail"]/text()')
    agent = {"name": names[0] if names else None, "email": emails[0] if emails else None}
    listing["agents"].append(agent)
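
For reference, once the loop finishes, listing holds the nested structure the question asked for (values here are illustrative):

print(listing)
# {'title': 'amazing listing',
#  'agents': [{'name': 'jon doe', 'email': 'jon.doe@email.com'},
#             {'name': 'jane doe', 'email': 'jane.doe@email.com'}]}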
Source: https://stackoverflow.com/questions/45496764/creating-scrapy-array-of-items-with-multiple-parse