Question
Hi, I am very new to Python and Scrapy. This is my first project and I can't solve a problem that looks pretty basic.
I have the crawler set up to do two things: 1) find all pagination URLs, visit them and get some data from each page; 2) get all the links listed on the results pages, visit them and crawl each location's data.
The decision about which parser handles each page is made by the crawl rules via their callbacks. I created two classes inside items.py, one for each parser.
The second rule is processed perfectly, but the first one is not, and I can't find where the error is.
This is the error message I get in the terminal while running the crawler:
2014-11-24 02:30:39-0200 [apontador] ERROR: Error processing {'city': u'BR-SP-S\xe3o Paulo',
'coordinates': {'lat': u'-23.56588', 'lng': u'-46.64777'},
'current_url': 'http://www.apontador.com.br/local/search.html?q=supermercado&loc_z=S%C3%A3o+Paulo%2C+SP&loc=S%C3%A3o+Paulo%2C+SP&loc_y=S%C3%A3o+Paulo%2C+SP',
'datetime': datetime.datetime(2014, 11, 24, 2, 30, 39, 703972),
'depth': 0,
'domain': 'apontador.com.br',
'link_cat': 'ls',
'loc_cat': u'supermercado',
'session_id': -1,
'site_name': u'Apontador',
'state': u'BR-SP'}
Traceback (most recent call last):
File "/usr/local/lib/python2.7/dist-packages/scrapy/middleware.py", line 62, in _process_chain
return process_chain(self.methods[methodname], obj, *args)
File "/usr/local/lib/python2.7/dist-packages/scrapy/utils/defer.py", line 65, in process_chain
d.callback(input)
File "/usr/lib/python2.7/dist-packages/twisted/internet/defer.py", line 382, in callback
self._startRunCallbacks(result)
File "/usr/lib/python2.7/dist-packages/twisted/internet/defer.py", line 490, in _startRunCallbacks
self._runCallbacks()
--- <exception caught here> ---
File "/usr/lib/python2.7/dist-packages/twisted/internet/defer.py", line 577, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "/locman/scrapy/locman/pipelines.py", line 37, in process_item
'neighborhood': item['neighborhood'],
File "/usr/local/lib/python2.7/dist-packages/scrapy/item.py", line 50, in __getitem__
return self._values[key]
exceptions.KeyError: 'neighborhood'
Looking at the error message, it seems clear that Scrapy is trying to process all the fields defined in items.py, not respecting the item class used by each callback.
If you look at items.py below, there are two classes: 1) apontadorlsItem, 2) apontadordsItem.
The class apontadordsItem has the key 'neighborhood', but apontadorlsItem does not. I created these two classes to support two different callback parser functions, depending on the XPath rule, because there are two types of pages being crawled, each with a different set of information. The rules are working fine, as I can see in the log files; the crawler works, the problem is in processing/saving the items.
How can I tell the pipeline to use a different item-matching rule depending on which items.py class the crawler used?
Please help, I'm stuck.
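To make the question concrete, the behaviour I am after in the pipeline is roughly this; it is only a sketch to illustrate what I mean, the helper name and key lists are made up:

    from locman.items import apontadorlsItem, apontadordsItem

    def duplicate_query_keys(item):
        # Hypothetical helper: choose which fields identify a duplicate,
        # depending on which item class the spider produced.
        if isinstance(item, apontadordsItem):
            return ['current_url', 'loc_source_id', 'neighborhood']
        if isinstance(item, apontadorlsItem):
            return ['current_url', 'loc_cat']
        return ['current_url']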
Spider file - spiders/apontador.py
import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.selector import Selector
from datetime import datetime
from tld import get_tld
from locman.items import apontadorlsItem
from locman.items import apontadordsItem
class apontador(CrawlSpider):
    name = 'apontador'
    session_id = -1
    start_urls = ["http://www.apontador.com.br/local/search.html?q=supermercado&loc_z=S%C3%A3o+Paulo%2C+SP&loc=S%C3%A3o+Paulo%2C+SP&loc_y=S%C3%A3o+Paulo%2C+SP"]

    rules = (
        # Rule for LS - Link source - search results pages
        Rule(SgmlLinkExtractor(allow=("", ), restrict_xpaths=("//nav[@class='pagination']")),
             callback='parse_items_ls', follow=True),
        # Rule for DS - Data source - location detail pages
        Rule(SgmlLinkExtractor(allow=("", ), restrict_xpaths=(
            "//article[@class='poi card highlight']",
            "//li[@class='similar-place sponsored']",
            "//div[@class='recomendations']",
            "//ul[@class='similar-places-list']",
            "//article[@class='poi card']")),
            callback='parse_items_ds',
            follow=True),
    )

    def __init__(self, session_id=-1, *args, **kwargs):
        super(apontador, self).__init__(*args, **kwargs)
        self.session_id = session_id

    def parse_start_url(self, response):
        self.response_url = response.url
        return self.parse_items_ls(response)

    # Callback for item type LS
    def parse_items_ls(self, response):
        self.response_url = response.url
        sel = Selector(response)
        items_ls = []
        item_ls = apontadorlsItem()
        item_ls["session_id"] = self.session_id
        item_ls["depth"] = response.meta["depth"]
        item_ls["current_url"] = response.url

        # Get the site name from the metadata
        meta_site = sel.xpath("//meta[@property='og:site_name']/@content").extract()
        item_ls["site_name"] = u''.join(meta_site)

        # Get latitude and longitude from the metadata
        meta_latitude = sel.xpath("//meta[@name='apontador:latitude']/@content").extract()
        latitude = ''.join(meta_latitude)
        meta_longitude = sel.xpath("//meta[@name='apontador:longitude']/@content").extract()
        longitude = ''.join(meta_longitude)
        # Store the coordinates as a dict
        coordinates = {"lng": longitude, "lat": latitude}
        item_ls["coordinates"] = coordinates

        # Get the strings from the meta keywords tag and build a list
        meta_keywords_ls = sel.xpath("//meta[@name='keywords']/@content").extract()
        meta_keywords_ls_str = u''.join(meta_keywords_ls)
        meta_keywords_ls_list = meta_keywords_ls_str.split(", ")
        meta_state = meta_keywords_ls_list[6]
        meta_city = meta_keywords_ls_list[5]
        meta_loc_cat = meta_keywords_ls_list[4]
        item_ls["state"] = u"BR-" + meta_state
        item_ls["city"] = u"BR-" + meta_state + "-" + meta_city
        item_ls["loc_cat"] = meta_loc_cat

        # Get the domain name using the TLD module
        domain = get_tld(response.url)
        item_ls["domain"] = domain

        # Record the crawl datetime and the link category
        item_ls["datetime"] = datetime.now()
        item_ls["link_cat"] = "ls"
        yield item_ls

    # Callback for item type DS
    def parse_items_ds(self, response):
        self.response_url = response.url
        sel = Selector(response)
        items_ds = []
        item_ds = apontadordsItem()
        item_ds["session_id"] = self.session_id
        item_ds["depth"] = response.meta["depth"]
        item_ds["current_url"] = response.url

        # Get the site name from the metadata
        meta_site = sel.xpath("//meta[@property='og:site_name']/@content").extract()
        item_ds["site_name"] = u''.join(meta_site)

        # Get the location name from the metadata
        meta_loc_name = sel.xpath("//meta[@property='og:title']/@content").extract()
        item_ds["loc_name"] = u''.join(meta_loc_name)

        # Get the location source id from the metadata
        meta_loc_source_id = sel.xpath("//meta[@name='apontador:place-id']/@content").extract()
        item_ds["loc_source_id"] = ''.join(meta_loc_source_id)

        # Get the location street address from the metadata
        meta_loc_address = sel.xpath("//meta[@property='business:contact_data:street_address']/@content").extract()
        meta_loc_address_str = u''.join(meta_loc_address)
        meta_loc_address_list = meta_loc_address_str.split(", ")
        meta_loc_address_number = meta_loc_address_list[1]
        meta_loc_address_street = meta_loc_address_list[0]
        item_ds["loc_street"] = meta_loc_address_street
        item_ds["loc_number"] = meta_loc_address_number

        # Get latitude and longitude from the metadata
        meta_latitude = sel.xpath("//meta[@property='place:location:latitude']/@content").extract()
        latitude = ''.join(meta_latitude)
        meta_longitude = sel.xpath("//meta[@property='place:location:longitude']/@content").extract()
        longitude = ''.join(meta_longitude)
        coordinates = {"lng": longitude, "lat": latitude}
        item_ds["coordinates"] = coordinates

        # Get the neighborhood, loc_cat and loc_cat_sub from the meta keywords,
        # build a list and populate the fields from it
        meta_keywords_ds = sel.xpath("//meta[@name='keywords']/@content").extract()
        meta_keywords_ds_str = u''.join(meta_keywords_ds)
        meta_keywords_ds_list = meta_keywords_ds_str.split(", ")
        meta_loc_cat = meta_keywords_ds_list[9]
        meta_loc_cat_sub = meta_keywords_ds_list[8]
        meta_neighborhood = meta_keywords_ds_list[5]
        item_ds["loc_cat"] = meta_loc_cat
        item_ds["loc_cat_sub"] = meta_loc_cat_sub
        item_ds["neighborhood"] = meta_neighborhood

        # Region information
        meta_statec = sel.xpath("//meta[@property='business:contact_data:region']/@content").extract()
        meta_state = u''.join(meta_statec)
        item_ds["state"] = u"BR-" + meta_state
        meta_cityc = sel.xpath("//meta[@property='business:contact_data:locality']/@content").extract()
        meta_city = u''.join(meta_cityc)
        item_ds["city"] = u"BR-" + meta_state + "-" + meta_city
        meta_postal_code = sel.xpath("//meta[@property='business:contact_data:postal_code']/@content").extract()
        item_ds["loc_postal_code"] = ''.join(meta_postal_code)

        # Get the domain name using the TLD module
        domain = get_tld(response.url)
        item_ds["domain"] = domain

        # Record the crawl datetime and the link category
        item_ds["datetime"] = datetime.now()
        item_ds["link_cat"] = "ds"
        yield item_ds
Items file - items.py
from scrapy.item import Item, Field
class apontadorlsItem(Item):
    datetime = Field()
    session_id = Field()
    depth = Field()
    link_cat = Field()
    site_name = Field()
    domain = Field()
    current_url = Field()
    city = Field()
    state = Field()
    loc_cat = Field()
    coordinates = Field()

class apontadordsItem(Item):
    datetime = Field()
    session_id = Field()
    depth = Field()
    link_cat = Field()
    site_name = Field()
    domain = Field()
    current_url = Field()
    state = Field()
    city = Field()
    neighborhood = Field()
    loc_name = Field()
    loc_street = Field()
    loc_number = Field()
    loc_postal_code = Field()
    loc_source_id = Field()
    loc_cat = Field()
    loc_cat_sub = Field()
    coordinates = Field()
Pipelines file - pipelines.py
from scrapy.exceptions import DropItem
from scrapy_mongodb import MongoDBPipeline
class apontadorpipe(MongoDBPipeline):
    def process_item(self, item, spider):
        if self.config['buffer']:
            self.current_item += 1
            item = dict(item)
            self.item_buffer.append(item)
            if self.current_item == self.config['buffer']:
                self.current_item = 0
                return self.insert_item(self.item_buffer, spider)
            else:
                return item

        matching_item = self.collection.find_one(
            {'datetime': item['datetime'],
             'session_id': item['session_id'],
             'depth': item['depth'],
             'link_cat': item['link_cat'],
             'site_name': item['site_name'],
             'domain': item['domain'],
             'current_url': item['current_url'],
             'state': item['state'],
             'city': item['city'],
             'neighborhood': item['neighborhood'],
             'loc_name': item['loc_name'],
             'loc_street': item['loc_street'],
             'loc_number': item['loc_number'],
             'loc_postal_code': item['loc_postal_code'],
             'loc_cat': item['loc_cat'],
             'loc_cat_sub': item['loc_cat_sub'],
             'loc_source_id': item['loc_source_id'],
             'coordinates': item['coordinates']}
        )
        if matching_item is not None:
            raise DropItem(
                "Duplicate found for %s, %s" %
                item['current_url']
            )
        else:
            return self.insert_item(item, spider)
Settings file - settings.py
BOT_NAME = 'locman'
SPIDER_MODULES = ['locman.spiders']
NEWSPIDER_MODULE = 'locman.spiders'
DEPTH_LIMIT = 10000
DEFAULT_ITEM_CLASS = 'locman.items.apontador'

ITEM_PIPELINES = {
    'locman.pipelines.apontadorpipe': 100
}

# 'scrapy_mongodb.MongoDBPipeline' connection
MONGODB_URI = 'connection string'
MONGODB_DATABASE = ''
MONGODB_COLLECTION = ''

DOWNLOADER_MIDDLEWARES = {
    'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': None,
    'locman.ua.rotate_useragent.RotateUserAgentMiddleware': 400
}
Answer 1:
It looks like the item does not have the key "neighborhood". Make sure of the following:
- you have not misspelled "neighborhood"
- "neighborhood" is defined in the item class
- item['neighborhood'] is initialized in the spider
Also make sure the item has the key "neighborhood" before it is accessed in "/locman/scrapy/locman/pipelines.py", line 37, in process_item:
if item.get('neighborhood', None):
This returns None if the item has no key "neighborhood". You can also set a default value instead of None, like this:
if item.get('neighborhood', 'default_value')
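Applying the same idea to the duplicate check, one option is to build the Mongo query only from the keys the item actually carries. A rough, untested sketch of process_item along those lines (the buffering logic from the original pipeline is left out for brevity):

    def process_item(self, item, spider):
        # Build the duplicate-check query from the fields this item actually has,
        # so apontadorlsItem (which has no 'neighborhood') no longer raises KeyError.
        query = {key: item[key] for key in item.fields if key in item}
        matching_item = self.collection.find_one(query)
        if matching_item is not None:
            raise DropItem("Duplicate found for %s" % item['current_url'])
        return self.insert_item(item, spider)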
Answer 2:
Thanks a lot for the help! I found a nice workaround for my problem and it is exactly what I needed.
In pipelines.py I imported the two classes from items.py and defined two different functions, each with its own matching dict. This way I can have different duplicate-record handling and a different database write for each item class!
The new code for pipelines.py:
from scrapy.exceptions import DropItem
from scrapy_mongodb import MongoDBPipeline
from locman.items import apontadorlsItem
from locman.items import apontadordsItem
class apontadorpipe(MongoDBPipeline):
    def process_item_ds(self, item, spider):
        if self.config['buffer']:
            self.current_item += 1
            item = dict(item)
            self.item_buffer.append(item)
            if self.current_item == self.config['buffer']:
                self.current_item = 0
                return self.insert_item(self.item_buffer, spider)
            else:
                return item

        if isinstance(item, apontadordsItem):
            matching_item = self.collection.find_one(
                {'datetime': item['datetime'],
                 'session_id': item['session_id'],
                 'link_cat': item['link_cat'],
                 'site_name': item['site_name'].encode('utf-8'),
                 'domain': item['domain'],
                 'current_url': item['current_url'],
                 'state': item['state'],
                 'city': item['city'].encode('utf-8'),
                 'neighborhood': item['neighborhood'].encode('utf-8'),
                 'loc_name': item['loc_name'].encode('utf-8'),
                 'loc_street': item['loc_street'].encode('utf-8'),
                 'loc_number': item['loc_number'],
                 'loc_postal_code': item['loc_postal_code'],
                 'loc_cat': item['loc_cat'],
                 'loc_cat_sub': item['loc_cat_sub'],
                 'loc_source_id': item['loc_source_id'],
                 'loc_phone': item['loc_phone'],
                 'address': item['address'].encode('utf-8'),
                 'coordinates': item['coordinates']}
            )
            if matching_item is not None:
                raise DropItem(
                    "Duplicate found for %s, %s" %
                    (item['current_url'], item['loc_source_id'])
                )
            else:
                return self.insert_item(item, spider)

    def process_item_ls(self, item, spider):
        if self.config['buffer']:
            self.current_item += 1
            item = dict(item)
            self.item_buffer.append(item)
            if self.current_item == self.config['buffer']:
                self.current_item = 0
                return self.insert_item(self.item_buffer, spider)
            else:
                return item

        if isinstance(item, apontadorlsItem):
            matching_item = self.collection.find_one(
                {'datetime': item['datetime'],
                 'session_id': item['session_id'],
                 'link_cat': item['link_cat'],
                 'site_name': item['site_name'].encode('utf-8'),
                 'domain': item['domain'],
                 'current_url': item['current_url'],
                 'state': item['state'],
                 'city': item['city'].encode('utf-8'),
                 'loc_cat': item['loc_cat'].encode('utf-8'),
                 'coordinates': item['coordinates']}
            )
            if matching_item is not None:
                raise DropItem(
                    "Duplicate found for %s" % item['current_url']
                )
            else:
                return self.insert_item(item, spider)
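One caveat: Scrapy only ever calls process_item on a pipeline component, so the two helpers above still need a single entry point in the same apontadorpipe class that dispatches on the item class. A minimal sketch of what that entry point could look like:

    def process_item(self, item, spider):
        # Scrapy calls process_item for every yielded item; route to the
        # helper that matches the Item subclass the spider produced.
        if isinstance(item, apontadordsItem):
            return self.process_item_ds(item, spider)
        elif isinstance(item, apontadorlsItem):
            return self.process_item_ls(item, spider)
        return item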
Source: https://stackoverflow.com/questions/27098827/scrapy-crawler-to-pass-multiple-item-classes-to-pipeline