scrapy crawler to pass multiple item classes to pipeline


Question


Hi, I am very new to Python and Scrapy. This is my first project and I can't solve a problem that looks pretty basic.

I have the crawler set up to do two things: 1) find all pagination URLs, visit them and get some data from each page; 2) get all the links listed on the results pages, visit them and crawl each location's data.

I decide which parser handles each page using rules with callbacks, and I created two classes inside items.py, one for each parser.

The second rule is processed perfectly, but the first is not being processed and I can't find where the error is.

This is the error message I get in the terminal while running the crawler:

    2014-11-24 02:30:39-0200 [apontador] ERROR: Error processing {'city': u'BR-SP-S\xe3o Paulo',
     'coordinates': {'lat': u'-23.56588', 'lng': u'-46.64777'},
    'current_url': 'http://www.apontador.com.br/local/search.html?q=supermercado&loc_z=S%C3%A3o+Paulo%2C+SP&loc=S%C3%A3o+Paulo%2C+SP&loc_y=S%C3%A3o+Paulo%2C+SP',
    'datetime': datetime.datetime(2014, 11, 24, 2, 30, 39, 703972),
    'depth': 0,
    'domain': 'apontador.com.br',
     'link_cat': 'ls',
     'loc_cat': u'supermercado',
     'session_id': -1,
     'site_name': u'Apontador',
     'state': u'BR-SP'}
    Traceback (most recent call last):
      File "/usr/local/lib/python2.7/dist-packages/scrapy/middleware.py", line 62, in _process_chain
        return process_chain(self.methods[methodname], obj, *args)
      File "/usr/local/lib/python2.7/dist-packages/scrapy/utils/defer.py", line 65, in process_chain
        d.callback(input)
      File "/usr/lib/python2.7/dist-packages/twisted/internet/defer.py", line 382, in callback
        self._startRunCallbacks(result)
      File "/usr/lib/python2.7/dist-packages/twisted/internet/defer.py", line 490, in _startRunCallbacks
        self._runCallbacks()
    --- <exception caught here> ---
      File "/usr/lib/python2.7/dist-packages/twisted/internet/defer.py", line 577, in _runCallbacks
        current.result = callback(current.result, *args, **kw)
      File "/locman/scrapy/locman/pipelines.py", line 37, in process_item
        'neighborhood': item['neighborhood'],
    File "/usr/local/lib/python2.7/dist-packages/scrapy/item.py", line 50, in __getitem__
        return self._values[key]
    exceptions.KeyError: 'neighborhood'

Looking at the error message, it seems clear that Scrapy is trying to process all the fields defined in items.py, not respecting the item class used by each callback.

If you look at items.py, there are two classes: 1) apontadorlsItem, 2) apontadordsItem.

The class apontadordsItem has the key 'neighborhood', but apontadorlsItem does not. I created these two classes to support two different callback parser functions, depending on the XPath rule, because two types of pages are being crawled, each with a different set of information. The rules are working fine, as I can see in the log files, and the crawler works; the problem is in processing/saving the items.

How can I tell the pipeline to use a different matching rule depending on which item class from items.py the crawler used?

Please help, I'm stuck.

Spider file - spiders/apontador.py

import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.selector import Selector
from datetime import datetime
from tld import get_tld
from locman.items import apontadorlsItem
from locman.items import apontadordsItem

class apontador(CrawlSpider):
    name = 'apontador'
    session_id = -1
    start_urls = ["http://www.apontador.com.br/local/search.html?q=supermercado&loc_z=S%C3%A3o+Paulo%2C+SP&loc=S%C3%A3o+Paulo%2C+SP&loc_y=S%C3%A3o+Paulo%2C+SP"]
    rules = (
            # Rule for LS - Link source - Search results page
            Rule(SgmlLinkExtractor(allow=("", ),restrict_xpaths=("//nav[@class='pagination']") ), callback='parse_items_ls', follow= True),

            # Rule for DS - Data Source - Location data page
            Rule(SgmlLinkExtractor(allow=("", ),restrict_xpaths=(
                "//article[@class='poi card highlight']",
                "//li[@class='similar-place sponsored']",
                "//div[@class='recomendations']",
                "//ul[@class='similar-places-list']",
                "//article[@class='poi card']") ),
                callback='parse_items_ds',
                follow= True),
    )

    def __init__(self, session_id=-1, *args, **kwargs):
        super(apontador, self).__init__(*args, **kwargs)
        self.session_id = session_id

    def parse_start_url(self, response):
        self.response_url = response.url
        return self.parse_items_ls(response)

    # Callback item type LS
    def parse_items_ls(self, response):
        self.response_url = response.url
        sel = Selector(response)
        items_ls = []
        item_ls = apontadorlsItem()
        item_ls["session_id"] = self.session_id
        item_ls["depth"] = response.meta["depth"]
        item_ls["current_url"] = response.url

    # Get site name in metadata
        meta_site = sel.xpath("//meta[@property='og:site_name']/@content").extract()
        item_ls["site_name"] = u''.join(meta_site)

    # Get latitude and longitude in metadata
        meta_latitude = sel.xpath("//meta[@name='apontador:latitude']/@content").extract()
        latitude = ''.join(meta_latitude)

        meta_longitude = sel.xpath("//meta[@name='apontador:longitude']/@content").extract()
        longitude = ''.join(meta_longitude)

    # Combine the coordinates into a dict
        coordinates = {"lng": longitude , "lat": latitude}
        item_ls["coordinates"] = coordinates

    # This item gets the strings directly from the metadata keywords and creates a list
        meta_keywords_ls = sel.xpath("//meta[@name='keywords']/@content").extract()
        meta_keywords_ls_str = u''.join(meta_keywords_ls)
        meta_keywords_ls_list = meta_keywords_ls_str.split(", ")
        meta_state = meta_keywords_ls_list[6]
        meta_city = meta_keywords_ls_list[5]
        meta_loc_cat = meta_keywords_ls_list[4]

        item_ls["state"] = u"BR-" + meta_state
        item_ls["city"] = u"BR-" + meta_state + "-" + meta_city
        item_ls["loc_cat"] = meta_loc_cat

    # This item gets the domain name using the TLD module
        domain = get_tld(response.url)
        item_ls["domain"] = domain

    # This item gets the datetime
        item_ls["datetime"] = datetime.now()

    # This item defines the link category
        item_ls["link_cat"] = "ls"
        yield item_ls


    # Callback item type DS
    def parse_items_ds(self, response):
        self.response_url = response.url
        sel = Selector(response)
        items_ds = []
        item_ds = apontadordsItem()
        item_ds["session_id"] = self.session_id
        item_ds["depth"] = response.meta["depth"]
        item_ds["current_url"] = response.url

    # Get site name in metadata
        meta_site = sel.xpath("//meta[@property='og:site_name']/@content").extract()
        item_ds["site_name"] = u''.join(meta_site)

    # Get location name in metadata
        meta_loc_name = sel.xpath("//meta[@property='og:title']/@content").extract()
        item_ds["loc_name"] = u''.join(meta_loc_name)

    # Get location source id in metadata
        meta_loc_source_id = sel.xpath("//meta[@name='apontador:place-id']/@content").extract()
        item_ds["loc_source_id"] = ''.join(meta_loc_source_id)

    # Get location street address in metadata
        meta_loc_address = sel.xpath("//meta[@property='business:contact_data:street_address']/@content").extract()
        meta_loc_address_str = u''.join(meta_loc_address)
        meta_loc_address_list = meta_loc_address_str.split(", ")
        meta_loc_address_number = meta_loc_address_list[1]
        meta_loc_address_street = meta_loc_address_list[0]
        item_ds["loc_street"] = meta_loc_address_street 
        item_ds["loc_number"] = meta_loc_address_number 

    # Get latitude and longitude in metadata
        meta_latitude = sel.xpath("//meta[@property='place:location:latitude']/@content").extract()
        latitude = ''.join(meta_latitude)

        meta_longitude = sel.xpath("//meta[@property='place:location:longitude']/@content").extract()
        longitude = ''.join(meta_longitude)

        coordinates = {"lng": longitude , "lat": latitude}
        item_ds["coordinates"] = coordinates

    # This item gets the neighborhood, loc_cat and loc_cat_sub from the metadata keywords, creates a list and populates the fields from the list
        meta_keywords_ds = sel.xpath("//meta[@name='keywords']/@content").extract()
        meta_keywords_ds_str = u''.join(meta_keywords_ds)
        meta_keywords_ds_list = meta_keywords_ds_str.split(", ")
        meta_loc_cat = meta_keywords_ds_list[9]
        meta_loc_cat_sub = meta_keywords_ds_list[8]
        meta_neighborhood = meta_keywords_ds_list[5]

        item_ds["loc_cat"] = meta_loc_cat
        item_ds["loc_cat_sub"] = meta_loc_cat_sub
        item_ds["neighborhood"] = meta_neighborhood

    # Region informations
        meta_statec = sel.xpath("//meta[@property='business:contact_data:region']/@content").extract()
        meta_state = u''.join(meta_statec)
        item_ds["state"] = u"BR-" + meta_state

        meta_cityc = sel.xpath("//meta[@property='business:contact_data:locality']/@content").extract()
        meta_city = u''.join(meta_cityc)
        item_ds["city"] = u"BR-" + meta_state + "-" + meta_city

        meta_postal_code = sel.xpath("//meta[@property='business:contact_data:postal_code']/@content").extract()
        item_ds["loc_postal_code"] = ''.join(meta_postal_code)

    # This item gets the domain name using the TLD module
        domain = get_tld(response.url)
        item_ds["domain"] = domain

    # This item gets the datetime
        item_ds["datetime"] = datetime.now()

        item_ds["link_cat"] = "ds"
        yield item_ds

Items file - items.py

from scrapy.item import Item, Field

class apontadorlsItem(Item):
    datetime = Field()
    session_id = Field()
    depth = Field()
    link_cat = Field()
    site_name = Field()
    domain = Field()
    current_url = Field()
    city = Field()
    state = Field()
    loc_cat = Field()
    coordinates = Field()

class apontadordsItem(Item):
    datetime = Field()
    session_id = Field()
    depth = Field()
    link_cat = Field()
    site_name = Field()
    domain = Field()
    current_url = Field()
    state = Field()
    city = Field()
    neighborhood = Field()
    loc_name = Field()
    loc_street = Field()
    loc_number = Field()
    loc_postal_code = Field()
    loc_source_id = Field()
    loc_cat = Field()
    loc_cat_sub = Field()
    coordinates = Field()

Pipelines file - pipelines.py

from scrapy.exceptions import DropItem
from scrapy_mongodb import MongoDBPipeline

class apontadorpipe(MongoDBPipeline):

    def process_item(self, item, spider):
        if self.config['buffer']:
            self.current_item += 1
            item = dict(item)

            self.item_buffer.append(item)

            if self.current_item == self.config['buffer']:
                self.current_item = 0
                return self.insert_item(self.item_buffer, spider)
            else:
                return item

        matching_item = self.collection.find_one(
            {'datetime': item['datetime'],
             'session_id': item['session_id'],
             'depth': item['depth'],
             'link_cat': item['link_cat'],
             'site_name': item['site_name'],
             'domain': item['domain'],
             'current_url': item['current_url'],
             'state': item['state'],
             'city': item['city'],
             'neighborhood': item['neighborhood'],
             'loc_name': item['loc_name'],
             'loc_street': item['loc_street'],
             'loc_number': item['loc_number'],
             'loc_postal_code': item['loc_postal_code'],
             'loc_cat': item['loc_cat'],
             'loc_cat_sub': item['loc_cat_sub'],
             'loc_source_id': item['loc_source_id'],
             'coordinates': item['coordinates']}
        )

        if matching_item is not None:
            raise DropItem(
                "Duplicate found for %s, %s" %
                item['current_url']
            )
        else:
            return self.insert_item(item, spider)

Settings file - settings.py

BOT_NAME = 'locman'

SPIDER_MODULES = ['locman.spiders']
NEWSPIDER_MODULE = 'locman.spiders'
DEPTH_LIMIT = 10000

DEFAULT_ITEM_CLASS = 'locman.items.apontador'

ITEM_PIPELINES = {
    'locman.pipelines.apontadorpipe': 100
}

# 'scrapy_mongodb.MongoDBPipeline' connection
MONGODB_URI = 'connection string'
MONGODB_DATABASE = ''
MONGODB_COLLECTION = ''

DOWNLOADER_MIDDLEWARES = {
        'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware' : None,
        'locman.ua.rotate_useragent.RotateUserAgentMiddleware' :400
    }

Answer 1:


It looks like the item does not have the key "neighborhood". Make sure of the following:

  1. You have not misspelled "neighborhood".
  2. "neighborhood" is defined in the item class.
  3. item['neighborhood'] is initialized in the spider.

Make sure that the item has the key "neighborhood" before it is used in "/locman/scrapy/locman/pipelines.py", line 37, in process_item:

    if item.get('neighborhood', None):

This returns None if the item has no key "neighborhood". You can also set a default value instead of None, like this:

    if item.get('neighborhood', 'default_value'):
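
A side note (not part of the original answer): instead of hard-coding every key, the duplicate check could be built from whatever fields the incoming item actually populated, so one process_item handles both item classes. A minimal sketch, assuming the same scrapy_mongodb MongoDBPipeline base as in the question and leaving out the buffer logic:

    from scrapy.exceptions import DropItem
    from scrapy_mongodb import MongoDBPipeline

    class apontadorpipe(MongoDBPipeline):

        def process_item(self, item, spider):
            # Sketch only: the buffer handling from the question is omitted.
            # dict(item) holds just the fields this item class populated, so the
            # same query works for apontadorlsItem and apontadordsItem without
            # raising KeyError on missing fields such as 'neighborhood'.
            query = dict(item)
            if self.collection.find_one(query) is not None:
                raise DropItem("Duplicate found for %s" % item['current_url'])
            return self.insert_item(item, spider)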



Answer 2:


Thanks a lot for the help! I found a nice workaround for my problem and it is exactly what I needed.

In pipelines.py I imported the two item classes from items.py and defined two different functions, with a separate query dict for each. This way I can have different duplicate-record handling and a different database write for each item class.

The new code for pipelines.py:

from scrapy.exceptions import DropItem
from scrapy_mongodb import MongoDBPipeline

from locman.items import apontadorlsItem
from locman.items import apontadordsItem

class apontadorpipe(MongoDBPipeline):

    def process_item_ds(self, item, spider):
        if self.config['buffer']:
            self.current_item += 1
            item = dict(item)

            self.item_buffer.append(item)

            if self.current_item == self.config['buffer']:
                self.current_item = 0
                return self.insert_item(self.item_buffer, spider)
            else:
                return item

        if isinstance(item, apontadordsItem):
            matching_item = self.collection.find_one(
                {'datetime': item['datetime'],
                'session_id': item['session_id'],
                'link_cat': item['link_cat'],
                'site_name': item['site_name'].encode('utf-8'),
                'domain': item['domain'],
                'current_url': item['current_url'],
                'state': item['state'],
                'city': item['city'].encode('utf-8'),
                'neighborhood': item['neighborhood'].encode('utf-8'),
                'loc_name': item['loc_name'].encode('utf-8'),
                'loc_street': item['loc_street'].encode('utf-8'),
                'loc_number': item['loc_number'],
                'loc_postal_code': item['loc_postal_code'],
                'loc_cat': item['loc_cat'],
                'loc_cat_sub': item['loc_cat_sub'],
                'loc_source_id': item['loc_source_id'],
                'loc_phone': item['loc_phone'],
                'address': item['address'].encode('utf-8'),
                'coordinates': item['coordinates']}
            )

            if matching_item is not None:
                raise DropItem(
                    "Duplicate found for %s, %s" %
                    (item['current_url'], item['loc_source_id'])
                )
            else:
                return self.insert_item(item, spider)

    def process_item_ls(self, item, spider):
        if self.config['buffer']:
            self.current_item += 1
            item = dict(item)

            self.item_buffer.append(item)

            if self.current_item == self.config['buffer']:
                self.current_item = 0
                return self.insert_item(self.item_buffer, spider)
            else:
                return item

        if isinstance(item, apontadorlsItem):
            matching_item = self.collection.find_one(
                {'datetime': item['datetime'],
                'session_id': item['session_id'],
                'link_cat': item['link_cat'],
                'site_name': item['site_name'].encode('utf-8'),
                'domain': item['domain'],
                'current_url': item['current_url'],
                'state': item['state'],
                'city': item['city'].encode('utf-8'),
                'loc_cat': item['loc_cat'].encode('utf-8'),
                'coordinates': item['coordinates']}
            )

            if matching_item is not None:
                raise DropItem(
                    "Duplicate found for %s" % item['current_url']
                )
            else:
                return self.insert_item(item, spider)
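
One caveat (a sketch, not from the original answer): Scrapy only ever calls a pipeline's process_item() method, so the two helpers above will not run by themselves. A small dispatcher inside apontadorpipe, along these lines, would route each item to the right helper:

    class apontadorpipe(MongoDBPipeline):
        # ... process_item_ds() and process_item_ls() as above ...

        def process_item(self, item, spider):
            # Scrapy calls only process_item(); dispatch on the item class
            # that the spider yielded.
            if isinstance(item, apontadordsItem):
                return self.process_item_ds(item, spider)
            if isinstance(item, apontadorlsItem):
                return self.process_item_ls(item, spider)
            return item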


Source: https://stackoverflow.com/questions/27098827/scrapy-crawler-to-pass-multiple-item-classes-to-pipeline
