Question
I just tried running multiple spiders in the same process following the new Scrapy documentation, but I am getting: AttributeError: 'CrawlerProcess' object has no attribute 'crawl'
I found an SO post with the same problem, so I tried the code from the 0.24 documentation instead and got: runspider: error: Unable to load 'price_comparator.py': No module named testspiders.spiders.followall (testspiders is the example project the 0.24 docs import from, which I don't have).
For 1.0 I imported:
from scrapy.crawler import CrawlerProcess
and for 0.24 I imported:
from twisted.internet import reactor
from scrapy.crawler import Crawler
from scrapy import log
from testspiders.spiders.followall import FollowAllSpider
from scrapy.utils.project import get_project_settings
Based on the 0.24 docs, that code runs one spider across multiple domains, which doesn't seem to be the same as what the 1.0 doc's code does, but I ran both anyway. The code that runs both spiders sits inside the same file where my spiders are defined, so that could be a problem. Is there some internal issue with the new version of the code, or is there some dependency or code missing from my program? The file containing the code from both docs is below (I didn't run both versions at the same time).
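For reference, the pattern I'm following from the 1.0 docs boils down to this (a minimal sketch; SpiderA and SpiderB stand in for my two spider classes below):

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl(SpiderA)   # pass the spider class, not an instance
process.crawl(SpiderB)
process.start()          # blocks until both spiders finish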
Spider Class Definitions
relevant code at the bottom
import re
import json
import scrapy
from scrapy import Request
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import HtmlXPathSelector
from scrapy.selector import Selector
from scrapy.contrib.loader import ItemLoader
from scrapy.contrib.loader import XPathItemLoader
from scrapy.contrib.loader.processor import Join, MapCompose
from concert_comparator.items import ComparatorItem, ComparatorItem2
from twisted.internet import reactor
from scrapy.crawler import CrawlerProcess
#from scrapy.crawler import Crawler
from scrapy import log
#from testspiders.spiders.followall import FollowAllSpider
from scrapy.utils.project import get_project_settings
from urlparse import urljoin
bandname = raw_input("Enter a bandname \n")
#location = raw_input("Enter a city \n")
vs_url = "http://www.vividseats.com/concerts/" + bandname + "-tickets.html"
sg_url = "http://www.seatgeek.com/" + bandname + "-tickets"
#sh_url = "http://www.stubhub.com/" + bandname + "-tickets/"
#print sh_url
#rules = (Rule(LinkExtractor(allow=("concerts/" + bandname + "-tickets/" + bandname + "-" + item["ticketsLink"]),restrict_xpaths=('.//*/td[3]/a/@href',))callback = "parse_tickets" , follow = True,))
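# NOTE (sketch): the commented-out rule above is missing a comma between the
# LinkExtractor(...) argument and callback, and restrict_xpaths must select
# elements rather than @href attributes; a syntactically valid version would
# look roughly like:
# rules = (
#     Rule(LinkExtractor(allow=("concerts/" + bandname + "-tickets/",),
#                        restrict_xpaths=('.//*/td[3]',)),
#          callback="parse_tickets", follow=True),
# )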
class MySpider(CrawlSpider):
    handle_httpstatus_list = [416]
    name = 'comparator'
    allowed_domains = ["www.vividseats.com"]
    start_urls = [vs_url]
    tickets_list_xpath = './/*[@itemtype="http://schema.org/Event"]'

    def parse_json(self, response):
        loader = response.meta['loader']
        jsonresponse = json.loads(response.body_as_unicode())
        ticket_info = jsonresponse.get('tickets')
        price_list = [i.get('p') for i in ticket_info]
        ticketPrice = ''.join(price_list[0])
        loader.add_value('ticketPrice', ticketPrice)
        return loader.load_item()

    def parse_price(self, response):
        loader = response.meta['loader']
        ticketsLink = loader.get_output_value("ticketsLink")
        json_id_list = re.findall(r"(\d+)[^-]*$", ticketsLink)
        json_id = "".join(json_id_list)
        json_url = "http://www.vividseats.com/javascript/tickets.shtml?productionId=" + json_id
        yield scrapy.Request(json_url, meta={'loader': loader}, callback=self.parse_json, dont_filter=True)
    def parse(self, response):
        selector = HtmlXPathSelector(response)
        # iterate over tickets
        for ticket in selector.select(self.tickets_list_xpath):
            loader = XPathItemLoader(ComparatorItem(), selector=ticket)
            # define loader processors
            loader.default_input_processor = MapCompose(unicode.strip)
            loader.default_output_processor = Join()
            # add an xpath to the loader for each field
            loader.add_xpath('eventName', './/*[@class="productionsEvent"]/text()')
            loader.add_xpath('eventLocation', './/*[@class="productionsVenue"]/span[@itemprop="name"]/text()')
            loader.add_xpath('ticketsLink', './/*/a[@class="btn btn-primary"]/@href')
            loader.add_xpath('eventDate', './/*[@class="productionsDate"]/text()')
            loader.add_xpath('eventCity', './/*[@class="productionsVenue"]/span[@itemprop="address"]/span[@itemprop="addressLocality"]/text()')
            loader.add_xpath('eventState', './/*[@class="productionsVenue"]/span[@itemprop="address"]/span[@itemprop="addressRegion"]/text()')
            loader.add_xpath('eventTime', './/*[@class="productionsTime"]/text()')
            print "Here is ticket link \n" + loader.get_output_value("ticketsLink")
            ticketsURL = "concerts/" + bandname + "-tickets/" + bandname + "-" + loader.get_output_value("ticketsLink")
            ticketsURL = urljoin(response.url, ticketsURL)
            yield scrapy.Request(ticketsURL, meta={'loader': loader}, callback=self.parse_price, dont_filter=True)
class MySpider2(CrawlSpider):
    handle_httpstatus_list = [416]
    name = 'comparator2'
    allowed_domains = ["seatgeek.com"]  # no trailing slash in allowed_domains
    start_urls = [sg_url]
    tickets_list_xpath = './/*[@itemtype="http://schema.org/Event"]'

    def parse_json2(self, response):
        loader = response.meta['loader']
        jsonresponse = json.loads(response.body_as_unicode())
        listings_info = jsonresponse.get('listings')
        price_list = [i.get('pf') for i in listings_info]  # was ticket_info, an undefined name
        ticketPrice = price_list[0]
        loader.add_value('ticketPrice', ticketPrice)
        return loader.load_item()

    def parse_price2(self, response):
        loader = response.meta['loader']
        ticketsLink = loader.get_output_value("ticketsLink")
        json_id = ticketsLink.split('/')[6]
        json_url = "https://seatgeek.com/listings?client_id=MTY2MnwxMzgzMzIwMTU4&id=" + json_id + "&_wt=1&&_=1436364489501"
        # callback must be this spider's parse_json2; parse_json is not defined on this class
        yield scrapy.Request(json_url, meta={'loader': loader}, callback=self.parse_json2, dont_filter=True)

    # the callback for start_urls must be named parse; a method named parse2 is never called
    def parse(self, response):
        selector = HtmlXPathSelector(response)
        # iterate over tickets
        for ticket in selector.select(self.tickets_list_xpath):
            loader = XPathItemLoader(ComparatorItem(), selector=ticket)
            # define loader processors
            loader.default_input_processor = MapCompose(unicode.strip)
            loader.default_output_processor = Join()
            # add an xpath to the loader for each field
            loader.add_xpath('eventName', './/a[@class="event-listing-title"]/span[@itemprop="name"]/text()')
            loader.add_xpath('eventLocation', './/a[@class="event-listing-venue-link"]/span[@itemprop="name"]/text()')
            loader.add_xpath('ticketsLink', '//a[@class="event-listing-button"]/@href')
            loader.add_xpath('eventDate', '//div[@class="event-listing-date"]/text()')
            loader.add_xpath('eventCity', './/span[@itemprop="addressLocality"]/text()')
            loader.add_xpath('eventState', './/span[@itemprop="addressRegion"]/text()')
            loader.add_xpath('eventCountry', './/span[@itemprop="addressCountry"]/text()')
            loader.add_xpath('eventTime', '//div[@class="event-listing-time"]/text()')
            # Request needs an absolute URL with a scheme
            tickets_url = urljoin("https://seatgeek.com/", loader.get_output_value("ticketsLink"))
            yield scrapy.Request(tickets_url, meta={'loader': loader}, callback=self.parse_price2, dont_filter=True)
# 0.24 code
# def setup_crawler(domain):
#     spider = FollowAllSpider(domain=domain)
#     settings = get_project_settings()
#     crawler = Crawler(settings)
#     crawler.configure()
#     crawler.crawl(spider)
#     crawler.start()
#
# for domain in [vs_url, sg_url]:
#     setup_crawler(domain)
# log.start()
# reactor.run()
# 1.0 code
# only one CrawlerProcess should be constructed; the second assignment here
# was overwriting the first, so pass either the project settings or an
# ad-hoc settings dict, not both
process = CrawlerProcess(get_project_settings())
# process = CrawlerProcess({
#     'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
# })
process.crawl(MySpider)
process.crawl(MySpider2)
process.start()
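For what it's worth, the 1.0 docs also describe a CrawlerRunner-based variant that drives the Twisted reactor explicitly instead of letting CrawlerProcess manage it; a minimal sketch, assuming the same two spider classes defined above:

from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings

configure_logging()
runner = CrawlerRunner(get_project_settings())
runner.crawl(MySpider)
runner.crawl(MySpider2)
d = runner.join()                    # deferred that fires when both crawls finish
d.addBoth(lambda _: reactor.stop())  # then stop the reactor
reactor.run()                        # blocks until the crawls are done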
Source: https://stackoverflow.com/questions/31303197/proper-way-to-run-multiple-scrapy-spiders