Scraping Taobao product data with scrapy + selenium and storing it in MongoDB

Submitted by Anonymous (unverified) on 2019-12-02 22:56:40

1. Settings

# MongoDB settings
MONGO_URI = 'localhost'
MONGO_DB = 'taobao'

# Search keywords
KEYWORDS = ['小米手机', '华为手机']

# Maximum number of pages to crawl per keyword
MAX_PAGE = 2

# Selenium page-load / wait timeout (seconds)
SELENIUM_TIMEOUT = 20

# Ignore robots.txt
ROBOTSTXT_OBEY = False

# Downloader middleware
DOWNLOADER_MIDDLEWARES = {
    'taobaoSpider.middlewares.SeleniumMiddleware': 300,
}

# Item pipelines
ITEM_PIPELINES = {
    # 'taobaoSpider.pipelines.TaobaospiderPipeline': 300,
    'taobaoSpider.pipelines.MongoPipeline': 400,
}
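For reference, here is a minimal sketch (independent of the project code) of the search URLs these settings produce once the spider combines KEYWORDS with the base search URL used below. The keywords are URL-encoded because they contain Chinese characters; the page number itself is not part of the URL, it is passed through request.meta and typed into the pager by the Selenium middleware.

# Sketch: preview the requests the spider will generate from the settings above.
from urllib.parse import quote

KEYWORDS = ['小米手机', '华为手机']
MAX_PAGE = 2
base_url = 'https://s.taobao.com/search?q='

for keyword in KEYWORDS:
    for page in range(1, MAX_PAGE + 1):
        print(base_url + quote(keyword), '-> page', page)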
2. Item
import scrapy


class TaobaospiderItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    price = scrapy.Field()
    sales = scrapy.Field()
    shop = scrapy.Field()
    location = scrapy.Field()
    image = scrapy.Field()
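A quick usage sketch: a scrapy Item behaves like a dict whose keys are restricted to the declared fields, which is why the pipeline later can simply call dict(item) before inserting into MongoDB. The import path assumes the standard scrapy project layout (taobaoSpider/items.py); the field values are made up.

# Sketch: populating the item by hand (assumes the project package is importable).
from taobaoSpider.items import TaobaospiderItem

item = TaobaospiderItem()
item['title'] = '小米手机 示例'
item['price'] = '1999.00'
item['sales'] = '1000+人付款'
print(dict(item))          # {'title': ..., 'price': ..., 'sales': ...}
# item['color'] = 'red'    # would raise KeyError: field not declared on the Item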

3. Spider

# -*- coding: utf-8 -*-
from urllib.parse import quote

import scrapy

from taobaoSpider.items import TaobaospiderItem


class TaobaoSpider(scrapy.Spider):
    name = 'taobao'
    allowed_domains = ['taobao.com']
    base_url = 'https://s.taobao.com/search?q='

    def start_requests(self):
        # Read parameters from settings via self.settings.get()
        for keyword in self.settings.get('KEYWORDS'):
            for page in range(1, self.settings.get('MAX_PAGE') + 1):
                url = self.base_url + quote(keyword)
                yield scrapy.Request(url=url,
                                     callback=self.parse,
                                     meta={'page': page},  # pass the page number to the middleware
                                     dont_filter=True)     # do not deduplicate

    def parse(self, response):
        products = response.xpath(
            '//*[@id="mainsrp-itemlist"]/div[@class="m-itemlist"]/div[@class="grid g-clearfix"]/div')
        # products = response.xpath('//div[contains(@class, "item J_MouserOnverReq")]')
        for product in products:
            item = TaobaospiderItem()
            item['title'] = ''.join(product.xpath('.//div[contains(@class, "title")]/text()').extract()).strip()
            item['location'] = ''.join(product.xpath('.//div[contains(@class, "location")]/text()').extract()).strip()
            item['shop'] = ''.join(product.xpath('.//div[contains(@class, "shop")]/text()').extract()).strip()
            item['price'] = ''.join(product.xpath('.//div[contains(@class, "price")]/text()').extract()).strip()
            item['sales'] = ''.join(product.xpath('.//div[contains(@class, "deal-cnt")]/text()').extract()).strip()
            item['image'] = ''.join(product.xpath('.//div[@class="pic"]/img[contains(@class, "img")]/@data-src').extract()).strip()
            yield item
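Note that inside the per-product loop the XPaths are relative (they start with ".//"); an absolute "//div[...]" would search the whole page from every product and return the same concatenated text for each item. A small self-contained sketch with made-up HTML illustrates the difference:

# Sketch: relative vs. absolute XPath inside a loop (made-up HTML, for illustration only).
from scrapy import Selector

html = '''
<div class="item"><div class="title">item one</div></div>
<div class="item"><div class="title">item two</div></div>
'''
sel = Selector(text=html)
for product in sel.xpath('//div[@class="item"]'):
    print('relative:', product.xpath('.//div[@class="title"]/text()').extract())
    print('absolute:', product.xpath('//div[@class="title"]/text()').extract())
# relative: ['item one'] then ['item two']
# absolute: ['item one', 'item two'] both times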

4. Middleware

from logging import getLogger

from scrapy.http import HtmlResponse
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


class SeleniumMiddleware(object):
    # def __init__(self, timeout=None, service_args=[]):
    def __init__(self, timeout=None):
        self.logger = getLogger(__name__)
        self.timeout = timeout
        # self.browser = webdriver.PhantomJS(service_args=service_args)
        # Headless mode:
        # self.options = webdriver.ChromeOptions()
        # self.options.add_argument('--headless')
        # self.browser = webdriver.Chrome(chrome_options=self.options)
        self.browser = webdriver.Chrome()
        # self.browser.set_window_size(1400, 700)
        self.browser.set_page_load_timeout(self.timeout)
        self.wait = WebDriverWait(self.browser, self.timeout)
        print('timeout:', self.timeout)

    def __del__(self):
        self.browser.close()

    def process_request(self, request, spider):
        self.logger.debug('Selenium is running')
        # The page number arrives as an int via request.meta
        page = request.meta.get('page', 1)
        try:
            self.browser.get(request.url)
            print(10 * '-', request.url, 10 * '-')
            if page > 1:
                # From page 2 onwards, wait for the pager to load
                # presence_of_element_located: wait until the element exists
                input = self.wait.until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > input')))
                # element_to_be_clickable: wait until the element can be clicked
                submit = self.wait.until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit')))
                input.clear()
                # Type the page number and submit
                input.send_keys(page)
                submit.click()
            # text_to_be_present_in_element: wait until the active pager item shows the requested page number
            self.wait.until(EC.text_to_be_present_in_element((
                By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > ul > li.item.active > span'), str(page)))
            # presence_of_element_located: make sure the item list has been rendered
            self.wait.until(EC.presence_of_element_located((
                By.CSS_SELECTOR, '#mainsrp-itemlist .m-itemlist .grid.g-clearfix .item')))
            return HtmlResponse(url=request.url, body=self.browser.page_source,
                                request=request, encoding='utf-8', status=200)
        except TimeoutException:
            return HtmlResponse(url=request.url, status=500, request=request)

    # Class method: reads values from settings when the crawler builds the middleware
    @classmethod
    def from_crawler(cls, crawler):
        return cls(timeout=crawler.settings.get('SELENIUM_TIMEOUT'))
        # service_args=crawler.settings.get('PHANTOMJS_SERVICE_ARGS')
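If you want the commented-out headless mode, note that newer Selenium versions pass the browser options via the options= keyword (chrome_options= is the older, deprecated spelling). A minimal sketch, assuming a recent Selenium and a local chromedriver on PATH:

# Sketch: headless Chrome with the current Selenium options API (assumption: Selenium 4).
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument('--headless')            # run without a visible browser window
options.add_argument('--window-size=1400,700')
browser = webdriver.Chrome(options=options)   # options=, not chrome_options=
browser.get('https://s.taobao.com/search?q=%E5%B0%8F%E7%B1%B3%E6%89%8B%E6%9C%BA')
print(len(browser.page_source))               # just confirm the page was fetched
browser.quit()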

5. Pipeline (storing items in MongoDB)

import pymongo


class TaobaospiderPipeline(object):
    def process_item(self, item, spider):
        return item


# Store items in MongoDB
class MongoPipeline(object):

    # MongoDB connection parameters
    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    # Read the parameters from settings
    @classmethod
    def from_crawler(cls, crawler):
        return cls(mongo_uri=crawler.settings.get('MONGO_URI'),
                   mongo_db=crawler.settings.get('MONGO_DB'))

    # Open the database connection
    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    # Close the database connection
    def close_spider(self, spider):
        self.client.close()

    # Store each item
    def process_item(self, item, spider):
        # item.__class__.__name__ is the item's class name, used as the collection name
        name = item.__class__.__name__
        print('---------------', name, '-------------------')
        self.db[name].insert_one(dict(item))
        return item
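To check that the pipeline actually wrote something, you can query MongoDB directly. A short sketch, assuming the settings above (MongoDB on localhost, database taobao, and a collection named after the item class):

# Sketch: verify the stored documents (assumes MongoDB is running locally).
import pymongo

client = pymongo.MongoClient('localhost')
collection = client['taobao']['TaobaospiderItem']

print('documents stored:', collection.count_documents({}))
print('sample document:', collection.find_one())
client.close()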

Original article: https://www.cnblogs.com/ray-mmss/p/9388390.html
