1、spider目录下爬虫项目(duo_te_rui.py)
# -*- coding: utf-8 -*-
import scrapy
import re
from Duo_te_rui.items import DuoTeRuiItem
class DuoTeRuiSpider(scrapy.Spider):
    """Spider that scrapes essential-oil listings from doterra.cn.

    Crawls pages 1-3 of the single-oils product list and yields one
    DuoTeRuiItem per product (name, image URL, specification).
    """

    # Spider name used by `scrapy crawl duo_te_rui`
    name = 'duo_te_rui'
    # Domains the spider is allowed to crawl
    allowed_domains = ['doterra.cn']
    # Start URLs are generated in start_requests() instead
    start_urls = []

    def re_html(self, data):
        """Strip HTML tags, newlines and ideographic spaces from *data*.

        Returns the cleaned string, or None if cleaning fails
        (best-effort helper; failures are swallowed by design).
        """
        try:
            message = str(data)
            re_h = re.compile(r'</?\w+[^>]*>')  # matches an HTML tag
            ret1 = re_h.sub('', message)
            ret2 = re.sub(r'\n', '', ret1)
            ret3 = re.sub(r'\u3000', '', ret2)
            return ret3
        except Exception:
            # Keep the original best-effort contract: return None on failure
            # instead of crashing the crawl.
            return None

    def start_requests(self):
        """Generate requests for pages 1-3 of the single-oils listing."""
        for page in range(1, 4):
            yield scrapy.Request(
                url='http://doterra.cn/product/single-oils.html?p={}'.format(page),
                callback=self.parse)

    def parse(self, response):
        """Extract one item per product entry on the listing page."""
        # List of essential-oil <li> entries on the page
        item_list = response.xpath('//div[@class="main"]/div[2]/div[2]/ul/li')
        for data in item_list:
            # BUG FIX: create a fresh item per entry — reusing one instance
            # across iterations makes every yielded item alias the same object.
            item = DuoTeRuiItem()
            item['精油名'] = data.xpath('./div/h2/a/text()').extract_first()
            item['图片链接'] = data.xpath('./a/img/@src').extract_first()
            item['精油规格'] = data.xpath('./div/div[2]/span/text()').extract_first()
            print(item)
            # BUG FIX: the yield was commented out, so the pipeline
            # (DuoTeRuiPipeline) never received any items.
            yield item
2、将抓取数据导入数据库(新建一个目录templates(Duo_Te_rui.py))
import json
import requests
import sys
import time
import chardet
# Create the engine instance and connect to the target database
from sqlalchemy import create_engine
from sqlalchemy import Column
from sqlalchemy import Integer
from sqlalchemy import String
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
# NOTE(review): the first create_engine() call was immediately overwritten by
# the second, so it is kept commented out for reference only.
# engine = create_engine(
#     "mysql+pymysql://root:mysql@192.168.75.128/date?charset=utf8",
#     encoding='utf-8', echo=True)

# BUG FIX: the password "Ecmoho@2018" contains '@', which must be URL-encoded
# as %40 inside a SQLAlchemy URL, otherwise the host part cannot be parsed.
# echo=True logs all emitted SQL; NOTE(review): credentials should live in
# config/environment, not in source — TODO move them out.
engine = create_engine(
    'mysql+pymysql://root:Ecmoho%402018@192.168.1.119/yi_heng?charset=utf8',
    encoding='utf-8', echo=True)
# Create the session class bound to the database. Note: sessionmaker() returns a class, not an instance.
session_class = sessionmaker(bind=engine)  # session factory bound to the engine
session = session_class()  # module-level session instance, used like a cursor
Base = declarative_base()  # declarative base class for the ORM models below
class parseData(Base):
    """ORM model for the `jing_you_data` table; also persists scraped items.

    Instantiating it creates the table if needed; `to_mysql(item)` copies the
    scraped fields onto the row and commits it.
    """

    # BUG FIX: the declarative attribute must be __tablename__ (with double
    # underscores); a plain `tablename` is silently ignored by SQLAlchemy.
    __tablename__ = 'jing_you_data'

    id = Column(Integer, primary_key=True)
    jing_you_name = Column(String(50))  # essential-oil name
    image_url = Column(String(250))     # product image URL
    gui_ge = Column(String(25))         # product specification/size

    def __init__(self):
        # Create the table structure if it does not exist yet (idempotent,
        # but note this runs on every instantiation — one call per item).
        Base.metadata.create_all(engine)

    def insert(self):
        """Add this row to the shared session and commit; roll back on error."""
        try:
            session.add(self)   # stage the new row
            session.commit()    # write it to MySQL
        except Exception as e:
            # Log and roll back so the session stays usable for later items.
            print(e)
            session.rollback()

    def to_mysql(self, item):
        """Copy fields from a scraped item onto this row and save it.

        `item` is expected to be a DuoTeRuiItem / dict with the Chinese keys
        produced by the spider.
        """
        self.jing_you_name = item['精油名']
        self.image_url = item['图片链接']
        self.gui_ge = item['精油规格']
        self.insert()
3、在pipelines里面将数据传入templates目录下的Duo_Te_rui.py,将数据导入mysql数据库
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
from Duo_te_rui.templates.Duo_Te_rui import *
class DuoTeRuiPipeline(object):
    """Item pipeline that writes each scraped item to MySQL via parseData."""

    def process_item(self, item, spider):
        # Build a fresh ORM row per item and persist the item's fields.
        row = parseData()
        row.to_mysql(item)
        # Return the item unchanged so any later pipelines still receive it.
        return item