1.创建scrapy项目
dos窗口输入:
scrapy startproject quote
cd quote
2.编写items.py文件(相当于编写模板,需要爬取的数据在这里定义)
import scrapy
class QuoteItem(scrapy.Item):
    """Container for one scraped quote: its text, its author, and its tags."""
    # Define the fields for your item here; the spider's parse() fills them in.
    text = scrapy.Field()
    author = scrapy.Field()
    tags = scrapy.Field()
3.创建爬虫文件
dos窗口输入:
scrapy genspider myspider quotes.toscrape.com
4.编写myspider.py文件(接收响应,处理数据)
# -*- coding: utf-8 -*-
import scrapy
from quote.items import QuoteItem
class MyspiderSpider(scrapy.Spider):
    """Crawl quotes.toscrape.com and yield one QuoteItem per quote."""
    name = 'myspider'
    allowed_domains = ['quotes.toscrape.com']
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        """Extract every quote on the page, then follow the pagination link.

        Yields QuoteItem objects and, when a next page exists, a Request
        back into this same callback.
        """
        for quote in response.xpath('//div[@class="quote"]'):
            item = QuoteItem()
            # extract_first() returns None instead of raising IndexError
            # when the node is missing.
            item['text'] = quote.xpath('./span/text()').extract_first()
            item['author'] = quote.xpath('.//small/text()').extract_first()
            # A Python list cannot be stored in a MySQL column, so the
            # tags are joined into a single '/'-separated string.
            tags = quote.xpath('.//a[@class="tag"]/text()').extract()
            item['tags'] = '/'.join(tags)
            yield item
        # The last page has no <li class="next">; the original
        # extract()[0] raised IndexError there. Guard against None instead.
        next_page = response.xpath('//li[@class="next"]/a/@href').extract_first()
        if next_page is not None:
            yield scrapy.Request(url=response.urljoin(next_page), callback=self.parse)
5.编写pipelines.py(存储数据)
存储到mysql
import pymysql.cursors
class QuotePipeline(object):
    """Persist scraped quote items into a local MySQL `quote` table."""

    def __init__(self):
        # NOTE(review): credentials are hard-coded; assumes a local MySQL
        # server with a 'quotes' database and an empty root password.
        self.connect = pymysql.connect(
            host='localhost',
            user='root',
            password='',
            database='quotes',
            charset='utf8',
        )
        self.cursor = self.connect.cursor()

    def process_item(self, item, spider):
        """Insert one item; roll back the transaction if the insert fails."""
        item = dict(item)
        # Parameterized query: values are bound by the driver, never
        # interpolated into the SQL string.
        sql = 'insert into quote(text,author,tags) values(%s,%s,%s)'
        try:
            self.cursor.execute(sql, (item['text'], item['author'], item['tags']))
            self.connect.commit()
        except Exception:
            # Without a rollback a failed INSERT left the transaction open
            # and could poison every later item on this connection.
            self.connect.rollback()
            raise
        return item

    def close_spider(self, spider):
        """Release the cursor and the connection when the crawl ends."""
        self.cursor.close()
        self.connect.close()
改进版:
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql.cursors
class QuotePipeline(object):
    """Generic MySQL pipeline: builds the INSERT from the item's own fields,
    so it works for any Item class without editing the SQL by hand."""

    def __init__(self):
        # NOTE(review): credentials are hard-coded; assumes a local MySQL
        # server with a 'quotes' database and an empty root password.
        self.connect = pymysql.connect(
            host='localhost',
            user='root',
            password='',
            database='quotes',
            charset='utf8',
        )
        self.cursor = self.connect.cursor()

    def process_item(self, item, spider):
        """Insert one item into the `quote` table, deriving columns from keys."""
        item = dict(item)
        table = 'quote'
        # Column list comes from the Item's declared fields; the values are
        # bound as parameters (%s placeholders), never string-interpolated.
        keys = ','.join(item.keys())
        values = ','.join(['%s'] * len(item))
        sql = 'insert into {table}({keys}) values({values})'.format(
            table=table, keys=keys, values=values)
        try:
            if self.cursor.execute(sql, tuple(item.values())):
                self.connect.commit()
                print("Successful!")
        except Exception as exc:
            # The original bare `except:` also swallowed KeyboardInterrupt /
            # SystemExit and hid the actual error; catch Exception and show it.
            print("Failed!", exc)
            self.connect.rollback()
        return item

    def close_spider(self, spider):
        """Release the cursor and the connection when the crawl ends."""
        self.cursor.close()
        self.connect.close()
存储到mongoDB
1.在setting文件设置2个属性
MONGO_URI = 'localhost'  # MongoDB host, read by MongoPipeline.from_crawler()
MONGO_DB = 'study'       # target database name
# Enable exactly one pipeline; the number (300) is its run order.
ITEM_PIPELINES = {
    # 'quote.pipelines.QuotePipeline': 300,
    'quote.pipelines.MongoPipeline': 300,
}
2.pipelines.py
import pymongo
class MongoPipeline(object):
    """Persist scraped items into MongoDB.

    Connection parameters come from the Scrapy settings MONGO_URI and
    MONGO_DB via from_crawler().
    """
    # Name of the target collection ("table").
    collection = 'student'

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        # Scrapy calls this factory so the pipeline can read project settings.
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DB'),
        )

    def open_spider(self, spider):
        """Open the client once per crawl, not per item."""
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        # Collection.insert() was deprecated in pymongo 3.0 and removed in
        # 4.0; insert_one() is the supported single-document API.
        self.db[self.collection].insert_one(dict(item))
        return item
6.编写settings.py(设置headers,pipelines等)
robots协议
# Obey robots.txt rules — disabled here so the crawl is not blocked by
# the site's robots.txt directives.
ROBOTSTXT_OBEY = False
headers
# Headers sent with every request; a real browser User-Agent avoids
# naive bot blocking.
DEFAULT_REQUEST_HEADERS = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    # 'Accept-Language': 'en',
}
pipelines
# Register the MySQL pipeline; 300 is its order (lower numbers run first).
ITEM_PIPELINES = {
    'quote.pipelines.QuotePipeline': 300,
}
7.运行爬虫
dos窗口输入:
scrapy crawl myspider
运行结果
来源:oschina
链接:https://my.oschina.net/u/4350354/blog/3622551