I. Python modules
1. Basic usage of urllib.request
import urllib.request

# 1. Specify the URL
url = 'http://www.baidu.com'
# 2. Send the request: urlopen sends a request to the given url and returns a response object
response = urllib.request.urlopen(url=url)
# 3. Get the page data: read() returns the page data stored in the response object (bytes)
response_text = response.read()
# 4. Persist it to disk
with open('./baidu.html','wb') as f:
    f.write(response_text)
print('written successfully')
Handling Chinese characters in URLs with urllib.request
import urllib.request
import urllib.parse

# 1. Specify the URL
# url = 'https://tieba.baidu.com/f?ie=utf-8&kw=你好&fr=search'
url = 'https://tieba.baidu.com/f?ie=utf-8&kw=%s&fr=search'
# The URL must not contain non-ASCII characters
wd = urllib.parse.quote("你好")
new_url = format(url % wd)
print(new_url)
# 2. Send the request: urlopen sends a request to the given url and returns a response object
response = urllib.request.urlopen(url=new_url)
# 3. Get the page data: read() returns the page data stored in the response object (bytes)
response_text = response.read()
# 4. Persist it to disk
with open('./baidu1.html','wb') as f:
    f.write(response_text)
print('written successfully')
Spoofing the User-Agent request header with urllib.request
import urllib.request

# 1. Specify the URL
url = 'http://www.baidu.com'
# Build a custom request object
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36",
}
# This request object carries a spoofed UA
request = urllib.request.Request(url=url, headers=headers)
# 2. Send the request: urlopen sends the custom request and returns a response object
response = urllib.request.urlopen(request)
# 3. Get the page data: read() returns the page data stored in the response object (bytes)
response_text = response.read()
# 4. Persist it to disk
with open('./baidu.html','wb') as f:
    f.write(response_text)
print('written successfully')
Sending a POST request with urllib.request
import urllib.request
import urllib.parse

# 1. Specify the URL
url = 'https://fanyi.baidu.com/sug'
# Request headers
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36",
}
# This request object carries a spoofed UA
request = urllib.request.Request(url=url, headers=headers)
# Put the POST parameters into a dict
data = {
    'kw': '你好',
}
# URL-encode the dict with the parse module
data = urllib.parse.urlencode(data)
# Convert data to bytes
data = data.encode()
# 2. Send the request: urlopen sends the custom request (with data) and returns a response object
response = urllib.request.urlopen(request, data=data)
# 3. Get the page data: read() returns the page data stored in the response object (bytes)
response_text = response.read()
# 4. Persist it to disk
with open('./baidu.html','wb') as f:
    f.write(response_text)
print('written successfully')
Using a proxy with urllib.request
import urllib.request

# Create a handler object wrapping the proxy IP and port
handler = urllib.request.ProxyHandler(proxies={"http": "118.172.211.3:58535"})
# Create an opener object and use it to send the request
opener = urllib.request.build_opener(handler)
# 1. Specify the URL
url = 'http://www.baidu.com/s?ie=utf-8&wd=ip'
request = urllib.request.Request(url=url)
# 2. Send the request through the opener, which returns a response object
response = opener.open(request)
# 3. Get the page data: read() returns the page data stored in the response object (bytes)
response = response.read()
# 4. Persist it to disk
with open('./baiduip.html','wb') as f:
    f.write(response)
print('written successfully')
Sending requests with cookies via urllib.request (unverified)
# Log in to renren.com using cookiejar
import urllib.request
import urllib.parse
import http.cookiejar

# Cookies from the request are stored automatically in the cj object
cj = http.cookiejar.CookieJar()
# Create a handler object that carries the cookiejar
handler = urllib.request.HTTPCookieProcessor(cj)
# Create an opener object (carrying the handler)
opener = urllib.request.build_opener(handler)

# Let the cookiejar capture the cookies from this request
url = 'http://www.renren.com/ajaxLogin/login?1=1&uniqueTimestamp=2018110912156'
# Build a custom request object and pass it to the opener's open()
data = {
    "email": "aaa",
    "icode": "",
    "origURL": "http://www.renren.com/home",
    "domain": "renren.com",
    "key_id": 1,
    "captcha_type": "web_login",
    "password": "1ae3b707652ccb767340abb542af0616c42fc942beb275069939d6c4cc461e5c",
    "rkey": "948db2d8639bcd2664994c49454256d1",
    "f": "",
}
data = urllib.parse.urlencode(data).encode()
request = urllib.request.Request(url, data=data)
opener.open(request)
# Request a second-level page for the current user
s_url = 'http://www.renren-inc.com/zh/product/renren.html'
# This request carries the cookie
response = opener.open(s_url)
with open('./renren.html','wb') as f:
    f.write(response.read())
2. requests (pip install requests)
Sending a GET request with requests
import requests

# 1. Specify the url
url = 'https://www.baidu.com'
# 2. Send a GET request and get a response object back
response = requests.get(url=url)
# Force the encoding to utf-8 if needed
# response.encoding = 'utf-8'
# 3. Get the response body: .text is str, .content is bytes
response_content = response.content
with open('./re3.html', "wb") as f:
    f.write(response_content)
Common response attributes
import requests

# 1. Specify the url
url = 'https://www.baidu.com'
# 2. Send a GET request and get a response object back
response = requests.get(url=url)
# Body as str
# print(response.text)
# Body as bytes
# print(response.content)
# Response status code
print(response.status_code)
# Response headers
print(response.headers)
# The requested url
print(response.url)
Sending a GET request with parameters and custom headers
Method 1: put the parameters directly in the URL, e.g. url = 'https://www.baidu.com/s?ie=utf-8&wd=你好'
Method 2: use the params argument
import requests

# 1. Specify the url
url = 'https://tieba.baidu.com/f'
# GET parameters
params = {
    "ie": "utf-8",
    "kw": "你好",
    "fr": "search",
}
# Custom headers
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36"
}
# 2. Send the GET request with params and headers; get a response object back
response = requests.get(url=url, params=params, headers=headers)
# 3. Get the response body: .text is str, .content is bytes
response_content = response.content
# Print the requested url
print(response.url)
with open('./re3.html', "wb") as f:
    f.write(response_content)
Sending a POST request with requests
import requests

# 1. Specify the url
url = 'https://accounts.douban.com/login'
# POST parameters
data = {
    "source": "movie",
    "redir": "https://movie.douban.com/",
    "from_email": "xxx",
    "from_password": "xxx",
    "login": "登录",
}
# Custom headers
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36"
}
# 2. Send the POST request
response = requests.post(url=url, data=data, headers=headers)
# 3. Get the response body: .text is str, .content is bytes
response_content = response.content
# Print the requested url
print(response.url)
with open('./re3.html', "wb") as f:
    f.write(response_content)
Cookie handling with requests
import requests

session = requests.session()
# 1. Specify the url
url = 'https://accounts.douban.com/login'
# POST parameters
data = {
    "source": "movie",
    "redir": "https://movie.douban.com/",
    "from_email": "xxx",
    "from_password": "xxx",
    "login": "登录",
}
# Custom headers
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36"
}
# 2. Send the POST request through the session; the cookie is captured and stored in the session object
login_response = session.post(url=url, data=data, headers=headers)
# Request the personal home page with the session (cookie) and get the page data
url = 'https://www.douban.com/people/xxxx/'
response = session.get(url=url, headers=headers)
# 3. Get the response body: .text is str, .content is bytes
response_content = response.content
# Print the requested url
print(response.url)
with open('./re3.html', "wb") as f:
    f.write(response_content)
Proxy usage with requests
Free proxy IP providers: www.goubanjia.com, Kuaidaili (快代理), Xici Proxy (西祠代理)
import requests

# 1. Specify the url. Note: the URL scheme must match the proxy scheme (both http or both https)
url = 'http://www.baidu.com/s?ie=utf-8&wd=ip'
# Custom headers
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36"
}
# Custom proxy IP
proxy = {
    "http": "118.172.211.3:58535"
}
# 2. Send the request
response = requests.get(url=url, proxies=proxy, headers=headers)
# 3. Get the response body: .text is str, .content is bytes
response_content = response.content
# 4. Persist the data
with open('./re3.html', "wb") as f:
    f.write(response_content)
print("written successfully")
3. selenium
Install: pip install selenium
Download the driver matching your browser:
Chrome: http://chromedriver.storage.googleapis.com/index.html
Version mapping table: https://blog.csdn.net/huilan_same/article/details/51896672
Coding workflow:
from selenium import webdriver
from time import sleep

# Create a browser object
bro = webdriver.Chrome('./chromedriver.exe')
# get() tells the browser to request the given url
bro.get('https://www.baidu.com')
sleep(1)
# Have Baidu search for a given keyword
text = bro.find_element_by_id('kw')  # locate the text input box
text.send_keys('人民币')  # send_keys types the given content into the box
sleep(1)
button = bro.find_element_by_id('su')
button.click()  # click performs a click
sleep(3)
bro.quit()  # close the browser
Using the headless PhantomJS browser
Download PhantomJS: http://phantomjs.org/download.html
Coding workflow:
from selenium import webdriver
import time

# Create a browser object
bro = webdriver.PhantomJS('./phantomjs-2.1.1-windows/bin/phantomjs.exe')
# get() tells the browser to request the given url
bro.get('https://www.baidu.com')
# Take a screenshot
bro.save_screenshot('./1.png')
# Have Baidu search for a given keyword
text = bro.find_element_by_id('kw')  # locate the text input box
text.send_keys('人民币')  # send_keys types the given content into the box
# Take a screenshot
bro.save_screenshot('./2.png')
button = bro.find_element_by_id('su')
button.click()  # click performs a click
# Wait two seconds for the page to finish loading
time.sleep(2)
# Take a screenshot
bro.save_screenshot('./3.png')
bro.quit()  # close the browser
Extra: scroll the page down to the bottom
js = 'window.scrollTo(0,document.body.scrollHeight)'
# Have the browser object execute the js code
bro.execute_script(js)
II. Framework (scrapy)
1. Installation
If you’re using Anaconda or Miniconda, you can install the package from the conda-forge channel, which has up-to-date packages for Linux, Windows and OS X.
conda install -c conda-forge scrapy
Windows installation, method 2:
1. pip install wheel
2. Download Twisted:
https://www.lfd.uci.edu/~gohlke/pythonlibs/#twisted
Download the .whl that matches your Python version and architecture.
Install it:
pip install **.whl
3. pip install pywin32
4. pip install scrapy
Changing the Python interpreter in PyCharm so it recognizes scrapy:
File --> Settings --> Project Interpreter, then point PyCharm at the Python environment where scrapy is installed.
2. Creating a project
Open a cmd prompt in the directory where you want the project and run: scrapy startproject projectname
3. Creating a spider
a. Create a basic spider: open cmd in the project directory (example command below)
b. Create a crawl spider: open cmd in the project directory (example command below)
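The commands look roughly as follows (the spider name and domain are placeholders, not values from the original notes):
scrapy genspider spidername www.example.com            # a. basic spider
scrapy genspider -t crawl spidername www.example.com   # b. crawl spider (CrawlSpider template)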
4. Preliminary setup
a. Comment out allowed_domains in the spider file
b. In settings.py (around line 22), change ROBOTSTXT_OBEY = True to ROBOTSTXT_OBEY = False
c. In settings.py (around line 19), set USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
d. Enable the pipeline (around lines 67-69):
ITEM_PIPELINES = {
    'mytestscrapy.pipelines.MytestscrapyPipeline': 300,
}
Persistent storage:
I. Disk files
1. Via terminal command
Make sure the parse method returns an iterable object (holding the parsed page content); a minimal sketch follows the command below.
Use a terminal command to write the data to a file on disk:
scrapy crawl <spider name> -o <output file>.<extension> (e.g. xxoo.csv)
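As an illustration only (the spider name, URL, XPath and field name here are made up, not taken from the original notes), a parse method that returns a list of dicts can then be exported with, e.g., scrapy crawl qiubai -o qiubai.csv:

import scrapy

class QiubaiSpider(scrapy.Spider):
    name = 'qiubai'
    start_urls = ['https://www.qiushibaike.com/text/']   # placeholder URL

    def parse(self, response):
        data_list = []
        for div in response.xpath('//div[@class="content"]'):
            # each dict becomes one row of the exported csv/json file
            data_list.append({'content': div.xpath('.//span//text()').extract_first()})
        return data_list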
2. Via pipelines
Store the parsed page data in an item object
Use the yield keyword to hand the item to the pipeline file for processing
def parse(self, response):
    item = MytestscrapyItem()
    item['author'] = 'hg'
    item['content'] = 'hg_content'
    yield item
Write the data-storage code in the pipeline file
class MytestscrapyPipeline(object):
    def open_spider(self, spider):
        self.f = open('t.txt', 'w')

    def process_item(self, item, spider):
        author = item['author']
        content = item['content']
        record = "author:" + author + " " + "content:" + content
        self.f.write(record)
        return item

    def close_spider(self, spider):
        self.f.close()
Enable the pipeline in the config file (settings.py, around lines 67-69)
ITEM_PIPELINES = {
    'mytestscrapy.pipelines.MytestscrapyPipeline': 300,
}
II. Databases
1. MySQL
import pymysql

class MytestscrapyPipeline(object):
    def open_spider(self, spider):
        self.db = pymysql.connect("localhost", "root", "123456", "scrapy_test")

    def process_item(self, item, spider):
        author = item['author']
        content = item['content']
        # use a parameterized query so the item fields are actually inserted
        sql = 'insert into ac(author,content) VALUES (%s,%s)'
        cursor = self.db.cursor()
        try:
            cursor.execute(sql, (author, content))
            self.db.commit()
        except Exception:
            self.db.rollback()
        return item

    def close_spider(self, spider):
        self.db.close()
2. Redis
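A minimal sketch of a Redis-backed pipeline, assuming a local Redis server, the redis package (pip install redis), the same author/content item fields as the MySQL example, and an arbitrary list key name ac_items:

import json
import redis

class RedisPipeline(object):
    def open_spider(self, spider):
        # assumes a Redis server on localhost at the default port
        self.conn = redis.Redis(host='127.0.0.1', port=6379)

    def process_item(self, item, spider):
        data = {
            'author': item['author'],
            'content': item['content'],
        }
        # push each item onto a Redis list (the key name 'ac_items' is arbitrary)
        self.conn.lpush('ac_items', json.dumps(data))
        return item

Like the other pipelines, this class only runs after it is registered in ITEM_PIPELINES.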
III. Practice
1. Sending a POST request
import scrapy

class PostspiderdemoSpider(scrapy.Spider):
    name = 'postSpiderDemo'
    # allowed_domains = ['www.baidu.com']
    start_urls = ['https://fanyi.baidu.com/sug']

    def start_requests(self):
        for url in self.start_urls:
            # POST method 1:
            # scrapy.Request(url=url, callback=self.parse, method='post')
            # POST method 2 (recommended):
            data = {
                "kw": "dog"
            }
            yield scrapy.FormRequest(url=url, callback=self.parse, formdata=data)

    def parse(self, response):
        print(response.text)
2. Cookie handling
When the login POST request is sent, the cookie is saved automatically, and later GET requests carry it automatically; a sketch follows.
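A minimal sketch of that behavior (the URLs and form fields are placeholders, not a real site): the login POST is sent with FormRequest, and the follow-up GET issued from its callback goes through scrapy's default cookies middleware, so it carries the session cookie.

import scrapy

class LoginDemoSpider(scrapy.Spider):
    name = 'login_demo'
    start_urls = ['https://example.com/login']   # placeholder login URL

    def start_requests(self):
        for url in self.start_urls:
            # the login response sets the session cookie, which scrapy stores automatically
            yield scrapy.FormRequest(url=url,
                                     formdata={'user': 'xxx', 'password': 'xxx'},
                                     callback=self.after_login)

    def after_login(self, response):
        # this GET request automatically carries the cookie obtained above
        yield scrapy.Request(url='https://example.com/profile', callback=self.parse_profile)

    def parse_profile(self, response):
        print(response.text)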
3. Proxies
(1) Add a new class in middlewares.py, e.g.:
class MyProxy(object):
    def process_request(self, request, spider):
        request.meta['proxy'] = "http://198.1.122.29:80"
(2) In settings.py (around lines 56-58), uncomment the downloader-middleware setting, e.g.:
DOWNLOADER_MIDDLEWARES = {
    'postDemo.middlewares.MyProxy': 543,
}
4. Log level and logging to a file: edit settings.py and add the following two lines:
LOG_LEVEL = 'ERROR'
LOG_FILE = 'log.txt'
5. Passing data between requests (meta)
import scrapy
from postDemo.items import PostdemoItem   # items module path assumed from the project name used above

class RequestArgsSpider(scrapy.Spider):
    name = 'request_args'
    # allowed_domains = ['www.baidu.com']
    start_urls = ['http://www.hao6v.com/gvod/zx.html']
    i = 0

    def movie_details(self, response):
        item = response.meta['item']
        print(item['name'])
        print(item['i'])

    def parse(self, response):
        li_list = response.xpath('//*[@id="main"]/div[1]/div/ul/li')
        print(len(li_list))
        for li in li_list:
            name = li.xpath('./a//text()').extract_first()
            url = li.xpath('./a/@href').extract_first()
            item = PostdemoItem()
            self.i += 1
            item['name'] = name
            item['i'] = self.i
            # pass the item along with the request via meta
            yield scrapy.Request(url=url, callback=self.movie_details, meta={'item': item})
6. CrawlSpider
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

class TestcrawlSpider(CrawlSpider):
    name = 'testcrawl'
    # allowed_domains = ['https://www.baidu.com']
    start_urls = ['https://dig.chouti.com/']

    rules = (
        Rule(LinkExtractor(allow=r'/all/hot/recent/\d+'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        print(response)
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
import time, random
from mytestscrapy.items import MytestscrapyItem

class TestcrawlSpider(CrawlSpider):
    name = 'testcrawl'
    # allowed_domains = ['https://www.baidu.com']
    start_urls = ['https://cc.yyss.icu/thread0806.php?fid=2&search=&page=1']

    rules = (
        Rule(LinkExtractor(allow=r'thread0806.php\?fid=2&search=&page=[0-9]{1,2}$'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        print(response.url)
        time.sleep(random.randint(2, 4))
        if response.url == 'https://cc.yyss.icu/thread0806.php?fid=2&search=&page=1':
            tr_ele = response.xpath('//table[@id="ajaxtable"]/tbody[@style="table-layout:fixed;"]/tr[@class="tr3 t_one tac"]')[2:]
        else:
            tr_ele = response.xpath('//table[@id="ajaxtable"]/tbody[@style="table-layout:fixed;"]/tr[@class="tr3 t_one tac"]')
        for tr in tr_ele:
            count = tr.xpath('./td[4]/text()').extract_first()
            if int(count) < 5:
                continue
            text = tr.xpath('./td[2]//a/text()').extract_first()
            url = 'https://cc.yyss.icu/' + tr.xpath('./td[2]//a/@href').extract_first()
            item = MytestscrapyItem()
            item['urlname'] = text
            item['urladdr'] = url
            item['commentsNum'] = count
            yield item
7. Distributed crawling with RedisCrawlSpider
(1) Concept: the same spider program can run on multiple machines to crawl a site's data in a distributed way.
(2) Native scrapy cannot do distributed crawling:
a. the scheduler cannot be shared
b. the pipeline cannot be shared
(3) scrapy-redis component: a set of components built specifically for scrapy that lets scrapy crawl in a distributed way
a. Install: pip install scrapy-redis
(4) Distributed crawl workflow
a. Configure the redis config file:
comment out bind 127.0.0.1
change protected-mode yes to no
b. Start the redis server with that config file
c. Create the scrapy project, then create a CrawlSpider-based spider file
d. Import the RedisCrawlSpider class and change the spider to be based on that class
e. Replace start_urls with redis_key = 'xxx'
f. Point the project's pipeline and scheduler at the scrapy-redis components
g. Run the spider file: scrapy runspider xxx.py
h. Push the start url into the scheduler's queue: in redis-cli, lpush <queue name (redis_key)> <start url>
Steps:
a. Create the project:
1. scrapy startproject RedisCrawlSpider
2. scrapy genspider -t crawl redis_crawl_spider www.baidu.com
b. Install scrapy-redis
1. pip install scrapy-redis
c. Start the redis service
1. ./src/redis-server redis.conf
d. redis_crawl_spider.py
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy_redis.spiders import RedisCrawlSpider
from RedisCrawlSpider.items import RediscrawlspiderItem

# Inherit from RedisCrawlSpider
class RedisCrawlSpiderSpider(RedisCrawlSpider):
    name = 'redis_crawl_spider'
    # allowed_domains = ['www.baidu.com']
    # start_urls = ['http://www.baidu.com/']
    redis_key = 'qiubaispider'

    rules = (
        Rule(LinkExtractor(allow=r'/pic/page/\d+'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        div_list = response.xpath('//div[@id="content-left"]/div')
        for div in div_list:
            img_url = "https:" + div.xpath('.//div[@class = "thumb"]/a/img/@src').extract_first()
            item = RediscrawlspiderItem()
            item['img_url'] = img_url
            yield item
e. settings.py
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'

ROBOTSTXT_OBEY = False

ITEM_PIPELINES = {
    # 'RedisCrawlSpider.pipelines.RediscrawlspiderPipeline': 300,
    'scrapy_redis.pipelines.RedisPipeline': 400,
}

# Use the scrapy-redis dedupe component instead of scrapy's default dedupe
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Use the scrapy-redis scheduler instead of the default scheduler
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Allow pausing; the request records in redis are not lost
SCHEDULER_PERSIST = True
# Redis host IP
REDIS_HOST = "192.168.9.4"
# Redis port
REDIS_PORT = 6379
# Redis password, if one is set:
# REDIS_PARAMS = {"password": "123456"}
8. Distributed crawling with RedisSpider
a. Code changes (spider class):
Import: from scrapy_redis.spiders import RedisSpider
Change the spider's parent class to RedisSpider
Comment out the start_urls list and add a redis_key attribute (the name of the scheduler queue)
# -*- coding: utf-8 -*-
import scrapy
import time
from selenium import webdriver
from wangyipro.items import WangyiproItem
from scrapy_redis.spiders import RedisSpider

class WangyispiderSpider(RedisSpider):
    name = 'wangyispider'
    # allowed_domains = ['https://news.163.com/']
    # start_urls = ['https://news.163.com/']
    redis_key = "wangyi"

    def __init__(self):
        # Instantiate a browser object (only once)
        options = webdriver.ChromeOptions()
        options.add_argument('--ignore-certificate-errors')
        self.bro = webdriver.Chrome('./chromedriver.exe', chrome_options=options)

    def closed(self, spider):
        print("spider finished")
        self.bro.quit()

    def parse(self, response):
        menu_list = ["menu_guonei", "menu_guoji", "menu_war", "menu_hangkong"]
        li_list = []
        for menu in menu_list:
            li = response.xpath("//div[@class='ns_area list']/ul/li[@class=$val]", val=menu)
            li_list.append(li)
        for li in li_list:
            title = li.xpath('./a/text()').extract_first()
            url = li.xpath('./a/@href').extract_first()
            yield scrapy.Request(url=url, callback=self.secondPage, meta={"title": title})

    def secondPage(self, response):
        title = response.meta['title']
        div_list = response.xpath('//div[@class="data_row news_article clearfix "]')
        for div in div_list:
            head = div.xpath('.//h3/a/text()').extract_first()
            url = div.xpath('.//h3/a/@href').extract_first()
            imgUrl = div.xpath('./a[@class="na_pic"]/img/@src').extract_first()
            tag_list = div.xpath('.//div[@class="news_tag"]/div/a/text()').extract()
            for tag in tag_list:
                tag = tag.strip()
            tag = ",".join(tag_list)
            item = WangyiproItem()
            item['title'] = title
            item['head'] = head
            item['url'] = url
            item['imgUrl'] = imgUrl
            item['tag'] = tag
            yield scrapy.Request(url=url, callback=self.getContent, meta={"item": item})

    def getContent(self, response):
        item = response.meta['item']
        content_list = response.xpath('//div[@id="endText"]/p//text()').extract()
        for content in content_list:
            content.strip()
        content = "\n\t".join(content_list)
        item['content'] = content
        yield item
middlewares.py (UA pool, proxy pool, and selenium-based response interception):
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals
from scrapy.http import HtmlResponse
import time
from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware  # the old scrapy.contrib path is deprecated
import random

# UA pool
user_agent_list = [
    "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    # Safari 5.1 - Windows
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    # IE 9.0
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0;",
    # IE 8.0
    "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
    # IE 7.0
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
    # IE 6.0
    " Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
    # Firefox 4.0.1 - MAC
    " Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    # Firefox 4.0.1 - Windows
    "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    # Opera 11.11 - MAC
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
    # Opera 11.11 - Windows
    "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
    # Chrome 17.0 - MAC
    " Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    # Maxthon
    " Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
    # Tencent TT
    " Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
    # The World 2.x
    " Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
    # The World 3.x
    " Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)",
    # Sogou Browser 1.x
    " Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
    # 360 Browser
    " Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
    # Avant
    " Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)",
    # Green Browser
    " Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
    # Mobile devices:
    # Safari iOS 4.33 - iPhone
    "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    # Safari iOS 4.33 - iPod Touch
    "Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    # Safari iOS 4.33 - iPad
    "Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    # Android N1
    " Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    # Android QQ Browser for Android
    " MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    # Android Opera Mobile
    " Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10",
    # Android Pad Moto Xoom
    " Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
    # BlackBerry
    " Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+",
    # WebOS HP Touchpad
    " Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/233.70 Safari/534.6 TouchPad/1.0",
    # Nokia N97
    " Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124",
    # Windows Phone Mango
    " Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)",
    # UC
    " UCWEB7.0.2.37/28/999",
    # UC standard
    " NOKIA5700/ UCWEB7.0.2.37/28/999",
    # UC Openwave
    " Openwave/ UCWEB7.0.2.37/28/999",
    # UC Opera
    " Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999",
]

class RandomUserAgent(UserAgentMiddleware):
    def process_request(self, request, spider):
        ua = random.choice(user_agent_list)
        request.headers.setdefault('User-Agent', ua)

# Proxy pool
proxies_http = [
    "182.53.197.24:56383",
    "78.58.136.55:39232",
    "188.17.156.26:30773",
]
proxies_https = [
    "103.19.110.177:8080",
    "193.33.101.152:34611",
    "119.176.80.220:9999",
]

class Proxy(object):
    def process_request(self, request, spider):
        ph = request.url.split(":")[0]
        if ph == "http":
            proxy = random.choice(proxies_http)
            request.meta['proxy'] = 'http://' + proxy
        else:
            proxy = random.choice(proxies_https)
            request.meta['proxy'] = 'https://' + proxy

class WangyiproSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)

class WangyiproDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.
        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    # Intercept the response object (the response the downloader hands to the Spider)
    # request: the request object the response corresponds to
    # response: the intercepted response object
    # spider: the spider instance defined in the spider file
    def process_response(self, request, response, spider):
        if request.url in ["http://news.163.com/domestic/", "http://news.163.com/world/", "http://war.163.com/", "http://news.163.com/air/"]:
            spider.bro.get(url=request.url)
            # Scroll the browser to the bottom of the page
            # js = 'window.scrollTo(0,document.body.scrollHeight)'
            # Have the browser object execute the js code
            # spider.bro.execute_script(js)
            # time.sleep(3)
            page_text = spider.bro.page_source
            print(page_text)
            return HtmlResponse(url=spider.bro.current_url, body=page_text, encoding='utf-8', request=request)
        else:
            return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
b. Edit redis.conf and start the redis service with that config file
comment out bind 127.0.0.1
change protected-mode yes to no
c. Edit settings.py
ITEM_PIPELINES = {
    # 'RedisCrawlSpider.pipelines.RediscrawlspiderPipeline': 300,
    'scrapy_redis.pipelines.RedisPipeline': 400,
}

# Use the scrapy-redis dedupe component instead of scrapy's default dedupe
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Use the scrapy-redis scheduler instead of the default scheduler
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Allow pausing; the request records in redis are not lost
SCHEDULER_PERSIST = True
# Redis host IP
REDIS_HOST = "192.168.9.4"
# Redis port
REDIS_PORT = 6379
# Redis password
REDIS_PARAMS = {"password": "123456"}
d. Run the spider file: scrapy runspider xxx.py
e. Push the start url into the scheduler's queue: in redis-cli, lpush <queue name (redis_key)> <start url> (see the example below)
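For instance, with the redis_key = "wangyi" defined in the spider above and the commented-out NetEase news start page from that same example, the redis-cli command would look like:
lpush wangyi https://news.163.com/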
Source: https://www.cnblogs.com/hougang/p/spider.html