tip:
General approach: fetch proxy IPs and ports from the API (URL: http://ip.jiangxianli.com/api/proxy_ips) and store them in a list; pick one IP from the list with a random index and set a timeout and retry limit for the request; catch download exceptions, remove the dead proxy from the list, and switch to a new one.
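Before wiring this into a Scrapy middleware, the core loop can be tried on its own. The sketch below is only a sketch: it assumes the API returns JSON shaped like {'data': {'data': [{'ip': ..., 'port': ...}, ...]}} (taken from the parsing in the middleware code further down) and uses http://httpbin.org/ip purely as a throwaway target to test whether a proxy responds.

    import random
    import requests

    # Fetch the proxy pool; the response structure is an assumption based on
    # the parsing done in the middleware below.
    resp = requests.get('http://ip.jiangxianli.com/api/proxy_ips', timeout=5)
    ip_list = resp.json()['data']['data']

    while ip_list:
        idx = random.randrange(len(ip_list))   # random index into the pool
        proxy = ip_list[idx]
        proxy_url = 'http://%s:%s' % (proxy['ip'], proxy['port'])
        try:
            # Short timeout; if the proxy is dead, drop it and try another
            requests.get('http://httpbin.org/ip',
                         proxies={'http': proxy_url}, timeout=5)
            print('usable proxy:', proxy_url)
            break
        except requests.RequestException:
            ip_list.pop(idx)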
The downloader middleware also has to be enabled in settings.py:
DOWNLOADER_MIDDLEWARES = {
    'tianmao.middlewares.TestDownloaderMiddleware': 543,
}
code:
from scrapy import signals
import requests
import json, random


class TestDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    def __init__(self):
        # Fetch the proxy list from the API
        res = requests.get('http://ip.jiangxianli.com/api/proxy_ips')
        # Store the proxies in a list
        self.ip_list = json.loads(res.content)['data']['data']
        self.random_int = 0
        print('init method is running ...')

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        # print(request.meta)
        print('process request is running ...')
        # Attach a proxy to the request
        self.get_proxy(request)
        return None

    def process_response(self, request, response, spider):
        print('process_response is running ...')
        # Called with the response returned from the downloader.

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        print('exception is %s' % exception)
        if exception:
            # Drop the proxy that just failed from ip_list
            self.ip_list.pop(self.random_int)
            # Pick another proxy and retry the request
            request = self.get_proxy(request)
            return request

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)

    def get_proxy(self, request):
        num_ip = len(self.ip_list)
        print('There are %d proxy IPs left' % num_ip)
        # Pick a random index into the proxy list
        self.random_int = random.randint(0, num_ip - 1)
        print('Random index: %d' % self.random_int)
        ip_dic = self.ip_list[self.random_int]
        print('Selected proxy: %s' % ip_dic)
        ip = ip_dic.get('ip')
        port = ip_dic.get('port')
        ip_address = 'http://' + ip + ':' + port
        # Set the proxy for this request
        request.meta['proxy'] = ip_address
        # Set the maximum download time for this request
        request.meta['download_timeout'] = 5
        return request
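To check that the proxy is actually being applied, a throwaway spider can request a page that echoes the caller's IP. The spider name and test URL below are illustrative, not part of the original project.

    import scrapy

    class ProxyCheckSpider(scrapy.Spider):
        # If the middleware is working, the echoed address should be the
        # proxy's IP rather than your own.
        name = 'proxy_check'
        start_urls = ['http://httpbin.org/ip']

        def parse(self, response):
            self.logger.info('origin seen by the server: %s' % response.text)

Run it with scrapy crawl proxy_check inside the project; the console prints from process_request and get_proxy should also appear for each request.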