利用 Scrapy 框架爬取免费代理网站提供的代理 IP
# -*- coding: utf-8 -*-
import random
from time import sleep
import redis
import scrapy
from redis_con import redis_save
class IpPoolSpider(scrapy.Spider):
    """Crawl the xiladaili high-anonymity proxy listing page by page.

    The first table column of every page is passed to ``redis_save`` and the
    crawl then follows the last link of the pagination bar.
    """

    name = 'ip_pool'
    # allowed_domains takes bare domain names only; a URL path here is
    # rejected by Scrapy's offsite filtering.
    allowed_domains = ['www.xiladaili.com']
    start_urls = ['http://www.xiladaili.com/gaoni/']
    # Throttle in the downloader instead of calling time.sleep() inside the
    # callback, which would block the Twisted reactor. With randomization on,
    # Scrapy waits between 0.5x and 1.5x DOWNLOAD_DELAY before each request.
    custom_settings = {
        'DOWNLOAD_DELAY': 5,
        'RANDOMIZE_DOWNLOAD_DELAY': True,
    }

    def parse(self, response):
        """Store the addresses listed on this page and queue the next page.

        Yields:
            scrapy.Request: request for the next listing page, when the
            pagination bar provides one that differs from the current page.
        """
        ips = response.xpath(
            '//table[contains(@class,"fl-table")]/tbody/tr/td[1]/text()'
        ).getall()
        redis_save(ips)

        next_href = response.xpath(
            '//ul[contains(@class,"pagination")]/li[last()]/a/@href'
        ).get()
        if not next_href:
            # No pagination link on this page: nothing left to follow.
            return

        next_url = response.urljoin(next_href)
        if next_url == response.url:
            # dont_filter=True disables the duplicate filter, so a link that
            # points back at the current page would otherwise loop forever.
            return

        self.logger.info('Next page: %s', next_url)
        yield scrapy.Request(next_url, callback=self.parse, dont_filter=True)
把爬取到的代理 IP 存储到 Redis 缓存中,每次使用前先做可达性检查,不可达的则从缓存中删除
from random import choice
import redis
from ip_check import check
# Shared Redis connection used by every helper in this module.
redis_client = redis.Redis(
    host='127.0.0.1',
    password='123',
)
def redis_show():
    """Append every entry of the Redis ``ips`` list to ``ips.txt``, one per line."""
    with open('ips.txt', 'a+', encoding='utf-8') as f:
        for ip in redis_client.lrange('ips', 0, -1):
            # redis-py hands back bytes unless the client was created with
            # decode_responses=True; bytes + str would raise TypeError.
            if isinstance(ip, bytes):
                ip = ip.decode('utf-8')
            f.write(ip + '\n')
    print('数据展示保存在文本完成')
def redis_save(ips):
    """Push every address in ``ips`` onto the head of the Redis ``ips`` list.

    Args:
        ips: iterable of proxy addresses to store.
    """
    values = list(ips)
    if not values:
        # LPUSH requires at least one value.
        return
    # A single variadic LPUSH inserts the values left to right, giving the
    # same final order as pushing them one by one, in one round trip.
    redis_client.lpush('ips', *values)
def redis_check():
    """Placeholder: walks the Redis ``ips`` list without acting on any entry."""
    stored = redis_client.lrange('ips', 0, -1)
    for _entry in stored:
        continue
def redis_delete():
    """Remove the whole ``ips`` list from Redis."""
    # LTRIM ips -1 0 only empties the list when it has more than one entry:
    # with a single element both indexes resolve to 0 and it survives.
    # Deleting the key clears the list regardless of its length.
    redis_client.delete('ips')
def redis_out():
    """Return a reachable proxy URL picked at random from the Redis pool.

    Candidates are drawn from the last 199 entries of the ``ips`` list.
    Every candidate that fails ``check`` is removed from Redis before the
    next attempt.

    Returns:
        The truthy value returned by ``check`` for the first reachable
        candidate, or ``None`` once the list has been emptied.
    """
    sample_window = 199  # same span as the former ips[-1:-200:-1] slice
    while True:
        # Re-read the tail on every attempt so removed entries are never
        # drawn again and the loop ends when the pool runs dry.
        candidates = redis_client.lrange('ips', -sample_window, -1)
        if not candidates:
            return None
        ip = choice(candidates)
        result = check(ip)
        print(result)
        if result:
            return result
        # count=0 removes every occurrence of the dead address.
        redis_client.lrem('ips', 0, ip)
if __name__ == '__main__':
    # Debug dump: print a 1-based counter followed by each stored entry.
    for position, entry in enumerate(redis_client.lrange('ips', 0, -1), start=1):
        print(position)
        print(entry)
    # Call redis_delete() here to wipe the list after inspecting it.
判断代理 IP 是否可达
import requests
def check(ip):
    """Probe a proxy by fetching http://www.baidu.com through it.

    Args:
        ip: proxy address without scheme, as ``bytes`` (the form redis-py
            returns by default) or ``str``.

    Returns:
        The proxy URL (``"http://" + address``) when the request completes
        within 5 seconds, otherwise ``False``.
    """
    # Decode bytes properly instead of slicing the repr, which would chop
    # characters off an address that is already a str.
    if isinstance(ip, bytes):
        ip = ip.decode('utf-8')
    ip = "http://" + str(ip)
    print(ip)
    try:
        requests.get('http://www.baidu.com', proxies={"http": ip}, timeout=5)
    except requests.RequestException:
        # Only request failures count as "unreachable"; KeyboardInterrupt
        # and SystemExit are no longer swallowed.
        print('失败')
        return False
    print('ok')
    return ip
在下载中间件中设置代理 IP
class IPTest(MyFirstSpiderDownloaderMiddleware):
    """Downloader middleware that attaches a proxy to each outgoing request."""

    def process_request(self, request, spider):
        """Store the value returned by ``redis_out`` in ``request.meta['proxy']``."""
        # Dynamic IP: a proxy is looked up again for every request.
        proxy_url = redis_out()
        request.meta['proxy'] = proxy_url
来源:CSDN
作者:扣剑书生
链接:https://blog.csdn.net/weixin_44038167/article/details/103826990