ip代理池(练手)

一世执手 提交于 2020-01-03 22:49:41

利用 Scrapy 框架爬取免费代理网站所提供的代理 IP

# -*- coding: utf-8 -*-
import random
from time import sleep

import redis
import scrapy

from redis_con import redis_save


class IpPoolSpider(scrapy.Spider):
    """Crawl the free-proxy listing pages on xiladaili.com and store the
    scraped "ip:port" strings in Redis via redis_save()."""
    name = 'ip_pool'
    # allowed_domains must contain bare domain names only; including a
    # path ('www.xiladaili.com/gaoni/') makes Scrapy's OffsiteMiddleware
    # reject every follow-up request.
    allowed_domains = ['www.xiladaili.com']
    start_urls = ['http://www.xiladaili.com/gaoni/']

    def parse(self, response):
        # The first <td> of each table row holds the "ip:port" text.
        ips = response.xpath(
            '//table[contains(@class,"fl-table")]/tbody/tr/td[1]/text()').getall()
        if ips:
            redis_save(ips)

        # "Next page" is the last item of the pagination bar; it can be
        # absent on the final page, so guard against a None href.
        next_href = response.xpath(
            '//ul[contains(@class,"pagination")]/li[last()]/a/@href').get()
        if not next_href:
            return
        next_url = "http://www.xiladaili.com" + next_href
        print(next_url)
        # NOTE(review): time.sleep blocks Scrapy's event loop; prefer the
        # DOWNLOAD_DELAY / RANDOMIZE_DOWNLOAD_DELAY settings instead.
        sleep(random.randint(1, 10))
        yield scrapy.Request(next_url, callback=self.parse, dont_filter=True)

把爬取到的代理 IP 存储到 Redis 缓存,并在每次使用之前都进行可达性判断;若不可达,则将其从缓存中删除

from random import choice

import redis

from ip_check import check

# Shared Redis connection used by the whole proxy pool (default DB on localhost).
# NOTE(review): hard-coded password — move to config/env before real use.
redis_client = redis.Redis(host='127.0.0.1',password='123')



def redis_show():
    """Append every proxy stored in the Redis 'ips' list to ips.txt."""
    with open('ips.txt', 'a+') as f:
        for ip in redis_client.lrange('ips', 0, -1):
            # lrange returns bytes (decode_responses is not enabled on the
            # client); the original `ip + '\n'` raised TypeError (bytes + str).
            f.write(ip.decode('utf-8') + '\n')

    print('数据展示保存在文本完成')



def redis_save(ips):
    """Push each scraped proxy onto the head of the Redis list 'ips'."""
    for proxy in ips:
        redis_client.lpush('ips', proxy)




def redis_check():
    # Placeholder: presumably intended to re-validate every cached proxy,
    # but the body is an empty loop — it fetches the whole 'ips' list and
    # does nothing with it.  TODO: implement or remove.
    for ip in redis_client.lrange('ips',0,-1):
        pass


def redis_delete():
    """Empty the Redis 'ips' list.

    LTRIM keeps only the given index range; a range whose start is past
    its end clears the list.  The original call used (-1, 0), but for a
    one-element list index -1 equals index 0, so that element survived.
    (1, 0) is an empty range for every list length.
    """
    redis_client.ltrim('ips', 1, 0)


def redis_out():
    """Return one reachable proxy URL from the pool, or None if none work.

    Draws random candidates from (up to) the 199 most recently stored
    entries, verifies each with check(), and removes unreachable proxies
    from both the local candidate list and the Redis 'ips' list.

    Fixes two defects of the original: choice() raised IndexError on an
    empty pool, and dead proxies were removed from Redis but not from the
    local list, so the loop could retry the same dead proxy forever.
    """
    # Same slice as the original: the last 199 list entries, reversed.
    candidates = list(redis_client.lrange('ips', 0, -1))[-1:-200:-1]
    while candidates:
        ip = choice(candidates)
        result = check(ip)
        print(result)
        if result:
            return result
        # Dead proxy: drop it locally and from the cache so it is never retried.
        candidates.remove(ip)
        redis_client.lrem('ips', 0, ip)
    return None
if __name__ == '__main__':
    # Quick inspection helper: print every cached proxy with a 1-based counter.
    for index, proxy in enumerate(redis_client.lrange('ips', 0, -1), start=1):
        print(index)
        print(proxy)


判断ip是否可达

import requests


def check(ip):
    """Probe a proxy and return its URL ("http://host:port") if reachable.

    Parameters
    ----------
    ip : bytes or str
        A "host:port" entry as read from Redis (bytes, because the client
        does not set decode_responses).

    Returns
    -------
    str or False
        The usable proxy URL on success, False when the request fails.
    """
    # Decode bytes properly instead of the original repr-slicing hack
    # str(ip)[2:-1], which corrupts the value whenever a str is passed in.
    if isinstance(ip, bytes):
        ip = ip.decode('utf-8')
    proxy = "http://" + ip
    print(proxy)

    try:
        # Any completed response (even non-200) counts as reachable,
        # matching the original behavior.
        requests.get('http://www.baidu.com', proxies={"http": proxy}, timeout=5)
    except requests.RequestException:
        # Narrowed from a bare except, which also swallowed
        # KeyboardInterrupt/SystemExit.
        print('失败')
        return False
    else:
        print('ok')
        return proxy

在下载中间件中设置代理 IP

class IPTest(MyFirstSpiderDownloaderMiddleware):
    """Downloader middleware that attaches a proxy from the Redis pool."""

    def process_request(self, request, spider):
        # Fetch a verified proxy for this outgoing request.
        proxy = redis_out()
        request.meta['proxy'] = proxy
标签
易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!