Web crawler: scraping proxies for an IP pool

Submitted by 烈酒焚心 on 2020-01-01 17:40:50
The IP addresses are taken from the domestic high-anonymity proxy site http://www.xicidaili.com/nn/ (the code below actually requests the /nt/ listing pages).
from bs4 import BeautifulSoup
import requests
from urllib import request
import re

ip_available = []  # proxies that passed the connectivity check


def get_headers():
    headers = {
        "User-Agent": "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)",
        "Cookie": '''s_v_web_id=doesntmatter'''}
    return headers


def re_search(html):
    # Regex match: capture the IP from one <td> and the port from the next
    pattern = re.compile(
        r'<td>([0-9]+\.[0-9]+\.[0-9]+\.[0-9]+)</td>\n<td>([0-9]{1,5})</td>', re.S)
    return re.findall(pattern, html)


def page_operation(source):
    # Extract the IPs and ports: collect both the unstyled and the "odd" rows
    soup = BeautifulSoup(source, "html.parser")
    html_str = soup.find_all('tr', attrs={'class': ''}) + soup.find_all('tr', attrs={'class': 'odd'})
    return re_search(str(html_str))


def get_one_page(url):
    response = requests.get(url, headers=get_headers())
    return page_operation(response.content)


def get_opener(ip, port):
    ip = str(ip) + ':' + str(port)
    # Route both http and https traffic through the candidate proxy
    proxy = {'http': 'http://' + ip, 'https': 'https://' + ip}
    # Create the ProxyHandler
    proxy_support = request.ProxyHandler(proxy)
    # Create an opener that uses it
    opener = request.build_opener(proxy_support)
    return opener


def check_ip(opener=None):
    try:
        if opener is not None:
            request.install_opener(opener)
        # Request Baidu through the proxy; a 200 within 5 s means it works
        response = request.urlopen("https://www.baidu.com", timeout=5.0)
        if response.code == 200:
            print("Current IP is usable")
        return response.code
    except Exception:
        print("Current IP is unusable")
        return -1


def check_iplist(ip_list):
    # Iterate over a copy: removing entries from ip_list while indexing
    # into it would skip items and eventually raise an IndexError
    for entry in list(ip_list):
        ip, port = entry
        print("Checking IP:", ip + ":" + port)
        opener = get_opener(ip, port)
        if check_ip(opener) == -1:
            ip_list.remove(entry)
        else:
            ip_available.append(entry)
    return ip_list

def get_ip_list_and_check_iplist(page_num=1):
    url = 'https://www.xicidaili.com/nt/' + str(page_num)
    check_iplist(get_one_page(url))

if __name__ == '__main__':
    get_ip_list_and_check_iplist()
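
As written, the entry point only checks the first page of the /nt/ listing. To build a larger pool, it can be looped over several pages; a minimal sketch of a replacement main block (the page range is arbitrary):

if __name__ == '__main__':
    # Scan the first three listing pages; widen the range for a bigger pool
    for page in range(1, 4):
        get_ip_list_and_check_iplist(page)
    print("usable proxies:", ip_available)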

I select the nodes with BeautifulSoup and then extract the IP and port with a regular expression.
[figure: the <tr>/<td> node structure of the proxy table]
The node structure is basically as shown above: each row holds the IP in one <td> and the port in the <td> immediately after it, which is exactly what the regex matches. The regex step can also be dropped entirely; see the sketch below.
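
A minimal sketch of the same extraction done purely with BeautifulSoup, assuming only that the IP and port sit in adjacent <td> cells (extract_proxies and IP_RE are my own names, not part of the original script):

import re
from bs4 import BeautifulSoup

IP_RE = re.compile(r'^\d{1,3}(?:\.\d{1,3}){3}$')

def extract_proxies(source):
    # Walk every table row; take the first cell that looks like an IPv4
    # address and treat the cell right after it as the port
    soup = BeautifulSoup(source, "html.parser")
    proxies = []
    for row in soup.find_all('tr'):
        cells = [td.get_text(strip=True) for td in row.find_all('td')]
        for i, text in enumerate(cells[:-1]):
            if IP_RE.match(text) and cells[i + 1].isdigit():
                proxies.append((text, cells[i + 1]))
                break
    return proxies

This avoids depending on the exact whitespace that str(html_str) happens to produce, which is what the \n in the original pattern relies on.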
After scraping, each candidate is set as a proxy and used to connect to Baidu; if the connection succeeds, the proxy is judged valid. Most of the scraped IPs, however, turn out not to work.
[figure: check-run output; most proxies are reported unusable]
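
Once the pool is built, the verified proxies are easier to consume with requests than with urllib's opener machinery. A minimal sketch, assuming ip_available holds (ip, port) tuples as produced above and reusing get_headers() from the script (fetch_via_proxy is a hypothetical helper, not part of the original):

import random
import requests

def fetch_via_proxy(url, ip_pool):
    # Pick a random verified proxy and route the request through it.
    # Note the plain http:// scheme even for the 'https' key: the proxy
    # itself is spoken to over HTTP.
    ip, port = random.choice(ip_pool)
    proxies = {'http': 'http://%s:%s' % (ip, port),
               'https': 'http://%s:%s' % (ip, port)}
    return requests.get(url, headers=get_headers(), proxies=proxies, timeout=5)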

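One last note: checking the proxies one by one is slow, since every dead proxy burns up to the full five-second timeout. A parallel check is easy to add; below is a minimal sketch using requests and a thread pool. This is my own substitution rather than a threaded version of check_ip, because install_opener sets a process-wide opener and is not safe to call from multiple threads:

from concurrent.futures import ThreadPoolExecutor
import requests

def check_one(entry):
    # Probe Baidu through one candidate; per-request proxies keep this
    # thread-safe, unlike the process-global install_opener
    ip, port = entry
    proxies = {'http': 'http://%s:%s' % (ip, port),
               'https': 'http://%s:%s' % (ip, port)}
    try:
        r = requests.get("https://www.baidu.com", proxies=proxies, timeout=5)
        return entry if r.status_code == 200 else None
    except requests.RequestException:
        return None

def check_iplist_parallel(ip_list, workers=20):
    # Map the check across a pool of worker threads and keep the survivors
    with ThreadPoolExecutor(max_workers=workers) as pool:
        return [p for p in pool.map(check_one, ip_list) if p is not None]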