IP地址取自国内高匿代理IP网站:http://www.xicidaili.com/nn/
from bs4 import BeautifulSoup
import requests
from urllib import request
import re
from threading import Thread
# Module-level accumulator: check_iplist() appends every (ip, port) entry
# whose probe succeeded. The spelling of the name is kept as-is because
# other code in this file refers to it.
ip_availavle = []
def get_headers():
    """Build the HTTP headers sent when downloading the proxy-list page.

    Returns:
        dict: A new dictionary on every call, holding a fixed ``User-Agent``
        string and a placeholder ``Cookie`` value.
    """
    return {
        "User-Agent": "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)",
        "Cookie": "s_v_web_id=doesntmatter",
    }
# Two adjacent table cells: a dotted-quad IPv4 address, then a numeric port.
# The dots are escaped (an unescaped "." would match any character), every
# octet needs at least one digit, and ports of 1 to 5 digits are accepted.
_IP_PORT_PATTERN = re.compile(
    r'<td>([0-9]{1,3}(?:\.[0-9]{1,3}){3})</td>\s*<td>([0-9]{1,5})</td>'
)


def re_search(str):
    """Find every ``(ip, port)`` pair in a chunk of HTML table markup.

    Args:
        str: HTML text containing ``<td>ip</td>`` immediately followed
            (whitespace allowed in between) by ``<td>port</td>``. The
            parameter name shadows the builtin but is kept so existing
            keyword callers keep working.

    Returns:
        list: ``(ip, port)`` tuples of strings, in order of appearance;
        empty when nothing matches.
    """
    return _IP_PORT_PATTERN.findall(str)
def page_operation(source):
    """Extract ``(ip, port)`` pairs from the HTML of a proxy-list page.

    Args:
        source: Page markup in any form ``BeautifulSoup`` accepts.

    Returns:
        list: The result of ``re_search`` applied to the selected table rows.
    """
    soup = BeautifulSoup(source, "html.parser")
    # Rows selected with an empty ``class`` filter come first, then rows with
    # class "odd" -- presumably the two row styles the listing alternates
    # between (verify against the live page).
    plain_rows = soup.find_all('tr', attrs={'class': ''})
    odd_rows = soup.find_all('tr', attrs={'class': 'odd'})
    # The combined rows are stringified in one go so the regex can scan them
    # all in a single pass.
    return re_search(str(plain_rows + odd_rows))
def get_one_page(url, timeout=10.0):
    """Download one listing page and return the ``(ip, port)`` pairs on it.

    Args:
        url: Address of the proxy-list page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on a stalled
            connection.

    Returns:
        list: The pairs extracted by ``page_operation`` from the response body.
    """
    response = requests.get(url, headers=get_headers(), timeout=timeout)
    return page_operation(response.content)
def get_opener(ip, port):
    """Create a urllib opener that routes HTTP and HTTPS traffic via a proxy.

    Args:
        ip: Proxy host; converted to text when building the address.
        port: Proxy port; converted to text when building the address.

    Returns:
        An opener built by ``request.build_opener`` around a single
        ``ProxyHandler`` configured for both the ``http`` and ``https``
        schemes.
    """
    # A separate local name is used so the ``ip`` argument is not rebound.
    address = '{}:{}'.format(ip, port)
    proxies = {'http': 'http://' + address, 'https': 'https://' + address}
    return request.build_opener(request.ProxyHandler(proxies))
def check_ip(opener='', url="https://www.baidu.com", timeout=5.0):
    """Probe a URL, optionally through a proxy opener, and report success.

    Args:
        opener: Opener to send the request through. The empty string (the
            default, kept for backward compatibility) or ``None`` means a
            plain ``request.urlopen`` call.
        url: Address to request. The default points at Baidu; the earlier
            hard-coded host was misspelled.
        timeout: Seconds to wait before treating the probe as failed.

    Returns:
        int: ``200`` when the request succeeded with that status, otherwise
        ``-1`` (any error, or any other status code).
    """
    # The opener is used directly instead of being installed globally, so a
    # dead proxy never leaks into later ``urlopen`` calls.
    if opener is None or opener == '':
        open_url = request.urlopen
    else:
        open_url = opener.open

    try:
        # The context manager closes the connection once the status is read.
        with open_url(url, timeout=timeout) as response:
            status = response.code
    except Exception:
        # Deliberately broad: an unusable proxy can fail in many different
        # ways, and every one of them simply means "not usable".
        print("当前ip不可用")
        return -1

    if status != 200:
        # Previously this path fell through and returned None, which callers
        # comparing against -1 mistook for success.
        print("当前ip不可用")
        return -1

    print("当前ip可用")
    return status
def check_iplist(ip_list):
    """Probe every proxy in ``ip_list`` and keep only the ones that respond.

    Entries that pass are also appended to the module-level ``ip_availavle``
    list. ``ip_list`` is updated in place so that it ends up holding only the
    working entries.

    Args:
        ip_list: Mutable list of entries whose first two items are the proxy
            ip and port.

    Returns:
        list: The same ``ip_list`` object, now containing only the entries
        whose probe did not return ``-1``.
    """
    working = []
    # Iterate over a snapshot: removing items from the list while indexing
    # over its original length skipped entries and ran past the end.
    for entry in list(ip_list):
        ip, port = entry[0], entry[1]
        print("检查ip:", '{}:{}'.format(ip, port))
        if check_ip(get_opener(ip, port)) == -1:
            continue
        working.append(entry)
        ip_availavle.append(entry)
    # Slice assignment keeps the caller's list object while dropping the
    # entries that failed.
    ip_list[:] = working
    return ip_list
def get_ip_list_and_check_iplist(page_num=1):
    """Scrape one page of the proxy listing and probe every proxy on it.

    Args:
        page_num: Page number appended to the listing URL.

    Returns:
        The value returned by ``check_iplist`` for the scraped entries
        (previously this result was discarded).
    """
    url = 'https://www.xicidaili.com/nt/' + str(page_num)
    return check_iplist(get_one_page(url))
if __name__ == '__main__':
    # Script entry point: scrape the first listing page and probe its proxies.
    get_ip_list_and_check_iplist()
我是通过BeautifulSoup选取结点,然后再通过正则表达式来提取ip和端口,
它的结点结构基本是这样子的。爬取后通过设置proxy,连接百度看是否能成功,来判断这个代理是否有效,不过爬取的大部分ip都是不可用的。
来源:CSDN
作者:梦想闹钟
链接:https://blog.csdn.net/qq_43199509/article/details/103793204