Normally, once a crawler exceeds a certain request frequency or volume, its public IP gets banned. To crawl large amounts of data reliably, the usual move is to buy batches of proxy IPs on Taobao, typically around 10 yuan for 100k IPs per day. A large share of those IPs turn out to be invalid, though, so you still have to retry and validate them yourself; in fact, the sellers mostly scraped these IPs from free proxy-listing sites in the first place.
That being the case, why not crawl them ourselves? The basic idea is simple:
(1) Find a dedicated proxy-IP site and parse the IPs out of its pages.
(2) Validate each IP.
(3) Store the valid IPs, or wrap them in a service (a sketch of this step follows the demo).
A demo:
import logging
import re
import socket

import requests
from bs4 import BeautifulSoup

logging.basicConfig(level=logging.DEBUG)


def proxy_spider(page_num):
    """Scrape proxy candidates from xicidaili and WAN-check each one."""
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0'}
    for i in range(page_num):
        url = 'http://www.xicidaili.com/wt/' + str(i + 1)
        r = requests.get(url=url, headers=headers)
        soup = BeautifulSoup(r.text, "html.parser")
        # Walk every data row, whether its class is "odd" or the even
        # (empty) one; the first <tr> is the table header, so skip it.
        for row in soup.find_all('tr')[1:]:
            cells = row.find_all('td')
            if len(cells) < 6:
                continue
            ip = cells[1].get_text(strip=True)
            port = cells[2].get_text(strip=True)
            protocol = cells[5].get_text(strip=True)
            wan_proxy_check(ip, port, protocol)


def local_proxy_check(ip, port, protocol):
    """Cheap first-pass check: can we open a TCP connection to the proxy?"""
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        s.settimeout(1)
        s.connect((ip, int(port)))
        s.shutdown(socket.SHUT_RDWR)
        logging.debug("{} {}".format(ip, port))
        return True
    except (socket.error, ValueError):
        logging.debug("-------- {} {}".format(ip, port))
        return False
    finally:
        s.close()


def wan_proxy_check(ip, port, protocol):
    """Strict check: fetch an IP-echo page through the proxy and make sure
    the WAN IP it reports is the proxy's own address.

    Ways to query your WAN IP on Linux:
    https://my.oschina.net/epstar/blog/513186
    IP-lookup APIs offered by the big portals (Sina, Sohu, Alibaba):
    http://zhaoshijie.iteye.com/blog/2205033
    """
    scheme = protocol.lower()
    proxy = {scheme: '%s://%s:%s' % (scheme, ip, port)}
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0'}
    try:
        result = requests.get("http://pv.sohu.com/cityjson", headers=headers,
                              proxies=proxy, timeout=1).text.strip("\n")
        wan_ip = re.findall(r"\b(?:[0-9]{1,3}\.){3}[0-9]{1,3}\b", result)[0]
        if wan_ip == ip:
            logging.info("{} {} {}".format(protocol, wan_ip, port))
            logging.debug("========================")
        else:
            # The request went out, but through some other exit IP: a
            # transparent or chained proxy, not one worth keeping.
            logging.debug("//// Proxy bad: {} {}".format(wan_ip, port))
    except Exception as e:
        logging.debug("#### Exception: {}".format(str(e)))


if __name__ == '__main__':
    proxy_spider(1)
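The demo stops at step (2). Below is a minimal sketch of step (3): keep the proxies that pass wan_proxy_check in an in-memory pool and hand them out over a tiny HTTP endpoint. Everything here (VALID_PROXIES, the /get route, the port) is illustrative, not part of the demo above, and it assumes wan_proxy_check is modified to append each good proxy to the shared list.

import json
import random
from http.server import BaseHTTPRequestHandler, HTTPServer

# Filled by the checker, e.g. VALID_PROXIES.append('http://1.2.3.4:8080')
# whenever wan_proxy_check succeeds. (Hypothetical hook, see note above.)
VALID_PROXIES = []


class ProxyHandler(BaseHTTPRequestHandler):
    def do_GET(self):
        # GET /get returns one random proxy from the pool as JSON.
        if self.path == '/get' and VALID_PROXIES:
            status, body = 200, json.dumps({'proxy': random.choice(VALID_PROXIES)})
        else:
            status, body = 404, json.dumps({'error': 'no proxy available'})
        self.send_response(status)
        self.send_header('Content-Type', 'application/json')
        self.end_headers()
        self.wfile.write(body.encode('utf-8'))


if __name__ == '__main__':
    HTTPServer(('127.0.0.1', 8000), ProxyHandler).serve_forever()

A crawler can then fetch a proxy with requests.get('http://127.0.0.1:8000/get').json()['proxy'] and pass it to requests via the proxies= argument; when that proxy starts failing, it simply asks the pool for another one.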
Refer:
[1] Python crawler proxy IP pool (proxy_pool)
https://github.com/jhao104/proxy_pool
[2] Python crawler proxy IP pool
http://www.spiderpy.cn/blog/detail/13
[3] python ip proxy tool scrapy crawl. Crawls large numbers of free proxy IPs and extracts the valid ones for use
https://github.com/awolfly9/IPProxyTool