56 - Web Scraping: Using IP Proxies

Submitted by 拜拜、爱过 on 2020-02-08 19:00:17

To reduce the risk of the server blocking your IP because of frequent requests, you can route your requests through an IP proxy.

Usage is simple: just pass a proxies argument to the requests call:

page_text = requests.get(url=url, headers=headers, proxies={'http': '60.167.132.19:9999'} )
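
The keys of the proxies dictionary are URL schemes: requests only routes a request through the proxy when the scheme of the URL matches a key in the dictionary, so it is common to list the same proxy under both 'http' and 'https'. Below is a minimal self-contained sketch; the proxy address is just a placeholder taken from the free lists mentioned below and may already be dead, and httpbin.org/ip is used only because it echoes back the IP the request arrived from, which makes it easy to confirm the proxy was applied.

import requests

url = 'http://httpbin.org/ip'   # echoes the requesting IP, handy for checking the proxy
headers = {'User-Agent': 'Mozilla/5.0'}
proxies = {
    'http':  'http://60.167.132.19:9999',   # placeholder free proxy, likely expired
    'https': 'http://60.167.132.19:9999',   # same proxy reused for https URLs
}
page_text = requests.get(url=url, headers=headers, proxies=proxies, timeout=5)
print(page_text.text)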

Free IP proxies are available from sites such as:

1. Kuaidaili: https://www.kuaidaili.com/free/

2. Xicidaili: https://www.xicidaili.com/nn/

There are other similar sites online as well.

Below is a script for scraping free proxies; it saves the IPs from Kuaidaili into a data folder located alongside the script.

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2020/2/5 10:42
# @Author  : ystraw
# @Site    : 
# @File    : getIpPool.py
# @Software: PyCharm Community Edition
# @function: fetch usable proxy IPs from Kuaidaili
# https://www.kuaidaili.com/free/inha/2/

import os
import requests
import time
import random
from bs4 import BeautifulSoup

# Write text to a file (create the data directory first if it does not exist):
def writeFile(filename, file):
    os.makedirs(os.path.dirname(filename) or '.', exist_ok=True)
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(file)
    print(filename, 'written!')

# Read a file and return its contents:
def readFile(filename):
    with open(filename, 'r', encoding='utf-8') as f:
        content = f.read()
    print(filename, 'read!')
    return content

# Fetch the IP pool and save it to a file
def download_IP():
    ua_list = [
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36",
    ]
    agent = random.choice(ua_list)
    headers = {
        "Connection": "keep-alive",
        "Accept": "application/json, text/javascript, */*; q=0.01",
        "Referer": "https://www.kuaidaili.com/free/",
        "User-Agent": agent,
    }
    # Optional proxies for the scraping requests themselves (currently not passed
    # to requests.get below; note a dict cannot hold two entries with the same key):
    proxies = {'http': '223.199.27.122:9999'}
    # sess = requests.session()
    # Accumulated rows of scraped data:
    IPpool = ''
    # Page range to scrape:
    for i in range(1, 5):
        try:
            time.sleep(random.randint(1, 2))
            url = 'https://www.kuaidaili.com/free/inha/'+str(i)+'/'
            print('Requesting:', url)
            response = requests.get(url, headers=headers).text
            # print(response)
            # Extract the information:
            bs = BeautifulSoup(response, 'lxml')
            # Grab the table body that holds the IP rows:
            tbody = bs.findAll('tbody')[0]
            trList = tbody.findAll('tr')
            for tr in trList:
                # print(tr)
                tdList = tr.findAll('td')
                for td in tdList:
                    # print(td.string, end=' ')
                    IPpool += (td.string or '') + ','
                IPpool += '\n'
        except Exception as ex:
            print('Failed to scrape this page of IPs:', ex)
        # print(IPpool)
    # Only write the file when a reasonable amount was scraped
    # (3328 characters corresponds to roughly 50 table rows):
    if len(IPpool) > 3328:
        writeFile('./data/IPpool.txt', IPpool)
    else:
        print('Fewer than about 50 IPs obtained, nothing written!')

# Read the saved IP file and turn it into a list of (scheme, address) pairs
def getIP():
    ipstring = readFile('./data/IPpool.txt')
    ipList = ipstring.split('\n')
    proxies = []  # e.g. ('http', '223.199.27.122:9999')
    for ip in ipList:
        if not ip:
            continue
        ip = ip.split(',')
        try:
            # Columns are: IP, port, anonymity, type (HTTP/HTTPS), ...
            # Lower-case the type so it matches requests' proxy scheme lookup.
            proxies.append((ip[3].lower(), ip[0] + ':' + ip[1]))
        except Exception as ex:
            print('Failed to build an IP pool entry!', ex)
    return proxies

# Return the IP pool
def getproxies():
    # Scrape fresh IPs from the web:
    # download_IP()
    # Read the previously saved IPs:
    proxies = getIP()
    # [('http', '60.167.132.19:9999'), ('http', '60.167.132.19:9999')]
    return proxies

if __name__ == '__main__':
    # Scrape fresh IPs from the web
    download_IP()
    # Read the saved IPs
    proxies = getIP()
    # Quick test: many free proxies are dead, so set a timeout and catch errors.
    # Note the proxy only takes effect when the URL scheme matches the dict key.
    url = 'https://www.baidu.com'
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3730.400 QQBrowser/10.5.3805.400'}
    for i in range(len(proxies)):
        ip = proxies[i]
        try:
            page_text = requests.get(url=url, headers=headers, proxies={ip[0]: ip[1]}, timeout=5)
            print(i, page_text)
        except Exception as ex:
            print(i, 'proxy failed:', ex)
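
In another scraper, the pool can then be reused by importing getproxies() and picking a random entry per request. Below is a minimal usage sketch, assuming getIpPool.py and the data/IPpool.txt file it produced sit next to the calling script; the target URL and User-Agent are just examples, and httpbin.org/ip is chosen only because it echoes the origin IP so you can confirm the proxy was actually used.

import random
import requests
from getIpPool import getproxies

pool = getproxies()                      # e.g. [('http', '60.167.132.19:9999'), ...]
_, addr = random.choice(pool)            # each entry is (protocol, 'ip:port'); rotate per request
proxies = {'http': addr, 'https': addr}  # apply the proxy to both URL schemes
try:
    resp = requests.get('http://httpbin.org/ip',
                        headers={'User-Agent': 'Mozilla/5.0'},
                        proxies=proxies,
                        timeout=5)
    print(resp.text)                     # should show the proxy's IP, not yours
except Exception as ex:
    print('proxy', addr, 'failed:', ex)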

 
