我们都知道在爬取网页信息时很容易遭到网站的封禁,这时我们就要用到代理IP
以下是模拟浏览器访问网页的代码
在自定义函数getHtml()中可以得到解析后的网页信息
# Browser-like request headers sent with every page fetch.
# NOTE(review): the hard-coded 'If-None-Match' makes each request conditional;
# the server may answer 304 with an empty body - confirm this header is wanted.
_headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Host': 'www.xicidaili.com',
    'If-None-Match': 'W/"b077743016dc54409ebe6b86ba7a869b"',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.75 Safari/537.36',
}
_cookies = None


def getHtml(page):
    """Download one listing page and return it parsed by lxml.

    Args:
        page: Page number; it is appended to the listing URL.

    Returns:
        The value returned by ``etree.HTML`` for the response text.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status.
    """
    url = "https://www.xicidaili.com/nn/" + str(page)  # page is the page number
    # A timeout keeps a stalled connection from hanging the scraper forever.
    response = requests.get(url, headers=_headers, timeout=10)
    # Fail loudly on 4xx/5xx instead of parsing an error/ban page as data.
    response.raise_for_status()
    html = etree.HTML(response.text)
    return html
然后开始爬取数据
使用xpath得到想要的数据
# Each query selects one column of the table rows whose class is "odd" or empty.
ips = html.xpath('(//tr[@class="odd"]|//tr[@class=""])/td[2]/text()')  # IP address
duans = html.xpath('(//tr[@class="odd"]|//tr[@class=""])/td[3]/text()')  # port number
types = html.xpath('(//tr[@class="odd"]|//tr[@class=""])/td[6]/text()')  # type
我将之保存到txt文件
以下是完整代码
# coding: utf-8
"""Scrape proxy entries from the listing pages and save them to ip.txt.

Each output line has the form ``type , port , ip``.
"""
import requests
from lxml import etree
from urllib import request

# Browser-like request headers sent with every page fetch.
# NOTE(review): the hard-coded 'If-None-Match' makes each request conditional;
# the server may answer 304 with an empty body - confirm this header is wanted.
_headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Host': 'www.xicidaili.com',
    'If-None-Match': 'W/"b077743016dc54409ebe6b86ba7a869b"',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.75 Safari/537.36',
}
_cookies = None


def getHtml(page):
    """Download one listing page and return it parsed by lxml.

    Args:
        page: Page number; it is appended to the listing URL.

    Returns:
        The value returned by ``etree.HTML`` for the response text.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status.
    """
    url = "https://www.xicidaili.com/nn/" + str(page)
    # A timeout keeps a stalled connection from hanging the scraper forever.
    response = requests.get(url, headers=_headers, timeout=10)
    # Fail loudly on 4xx/5xx instead of parsing an error/ban page as data.
    response.raise_for_status()
    return etree.HTML(response.text)


def _iter_proxies(html):
    """Yield ``(type, port, ip)`` for every table row that has all three cells.

    Reading the cells row by row keeps the three columns aligned even when a
    row is missing one of them; such rows are skipped.
    """
    for row in html.xpath('//tr[@class="odd"]|//tr[@class=""]'):
        ip = row.xpath('td[2]/text()')
        port = row.xpath('td[3]/text()')
        kind = row.xpath('td[6]/text()')
        if ip and port and kind:
            yield kind[0], port[0], ip[0]


# The context manager closes the file even if an unexpected error escapes.
with open('ip.txt', 'w', encoding='utf-8') as out:
    for j in range(1, 10):
        print("正在爬取第" + str(j) + "页IP")
        try:
            html = getHtml(j)
        except (requests.RequestException, etree.LxmlError) as exc:
            # Best effort: report the failed page and carry on with the next.
            print("第" + str(j) + "页爬取失败:", exc)
            continue
        if html is None:
            # Nothing could be parsed from the response; skip this page.
            continue
        # Query the document once. Iterating over the root's children and
        # re-running absolute XPath queries wrote every record several times.
        for kind, port, ip in _iter_proxies(html):
            out.write(kind + ' , ' + port + ' , ' + ip + '\n')
            print(kind, port, ip)
文章来源: https://blog.csdn.net/qq_37076942/article/details/90766984