python3 爬虫百度贴吧
import urllib.request import urllib.parse from lxml import etree def loadPage(url): """ 作用:根据url发送请求,获取服务器响应文件 url: 需要爬取的url地址 """ #print url #美女 # headers = {"User-Agent" : "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0)AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11"} request = urllib.request.Request(url) html = urllib.request.urlopen(request).read() # 解析HTML文档为HTML DOM模型 content = etree.HTML(html) #print content # 返回所有匹配成功的列表集合 link_list = content.xpath('//div[@class="t_con cleafix"]/div/div/div/a/@href') #link_list = content.xpath('//a[@class="j_th_tit"]/@href') for