"""Batch-download the list pages of a Baidu Tieba forum.

Tieba paginates a forum through the ``pn`` query parameter in steps of 50:
page 1 is ``pn=0``, page 2 is ``pn=50``, page 3 is ``pn=100``, and so on,
e.g. https://tieba.baidu.com/f?kw=%E6%97%85%E8%A1%8C%E9%9D%92%E8%9B%99&ie=utf-8&pn=50

The forum name (usually Chinese) has to be percent-encoded and joined with
the other query parameters; ``urllib.parse.urlencode`` does both.

Run as a script, it asks for a forum name and a page count on stdin, then
saves pages 1..N as ``<name>第<k>页.html`` in the current directory.
"""
from urllib import request, parse

BASE_URL = 'https://tieba.baidu.com/f?'

# Step of the ``pn`` query parameter between two consecutive pages.
POSTS_PER_PAGE = 50

# Seconds to wait on the network before giving up instead of hanging forever.
REQUEST_TIMEOUT = 30

# Browser-like user agent sent with every request; built once because it
# never changes between pages.
HEADERS = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0'
}


def build_page_url(name: str, page_number: int) -> str:
    """Return the URL of list page *page_number* (1-based) of forum *name*.

    The forum name is percent-encoded as UTF-8 by ``urlencode``.
    """
    query = parse.urlencode({
        'kw': name,
        'pn': (page_number - 1) * POSTS_PER_PAGE,
    })
    return BASE_URL + query


def fetch_page(url: str) -> str:
    """Download *url* and return the response body decoded as UTF-8.

    Raises:
        urllib.error.URLError: on network failures or timeouts.
        UnicodeDecodeError: if the body is not valid UTF-8.
    """
    req = request.Request(url, headers=HEADERS)
    # The context manager closes the connection even when read/decode fails.
    with request.urlopen(req, timeout=REQUEST_TIMEOUT) as response:
        return response.read().decode('utf-8')


def save_page(name: str, page_number: int, html: str) -> None:
    """Write *html* to ``<name>第<page_number>页.html`` in the current directory."""
    filename = name + '第' + str(page_number) + '页' + '.html'
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(html)


def main() -> None:
    """Prompt for a forum name and page count, then download pages 1..N."""
    name = input("请输入贴吧名称:")
    page = input("请输入贴吧页数:")  # input() always returns a string

    try:
        page_count = int(page)
    except ValueError:
        # Exit with a readable message rather than a raw traceback.
        raise SystemExit('页数必须是整数: ' + repr(page)) from None

    for page_number in range(1, page_count + 1):
        url = build_page_url(name, page_number)
        print(url)
        save_page(name, page_number, fetch_page(url))


if __name__ == '__main__':
    main()
# Sample run (console output):
# C:\Users\Apple\PycharmProjects\spider\venv\Scripts\python.exe C:/Users/Apple/PycharmProjects/spider/04tieba.py
# 请输入贴吧名称:旅行青蛙
# 请输入贴吧页数:2
# https://tieba.baidu.com/f?kw=%E6%97%85%E8%A1%8C%E9%9D%92%E8%9B%99&pn=0
# https://tieba.baidu.com/f?kw=%E6%97%85%E8%A1%8C%E9%9D%92%E8%9B%99&pn=50
# Process finished with exit code 0
# 来源:CSDN
# 作者:Arthur54271
# 链接:https://blog.csdn.net/zbrj12345/article/details/79913145