1、下载并安装 Fiddler
2、第一步已完成,入门开始吧
首先确保已安装微信客户端(推荐 PC 版,移动端需要多一些操作)
启动微信和 Fiddler,找到需要抓取的公众号(还是关注一下吧,毕竟都抓取别人的数据了)。点击微信右上角的三个小圆点,选择“查看历史消息”并打开;然后在 Fiddler 中找到对应的请求数据,把整个请求头(header)都复制下来(后期优化),最后用 Python 代码实现抓取功能。
代码示例
import requests, sys,os
from bs4 import BeautifulSoup as bs, Comment
from selenium import webdriver
# from base import mysql_conn as db
# import random, json, threading
# import re, difflib
import time
# Simulate a user manually scrolling down the page.
def scroll_to_bottom(driver, *, step=100, pause=2):
    """Scroll the page down in small steps until its height stops growing.

    The page height is re-read after every pass, so content that is appended
    while scrolling (lazy / asynchronous loading) triggers another pass that
    starts where the previous one ended.

    Args:
        driver: Object exposing ``execute_script(js)`` (a Selenium WebDriver).
        step: Distance in pixels between two consecutive scroll positions.
        pause: Seconds to sleep after every scroll step and once more after
            each pass, before the page height is measured again.

    Raises:
        ValueError: If ``step`` is not positive.
    """
    if step <= 0:
        raise ValueError("step must be a positive number of pixels")

    # Read the height without assigning it to anything: the previous
    # `return action=...` form created a global `action` variable in the page.
    height_js = "return document.body.scrollHeight"

    # Nothing has been scrolled yet.
    scrolled_to = 0
    # Total height of the document right now.
    page_height = driver.execute_script(height_js)

    while scrolled_to < page_height:
        # Walk the scroll bar from the last reached offset towards the bottom.
        for offset in range(scrolled_to, page_height, step):
            driver.execute_script('window.scrollTo(0, {})'.format(offset))
            time.sleep(pause)
        scrolled_to = page_height
        # Give the page a moment to append more content before re-measuring.
        time.sleep(pause)
        page_height = driver.execute_script(height_js)
if __name__ == '__main__':
    # Script entry point: open a WeChat official-account history page in
    # Chrome, scroll until no more articles are loaded, then dump the HTML.
    #
    # Imported here so this section brings its own extra names into scope.
    from urllib.parse import urlencode

    from selenium.common.exceptions import WebDriverException
    from selenium.webdriver import ChromeOptions

    # Request parameters of the account "homepage" URL.
    url_param_dict = {
        '__biz': "MzU4OTgxODcwNw==",
        'hid': "1",
        'sn': "f130302b06c94bab6df121bdd3eaeb75",
        'scene': "18",
    }
    # urlencode() joins the pairs with "&" (no stray leading "&" right after
    # the "?") and percent-escapes characters such as the "==" padding.
    obj_url = "http://mp.weixin.qq.com/mp/homepage?" + urlencode(url_param_dict)

    # URL layouts differ a lot between accounts (to be unified later), so for
    # now the history-page URL captured in Fiddler replaces the one built
    # above. The " HTTP/1.1" suffix of the captured request line is not part
    # of the URL and has been dropped.
    # NOTE(review): key / pass_ticket look like session-bound credentials;
    # capture fresh ones instead of relying on the values committed here.
    obj_url = "http://mp.weixin.qq.com/mp/profile_ext?action=home&__biz=MzU0MTk5OTM2NA==&scene=124&uin=MTgyMDMwNDI1OA%3D%3D&key=37f7e61b23960f2e95e3c72f5f605fbdef18f7d012162e7b1ae1ba55d31906087b528216e36185b868cd1d4325536b540abe5e3062f41b8ff70883a5b8227ec700fad1811b68d8f785e414aabf0bfe33&devicetype=Windows+10&version=62060833&lang=zh_CN&a8scene=7&pass_ticket=macvS30urM6yA%2FOk9CnsIu0yqEefKjpQFBiFt4ijoL1fjkS0lqJMfUW%2BPSAiptiZ&winzoom=1"
    print(obj_url)

    # Request headers copied verbatim from the Fiddler capture.
    headers = {
        "Host": "mp.weixin.qq.com",
        "Connection": "keep-alive",
        "Cache-Control": "max-age=0",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36 QBCore/3.53.1159.400 QQBrowser/9.0.2524.400 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 MicroMessenger/6.5.2.501 NetType/WIFI WindowsWechat",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.6,en;q=0.5;q=0.4",
        "Cookie": "rewardsn=; wxtokenkey=777; wxuin=1820304258; devicetype=android-27; version=27000f51; lang=zh_CN; pass_ticket=macvS30urM6yA/Ok9CnsIu0yqEefKjpQFBiFt4ijoL1fjkS0lqJMfUWPSAiptiZ; wap_sid2=CILH/uMGEnBFNmxKcHZkUzZfMTBQZjVfLTZKeWVYdEtKZEtpOWh6X01zSUZnQWdIMGQ4SjdDUXhqV2p3VnJ1bGtZRmplMUYyZ2NiNS1IVDdySndKNUYyaTlGMHRHVmhtUTZKMTZKMHppblRZV0R5QWdPOHNCQUFBMOq69vcFOA1AlU4=",
    }

    # A plain `requests.get(obj_url, headers=headers)` can fetch the first
    # page, but it cannot trigger the asynchronously loaded following pages,
    # so a real browser is driven through Selenium instead.

    # Raw string: the Windows path contains backslashes that must not be
    # interpreted as escape sequences.
    chromedriver = r"C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe"
    os.environ["webdriver.chrome.driver"] = chromedriver

    option = ChromeOptions()
    # Chrome has no command-line switch for arbitrary request headers; the
    # User-Agent is the only captured header that can be set this way.
    option.add_argument('--user-agent=' + headers["User-Agent"])

    browser = webdriver.Chrome(chromedriver, options=option)
    try:
        # Cookies can only be set for the domain that is currently loaded, so
        # open the page once, install the captured cookies, then reload.
        browser.get(obj_url)
        for cookie_pair in headers["Cookie"].split(";"):
            # partition() splits on the first "=" only; values may contain "=".
            cookie_name, _, cookie_value = cookie_pair.strip().partition("=")
            if cookie_name:
                browser.add_cookie({"name": cookie_name, "value": cookie_value})
        browser.get(obj_url)

        time.sleep(5)
        scroll_to_bottom(browser)

        # Parse the fully scrolled page with BeautifulSoup.
        soup_info = bs(browser.page_source, features="html.parser")
        print(soup_info)
    except WebDriverException as err:
        print("连接网络超时,窗口即将关闭")
        print(err)
        # Stop loading whatever is still pending.
        browser.execute_script('window.stop()')
    finally:
        # Always shut the browser down, even when something above failed.
        browser.quit()
示例代码中的 header 数据可能有时效性(网上的说法,还没有验证),这个问题目前还没有解决,诶…………
搬砖,搬砖…………
来源:oschina
链接:https://my.oschina.net/u/3268486/blog/4332968