python 抓取 微信公众号文章

夙愿已清 提交于 2020-10-04 04:37:01

1、下载并安装 Fiddler

  具体操作传送门 

2、第一步已完成,入门开始吧

    首先确保有微信客户端(推荐PC,移动端会多一些操作)

    启动微信和 Fiddler,找到需要抓取的公众号(还是关注一下吧,毕竟都抓取别人的数据了)。点击微信右上角的三个小圆点,选择“查看历史消息”并打开;然后在 Fiddler 中找到对应的请求数据,把整个 header 都复制下来(后期优化),最后通过 Python 代码实现抓取功能。

代码示例


import requests, sys,os
from bs4 import BeautifulSoup as bs, Comment
from selenium import webdriver
# from base import mysql_conn as db
# import random, json, threading
# import re, difflib
import time



# 模拟 手动滑动
def scroll_to_bottom(driver, step=100, pause=2):
    """Scroll the page down in small increments until it stops growing.

    Simulates a user dragging the scrollbar: the page is scrolled ``step``
    pixels at a time, and after each pass the document height is measured
    again. The loop ends once a pass finishes without the height increasing.

    Args:
        driver: Object exposing ``execute_script(script)``; called here with
            JavaScript strings only (a Selenium WebDriver in this script).
        step: Number of pixels to advance per scroll. Must be positive.
        pause: Seconds to sleep after every scroll and before re-measuring
            the page height.

    Raises:
        ValueError: If ``step`` is not positive.
    """
    if step <= 0:
        raise ValueError("step must be a positive number of pixels")

    # Plain read of the height; no assignment, so no global leaks into the page.
    height_js = "return document.body.scrollHeight"
    scroll_js = 'window.scrollTo(0, {})'

    # Offset already scrolled to.
    height = 0
    # Current total document height; treat a missing value as an empty page.
    new_height = driver.execute_script(height_js) or 0

    while height < new_height:
        for offset in range(height, new_height, step):
            driver.execute_script(scroll_js.format(offset))
            time.sleep(pause)
        # range() stops short of new_height, so land on the real bottom too.
        driver.execute_script(scroll_js.format(new_height))
        height = new_height
        time.sleep(pause)
        new_height = driver.execute_script(height_js) or 0

if __name__ == '__main__':
    # Local import: selenium is already a dependency of this script; only the
    # exception type is new here.
    from selenium.common.exceptions import WebDriverException

    # Query parameters for the "homepage"-style URL.
    url_param_dict = {
        '__biz': "MzU4OTgxODcwNw==",
        'hid': "1",
        'sn': "f130302b06c94bab6df121bdd3eaeb75",
        'scene': "18",
    }

    # Join with "&" so the query string does not start with a stray "&".
    url_param_str = "&".join(
        "{}={}".format(key, value) for key, value in url_param_dict.items()
    )
    # Result: http://mp.weixin.qq.com/mp/homepage?__biz=MzU4OTgxODcwNw==&hid=1&sn=f130302b06c94bab6df121bdd3eaeb75&scene=18
    obj_url = "http://mp.weixin.qq.com/mp/homepage?" + url_param_str

    # URL layouts differ a lot between accounts (to be tidied up later), so the
    # history-page URL captured with Fiddler replaces the one built above.
    # The " HTTP/1.1" suffix of the captured request line is not part of the
    # URL and has been dropped.
    obj_url = "http://mp.weixin.qq.com/mp/profile_ext?action=home&__biz=MzU0MTk5OTM2NA==&scene=124&uin=MTgyMDMwNDI1OA%3D%3D&key=37f7e61b23960f2e95e3c72f5f605fbdef18f7d012162e7b1ae1ba55d31906087b528216e36185b868cd1d4325536b540abe5e3062f41b8ff70883a5b8227ec700fad1811b68d8f785e414aabf0bfe33&devicetype=Windows+10&version=62060833&lang=zh_CN&a8scene=7&pass_ticket=macvS30urM6yA%2FOk9CnsIu0yqEefKjpQFBiFt4ijoL1fjkS0lqJMfUW%2BPSAiptiZ&winzoom=1"
    print(obj_url)

    # Request headers copied from the captured request.
    # NOTE(review): the key / pass_ticket / Cookie values are session
    # credentials hard-coded in source; consider loading them from config.
    headers = {
        "Host": "mp.weixin.qq.com",
        "Connection": "keep-alive",
        "Cache-Control": "max-age=0",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36 QBCore/3.53.1159.400 QQBrowser/9.0.2524.400 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 MicroMessenger/6.5.2.501 NetType/WIFI WindowsWechat",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.6,en;q=0.5;q=0.4",
        "Cookie": "rewardsn=; wxtokenkey=777; wxuin=1820304258; devicetype=android-27; version=27000f51; lang=zh_CN; pass_ticket=macvS30urM6yA/Ok9CnsIu0yqEefKjpQFBiFt4ijoL1fjkS0lqJMfUWPSAiptiZ; wap_sid2=CILH/uMGEnBFNmxKcHZkUzZfMTBQZjVfLTZKeWVYdEtKZEtpOWh6X01zSUZnQWdIMGQ4SjdDUXhqV2p3VnJ1bGtZRmplMUYyZ2NiNS1IVDdySndKNUYyaTlGMHRHVmhtUTZKMTZKMHppblRZV0R5QWdPOHNCQUFBMOq69vcFOA1AlU4=",
    }

    # Alternative using plain `requests` (drawback: cannot trigger the
    # asynchronous loading of the next page):
    # request = requests.get(obj_url, headers=headers)
    # soup_info = bs(request.content, features="html.parser")

    # Drive a real browser through selenium instead.
    # Raw string: the Windows path contains backslashes that must stay literal.
    chromedriver = r"C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe"
    os.environ["webdriver.chrome.driver"] = chromedriver

    option = webdriver.ChromeOptions()
    # HTTP headers are not Chrome command-line switches; the user agent is the
    # only entry of `headers` with a matching switch. The previous loop
    # iterated the dict keys character by character and passed junk arguments.
    # NOTE(review): the remaining headers (notably Cookie) are not applied to
    # the browser session; confirm whether the page needs them.
    option.add_argument('--user-agent=' + headers["User-Agent"])

    browser = webdriver.Chrome(chromedriver, options=option)
    try:
        browser.get(obj_url)

        # Give the first page time to render before scrolling.
        time.sleep(5)
        scroll_to_bottom(browser)

        # Parse the fully loaded page with BeautifulSoup.
        soup_info = bs(browser.page_source, features="html.parser")
        print(soup_info)
    except WebDriverException:
        print("连接网络超时,窗口即将关闭")
        browser.execute_script('window.stop()')  # stop loading the page
    finally:
        # Always shut the browser down, even when loading or scrolling failed.
        browser.quit()

示例代码中的 header 数据会有时效性(网上说的,还没有验证),这个问题目前还没有解决,诶…………

 

搬砖,搬砖…………

标签
易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!