Python实现 百度文库资源下载

给你一囗甜甜゛ 提交于 2019-11-28 01:25:41
import requests
import re
import json

# Browser-like request headers passed to each requests.get() call below.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/76.0.3809.87 Safari/537.36"
    ),
}

# URL of the Baidu Wenku document page to download (hard-coded sample).
page_url = 'https://wenku.baidu.com/view/830b09f5f90f76c661371ae1.html?sxts=1566282744675'
# Alternative sample page:
#   'https://wenku.baidu.com/view/a51a665a77232f60ddcca17a.html?from=search'
# To ask the user for the page address instead of hard-coding it:
#   input('Enter the Baidu Wenku page URL to download: ')
# File extensions ordered by type code: the index of each entry is the code
# (0 has no extension). The script looks the code up with the 'docType'
# value returned by the document-info request.
_DOC_TYPE_EXTENSIONS = (
    '', 'doc', 'xls', 'ppt', 'docx', 'xlsx', 'pptx', 'pdf', 'txt',
    'wps', 'et', 'dps', 'vsd', 'rtf', 'pot', 'pps', 'epub',
)
# Keys are the codes as strings ('0' .. '16'), values are the extensions.
doc_types = {str(code): ext for code, ext in enumerate(_DOC_TYPE_EXTENSIONS)}
# Download the document's HTML page. Only the docId is read from it here, so
# undecodable bytes are dropped instead of aborting with UnicodeDecodeError.
# The timeout keeps the script from hanging forever on a stalled connection.
page_content = requests.get(page_url, headers=headers, timeout=10).content.decode(
    encoding='gb2312', errors='ignore')

# Capture only the quoted value that follows 'docId'. The character class
# [^']+ stops at the closing quote, so further quoted values on the same line
# are not swallowed (a greedy ".+'" would run to the last quote on the line).
_doc_id_match = re.search(r"'docId'\s*:\s*'([^']+)'", page_content)
if _doc_id_match is None:
    # Fail with a message that names the page instead of a bare IndexError.
    raise ValueError('docId not found in page: %s' % page_url)
docId = _doc_id_match.group(1)

# Ask the document-info endpoint for metadata about docId. Because the URL
# passes callback=cb, the reply is JSONP: the JSON payload wrapped in a
# call such as /**/cb({...}).
doc_info_url = 'https://wenku.baidu.com/api/doc/getdocinfo?callback=cb&doc_id=%s' % docId
# The timeout keeps the script from hanging forever on a stalled connection.
_doc_info_jsonp = requests.get(doc_info_url, headers=headers, timeout=10).text
# Parse exactly what lies between the wrapper's opening '(' and the final ')'.
# Unlike replace(...)[:-1], this tolerates a trailing ';' or newline after the
# call and never touches the payload itself.
doc_info = json.loads(
    _doc_info_jsonp[_doc_info_jsonp.index('(') + 1:_doc_info_jsonp.rindex(')')])

# Pull the fields used later out of the 'docInfo' section of the response.
_doc_meta = doc_info['docInfo']
doc_type = doc_types[_doc_meta['docType']]  # type code -> file extension
doc_title = _doc_meta['docTitle']           # later used as the output file name
total_page_num = _doc_meta['totalPageNum']  # later sent as the 'rn' query value

# Build the content URL. doc_info['md5sum'][1:] drops the first character of
# the md5sum value before it is used as the start of the query string
# (presumably a leading '&' -- confirm against a live response).
doc_url = "https://wkretype.bdimg.com/retype/text/%s?%s&callback=cb&pn=1&rn=%s&type=%s&rsign=%s" \
          % (docId, doc_info['md5sum'][1:], total_page_num, doc_type, doc_info['rsign'])
# errors='replace' keeps one undecodable byte from aborting the whole download;
# the timeout keeps the script from hanging forever on a stalled connection.
_doc_jsonp = requests.get(doc_url, headers=headers, timeout=10).content.decode(
    encoding='gb2312', errors='replace')
# Strip the JSONP wrapper by position (first '(' .. last ')') rather than with
# str.replace('cb(', ''), which would also delete every "cb(" that happens to
# occur inside the document text itself.
doc_result = json.loads(_doc_jsonp[_doc_jsonp.index('(') + 1:_doc_jsonp.rindex(')')])



# Only plain-text documents are handled: write every paragraph's text, in
# order, to "<title>.txt" in the current directory.
if doc_type == 'txt':
    # Replace path separators and characters that Windows forbids in file
    # names, so an arbitrary document title cannot make open() fail or point
    # outside the current directory.
    _safe_title = re.sub(r'[\\/:*?"<>|]', '_', str(doc_title))
    # Open the file once rather than once per paragraph. Append mode is kept,
    # so an existing file with the same name is extended, never truncated.
    # An explicit UTF-8 encoding avoids UnicodeEncodeError on platforms whose
    # default encoding cannot represent the text.
    with open('./%s.txt' % _safe_title, 'a', encoding='utf-8') as f:
        for obj in doc_result:
            for parag in obj['parags']:
                # parag['c'] holds the paragraph content; str() guards
                # against non-string values.
                f.write(str(parag['c']))

上面的代码示例只实现了 txt 类型资源的下载;其他类型的文档可以按同样的思路处理,根据需要自行扩展即可。

易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!