import requests
import re
import json
# Request headers sent with every HTTP call in this script; the User-Agent
# identifies the client as a desktop Chrome browser.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/76.0.3809.87 Safari/537.36"
    ),
}
# Address of the Baidu Wenku document page to download.
# Another sample page that can be substituted here:
#   https://wenku.baidu.com/view/a51a665a77232f60ddcca17a.html?from=search
# To ask for the address interactively, assign the result of input() instead.
page_url = (
    'https://wenku.baidu.com/view/830b09f5f90f76c661371ae1.html'
    '?sxts=1566282744675'
)
# Lookup table from the numeric document-type code (as a string key, '0'..'16')
# to the matching file extension. The tuple position is the code; code '0'
# maps to the empty string.
doc_types = {
    str(code): extension
    for code, extension in enumerate((
        '',
        'doc',
        'xls',
        'ppt',
        'docx',
        'xlsx',
        'pptx',
        'pdf',
        'txt',
        'wps',
        'et',
        'dps',
        'vsd',
        'rtf',
        'pot',
        'pps',
        'epub',
    ))
}
# --- Fetch the page, resolve the document id, then download its content. ---

# GB18030 is a superset of GB2312/GBK, so pages declared as gb2312 that
# actually contain GBK-only characters still decode instead of raising.
_PAGE_ENCODING = 'gb18030'
# Seconds to wait for each HTTP request before giving up instead of hanging.
_REQUEST_TIMEOUT = 30


def _fetch(url, encoding=None):
    """Download *url* with the shared headers and return the body as text.

    When *encoding* is given the raw bytes are decoded with it (undecodable
    bytes are replaced rather than aborting the download); otherwise the
    encoding detected by ``requests`` is used.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
    """
    response = requests.get(url, headers=headers, timeout=_REQUEST_TIMEOUT)
    response.raise_for_status()
    if encoding is None:
        return response.text
    return response.content.decode(encoding, errors='replace')


def _strip_jsonp(payload):
    """Return the JSON text wrapped by a JSONP response such as ``cb({...})``.

    Everything before the first ``(`` and after the last ``)`` is dropped, so
    a ``/**/`` prefix, a trailing ``;`` or trailing whitespace are tolerated.

    Raises:
        ValueError: if *payload* has no ``(...)`` wrapper.
    """
    start = payload.find('(')
    end = payload.rfind(')')
    if start == -1 or end <= start:
        raise ValueError('response is not a JSONP payload: %r' % payload[:80])
    return payload[start + 1:end]


page_content = _fetch(page_url, encoding=_PAGE_ENCODING)

# Capture only the quoted value that follows 'docId'; a non-greedy character
# class stops at the closing quote even when more fields share the line.
_doc_id_match = re.search(r"'docId'\s*:\s*'([^']+)'", page_content)
if _doc_id_match is None:
    raise ValueError('could not find docId in page: %s' % page_url)
docId = _doc_id_match.group(1)

doc_info_url = 'https://wenku.baidu.com/api/doc/getdocinfo?callback=cb&doc_id=%s' % docId
doc_info = json.loads(_strip_jsonp(_fetch(doc_info_url)))

# str() keeps the lookup working whether the API reports the code as a
# string or as a number; an unknown code still raises KeyError.
doc_type = doc_types[str(doc_info['docInfo']['docType'])]
doc_title = doc_info['docInfo']['docTitle']
total_page_num = doc_info['docInfo']['totalPageNum']

# The first character of md5sum is dropped before it is spliced into the
# query string (presumably a leading '&' — confirm against a live response).
doc_url = "https://wkretype.bdimg.com/retype/text/%s?%s&callback=cb&pn=1&rn=%s&type=%s&rsign=%s" \
    % (docId, doc_info['md5sum'][1:], total_page_num, doc_type, doc_info['rsign'])
doc_result = json.loads(_strip_jsonp(_fetch(doc_url, encoding=_PAGE_ENCODING)))
# Save the downloaded paragraphs to "<title>.txt" in the current directory.
# Only plain-text documents are handled here.
if doc_type == 'txt':
    # Replace characters that are illegal in file names on common platforms
    # so a title containing e.g. '/' or ':' cannot break the open() call.
    safe_title = re.sub(r'[\\/:*?"<>|]', '_', doc_title)
    # Open the file once (append mode, as before) and write UTF-8 explicitly
    # so the text does not depend on the platform's default encoding.
    with open('./%s.txt' % safe_title, 'a', encoding='utf-8') as f:
        for obj in doc_result:
            for parag in obj['parags']:
                # 'c' holds the paragraph content; str() guards against
                # non-string values.
                f.write(str(parag['c']))
# 以上代码示例已实现 txt 类型文档的下载;其他类型的处理方法相同,可自行按需实现。
# 来源:https://blog.csdn.net/mbh12333/article/details/99884402