# 关于简单的小说爬取
import requests #requests是一个HTTP请求库
from pyquery import PyQuery#网页解析 原生CSS解析器 css层叠样式表
# Download one chapter page and save it as a UTF-8 text file.
def get_one_chapter(chapter_url=None, name=None, shunxu=None, timeout=10):
    """Fetch a single chapter page and write its title and text to a file.

    The text of the page's ``h1`` element(s) is used as the title and the
    text of the ``#nr1`` element as the chapter body. Both are printed and
    then written to ``<shunxu><title>.txt`` in the current working directory.

    :param chapter_url: URL of the chapter page to download.
    :param name: Book title; accepted for caller compatibility, not used here.
    :param shunxu: Sequence number of the chapter, used as the file-name prefix.
    :param timeout: Seconds to wait for the server before giving up.
    :return: None
    :raises requests.HTTPError: if the server answers with an error status.
    """
    # Without a timeout, requests can block forever on a stalled connection.
    response = requests.get(url=chapter_url, timeout=timeout)
    # Fail loudly instead of saving an error page as if it were chapter text.
    response.raise_for_status()

    doc = PyQuery(response.text)
    title = doc("h1").text()
    content1 = doc("#nr1").text()
    print(shunxu, title, content1)

    # Titles come from the web page and may contain path separators or other
    # characters that are illegal in file names; replace them with "_".
    unsafe = str.maketrans({ch: "_" for ch in '\\/:*?"<>|\r\n\t'})
    filename = str(shunxu) + title.translate(unsafe) + ".txt"

    # Mode "w" so that re-running the scraper overwrites the chapter file
    # instead of appending a second copy of the same text.
    with open(file=filename, mode="w", encoding="utf_8") as f:
        # write() accepts a string only, not objects or lists.
        f.write(title + " \n\n " + content1)
# Read the book's table of contents and download every chapter it links to.
def get_index(index_url='https://www.luoxia.com/qing/', skip=9, timeout=10):
    """Download all chapters listed on a book's index page.

    Collects the ``a`` elements inside ``#content-list``, drops the first
    ``skip`` of them, and calls ``get_one_chapter`` for each remaining link
    with a running sequence number that starts at 1.

    :param index_url: URL of the table-of-contents page.
    :param skip: Number of leading links to ignore. The default of 9 is kept
        from the original script; presumably those links are not regular
        chapters on the default site -- confirm before changing.
    :param timeout: Seconds to wait for the index page before giving up.
    :return: None
    :raises requests.HTTPError: if the index page returns an error status.
    """
    from urllib.parse import urljoin

    # Without a timeout, requests can block forever on a stalled connection.
    response = requests.get(url=index_url, timeout=timeout)
    response.raise_for_status()
    doc = PyQuery(response.text)

    links = doc('#content-list a')
    name = doc("h1").text()  # book title, forwarded to get_one_chapter

    # Anchors without an href yield None, which would crash the download
    # call; drop them before numbering so the sequence has no gaps.
    hrefs = [link.attr.href for link in list(links.items())[skip:]]
    hrefs = [href for href in hrefs if href]

    for shunxu, href in enumerate(hrefs, start=1):
        # Resolve relative links against the index page; absolute URLs are
        # returned unchanged by urljoin.
        chapter_url = urljoin(index_url, href)
        print(chapter_url)
        get_one_chapter(chapter_url=chapter_url, name=name, shunxu=shunxu)
# Start the download. This call is not behind a __main__ guard, so it also
# runs when the file is imported as a module.
get_index()
# 简单的实战练习
# 来源:CSDN
# 作者:tingyushuo_
# 链接:https://blog.csdn.net/tingyushuo_/article/details/103747654