利用Beautiful Soup4和requests库来爬取小说内容:
使用bs4的CSS选择器时要根据对应网页的标签内容而定。
# *-* coding:utf-8 *-*
# 爬取 落霞小说网-<余庆年>
from bs4 import BeautifulSoup
import requests
html = requests.get('https://www.luoxia.com/qing/')
soup = BeautifulSoup(html.text, 'html.parser')
chapters = soup.select("div ul li a")
count = 0
for chapter in chapters:
print(chapter.string)
count += 1
if 5 == count: # 爬取5章
break
novel = requests.get(chapter['href']) # 获得对应章节的html代码
newSoup = BeautifulSoup(novel.text, 'html.parser')
with open('novel.txt', 'a+', encoding='utf-8') as f:
print(chapter.string + '\n', file=f)
words = newSoup.select("div#nr1 p") # 爬取小说文本
for word in words:
if word.string != None: # 如果有内容才写入文件
print(word.string, file=f)
print('-------------------------------------\n', file=f)
部分输出:
来源:CSDN
作者:ASCE_S
链接:https://blog.csdn.net/ASCE_S/article/details/104045389