This is my first blog post; it documents how I scraped a web novel with Python.
The idea is to collect the URLs of all chapters first, then crawl each chapter and write it out to a file.
Start from the novel's chapter index page and collect the chapter URLs:
html = urllib.request.urlopen("http://www.5k5m.com/book/0/426771/").read()
html = html.decode("gbk")  # the index page is encoded as GBK
reg = r'<dd><a href="(.*?)">(.*?)</a></dd>'  # (.*?) is a non-greedy capture group
reg = re.compile(reg)  # compile the pattern; findall() returns a list of matches
urls = re.findall(reg, html)
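To make the structure of the result concrete, here is a tiny self-contained demo (the sample HTML below is invented purely for illustration): with two capture groups, findall returns a list of (href, title) tuples.

import re

# Minimal demo of the pattern above; the sample HTML is made up for illustration only.
sample = '<dd><a href="/book/0/426771/1.html">Chapter 1</a></dd>'
print(re.findall(r'<dd><a href="(.*?)">(.*?)</a></dd>', sample))
# -> [('/book/0/426771/1.html', 'Chapter 1')]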
You can see the page has two parts, a "latest chapters" block and the full chapter list, so the scraped links need to be told apart.
flag = 1
while flag:
    if ' target=' in urls[0][0]:
        urls.pop(0)  # the match result is a list, so just drop the leading entries
    else:
        flag = 0
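As a side note (my sketch, not the original code): since only the leading "latest chapters" links carry the target attribute, the same trimming can be written with itertools.dropwhile, which removes exactly that leading run.

from itertools import dropwhile

# Drop the leading entries whose href contains ' target=' (the "latest chapters" block),
# which is exactly what the while/pop loop above does.
urls = list(dropwhile(lambda u: ' target=' in u[0], urls))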
Next comes the concurrent part. At first, without it, the server would drop the connection after a while, so I switched to a multiprocessing pool (the code uses processes rather than threads).
charts_url = []
for url in urls:
    chapter_title = url[1]
    chapter_url = "http://www.5k5m.com" + url[0]  # absolute URL of the chapter
    charts_url.append(chapter_url)
start = time.time()
p = multiprocessing.Pool()
num = 1
for i in charts_url:
    p.apply_async(thread_getBook, args=(i, num))  # num is used as the output file name
    num = num + 1
print('Waiting for all chapters to be fetched......')
p.close()
p.join()
end = time.time()
print('All chapters fetched in %s seconds, files generated ................' % (end - start))
return
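A small variation I would consider (a sketch under my own assumptions, not the post's code): collect (url, title) pairs and hand them to Pool.starmap, so the worker receives the real chapter title instead of a running number, and the pool blocks until everything is written.

# Sketch: pass (chapter_url, chapter_title) pairs so the output files are named after
# the chapter titles; pairing them up like this is my assumption, not the original code.
pairs = [("http://www.5k5m.com" + u[0], u[1]) for u in urls]
with multiprocessing.Pool() as p:
    p.starmap(thread_getBook, pairs)  # blocks until all chapters are written
    # (titles containing characters that are illegal in file names would still need sanitizing)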
Next, open a chapter page.
Each chapter is split across several pages, so every page has to be fetched separately.
An ad block gets captured along with the text every time, so I use BeautifulSoup to locate it by its id and strip it out.
def thread_getBook(chapter_url, chapter_title):
    chapter_html = urllib.request.urlopen(chapter_url).read()  # source of the chapter page
    chapter_html = chapter_html.decode("utf-8")
    chapter_reg = r'<div class="chapter">(.*?)</div>'
    chapter_reg = re.compile(chapter_reg, re.S)
    chapter_content = re.findall(chapter_reg, chapter_html)
    soup = BeautifulSoup(chapter_html, "html.parser")
    b = soup.select('#p-hed')  # the ad block, located by its id
    file_name = str(chapter_title) + '.txt'
    print(file_name)
    with open(file_name, 'a', encoding='utf-8') as f:
        for content in chapter_content:  # findall returns a one-element list, so unpack it here
            a = Replace(content, str(b[0]))
            f.write(a + getMore(chapter_html))
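For the ad removal itself, an alternative I find cleaner (a sketch that assumes the '#p-hed' id and the 'div.chapter' container from this post): delete the ad node from the parsed tree with decompose() and read the text back with get_text(), instead of string-replacing the raw HTML.

def chapter_text_without_ad(chapter_html):
    # Parse the page, drop the ad node in place, then take the chapter text.
    soup = BeautifulSoup(chapter_html, "html.parser")
    for ad in soup.select('#p-hed'):
        ad.decompose()
    div = soup.select_one('div.chapter')
    return div.get_text("\n", strip=True) if div else ""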
def getMore(html):
    soup = BeautifulSoup(html, "html.parser")
    a = soup.select('#btnNext')
    b = soup.select('#p-hed')
    if a and "下一页" in a[0].get_text():  # make sure we follow "next page", not "next chapter"
        for i in a:
            url = i.attrs['href']  # pull the link out of the href attribute
            chapter_url = "http://m.5k5m.com" + url
            chapter_html = urllib.request.urlopen(chapter_url).read()
            chapter_html = chapter_html.decode("utf-8")
            chapter_reg = r'<div class="chapter">(.*?)</div>'
            chapter_reg = re.compile(chapter_reg, re.S)
            chapter_content = re.findall(chapter_reg, chapter_html)
            for content in chapter_content:  # text of the next page
                res = Replace(content, str(b[0]))
                return res + getMore(chapter_html)  # recurse until there is no "下一页" link
    return " "
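Since getMore calls itself once per extra page, a chapter with many pages means one recursion level per page. An equivalent iterative version (my sketch, reusing the post's selectors and the m.5k5m.com prefix) keeps following the "下一页" link in a loop instead:

def get_remaining_pages(first_page_html):
    parts, html = [], first_page_html
    while True:
        soup = BeautifulSoup(html, "html.parser")
        nxt = soup.select('#btnNext')
        if not nxt or "下一页" not in nxt[0].get_text():
            break  # no further pages in this chapter
        url = "http://m.5k5m.com" + nxt[0].attrs['href']
        html = urllib.request.urlopen(url).read().decode("utf-8")
        page_soup = BeautifulSoup(html, "html.parser")
        hed = page_soup.select('#p-hed')
        ad = str(hed[0]) if hed else ""
        for content in re.findall(r'<div class="chapter">(.*?)</div>', html, re.S):
            parts.append(Replace(content, ad))
    return " ".join(parts)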
Complete code:
import time
import urllib.request
import multiprocessing
from bs4 import BeautifulSoup
import re

# Functions for crawling the web novel
def Replace(content, ad):  # strip the ad block and the <p> markup out of the chapter text
    content = content.replace(ad, " ")
    content = content.replace("</p>\n<p> ", "")
    content = content.replace("</p>", " ")
    content = content.replace("<p>", " ")
    return content
def getMore(html):
    soup = BeautifulSoup(html, "html.parser")
    a = soup.select('#btnNext')
    b = soup.select('#p-hed')
    if a and "下一页" in a[0].get_text():  # make sure we follow "next page", not "next chapter"
        for i in a:
            url = i.attrs['href']  # pull the link out of the href attribute
            chapter_url = "http://m.5k5m.com" + url
            chapter_html = urllib.request.urlopen(chapter_url).read()  # source of the next page
            chapter_html = chapter_html.decode("utf-8")
            chapter_reg = r'<div class="chapter">(.*?)</div>'
            chapter_reg = re.compile(chapter_reg, re.S)
            chapter_content = re.findall(chapter_reg, chapter_html)
            for content in chapter_content:  # text of the next page
                res = Replace(content, str(b[0]))
                return res + getMore(chapter_html)  # recurse until there is no "下一页" link
    return " "
def thread_getBook(chapter_url, chapter_title):
    chapter_html = urllib.request.urlopen(chapter_url).read()  # source of the chapter page
    chapter_html = chapter_html.decode("utf-8")
    chapter_reg = r'<div class="chapter">(.*?)</div>'
    chapter_reg = re.compile(chapter_reg, re.S)
    chapter_content = re.findall(chapter_reg, chapter_html)
    soup = BeautifulSoup(chapter_html, "html.parser")
    b = soup.select('#p-hed')  # the ad block, located by its id
    file_name = str(chapter_title) + '.txt'
    print(file_name)
    with open(file_name, 'a', encoding='utf-8') as f:
        for content in chapter_content:  # findall returns a one-element list, so unpack it here
            a = Replace(content, str(b[0]))
            f.write(a + getMore(chapter_html))
def getNovelContent():
    html = urllib.request.urlopen("http://www.5k5m.com/book/0/426771/").read()
    html = html.decode("gbk")  # the index page is encoded as GBK
    reg = r'<dd><a href="(.*?)">(.*?)</a></dd>'  # regex for the chapter links
    reg = re.compile(reg)  # compiling is optional; it just speeds up repeated use
    urls = re.findall(reg, html)
    # The "latest chapters" block at the top carries a target attribute,
    # so drop those entries from the front of the list.
    flag = 1
    while flag:
        if ' target=' in urls[0][0]:
            urls.pop(0)
        else:
            flag = 0
    charts_url = []
    for url in urls:
        chapter_title = url[1]
        chapter_url = "http://www.5k5m.com" + url[0]  # absolute URL of the chapter
        charts_url.append(chapter_url)
    start = time.time()
    p = multiprocessing.Pool()
    num = 1
    for i in charts_url:
        p.apply_async(thread_getBook, args=(i, num))  # num is used as the output file name
        num = num + 1
    print('Waiting for all chapters to be fetched......')
    p.close()
    p.join()
    end = time.time()
    print('All chapters fetched in %s seconds, files generated ................' % (end - start))
    return
if __name__ == '__main__':
    getNovelContent()
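One caveat about apply_async in getNovelContent: exceptions raised inside thread_getBook are silently swallowed unless the AsyncResult objects are checked. A small sketch of how I would surface them (my addition, not in the original post):

# Keep the AsyncResult objects and call .get() after join() to see worker errors.
results = [p.apply_async(thread_getBook, args=(u, i)) for i, u in enumerate(charts_url, 1)]
p.close()
p.join()
for r in results:
    r.get()  # re-raises any exception that happened inside the worker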
I'm still a beginner and there is plenty here that could be improved; I hope to keep getting better.