Scraping a free web novel with Python

Posted by 家住魔仙堡 on 2020-02-26 11:06:22
This is my first blog post, documenting how I scraped a novel with Python.
The idea is to first collect the URLs of all chapters, then crawl each one and write it out to a file.
Start with the novel's table-of-contents page and collect the chapter URLs:
    html = urllib.request.urlopen("http://www.5k5m.com/book/0/426771/").read()
    html = html.decode("gbk")  # the index page is GBK-encoded
    reg = r'<dd><a href="(.*?)">(.*?)</a></dd>'  # two non-greedy groups: href and chapter title
    reg = re.compile(reg)  # compile once; used with findall() below
    urls = re.findall(reg, html)  # returns a list of (href, title) tuples
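
Each element of urls is a (href, title) tuple. A quick sanity check of the first few matches (just a sketch; the exact href format depends on the site):

    for href, title in urls[:3]:
        print(href, title)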
You can see the page has two sections, "Latest Chapters" and the full table of contents, so the scraped links have to be told apart:

[Screenshot: the table-of-contents page, with a "Latest Chapters" block above the full chapter list]

    flag = 1
    while flag:
        if ' target=' in urls[0][0]:
            urls.pop(0)  # drop the duplicate "Latest Chapters" links at the top; urls is a plain list
        else:
            flag = 0
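
Equivalently, assuming the ' target=' links only come from the "Latest Chapters" block, one filtering pass does the same job:

    urls = [u for u in urls if ' target=' not in u[0]]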

Next is the concurrency part. At first I fetched chapters one by one, and after a while the server would drop the connection, so I switched to fetching them in parallel (with a multiprocessing.Pool).

    charts_url = []
    for url in urls:
        chapter_title = url[1]  # chapter title (collected but not used below)
        chapter_url = "http://www.5k5m.com" + url[0]  # absolute URL of the chapter
        charts_url.append(chapter_url)
    p = multiprocessing.Pool()
    num = 1
    for i in charts_url:
        # num is passed as the title, so chapters are saved as 1.txt, 2.txt, ...
        p.apply_async(thread_getBook, args=(i, num))
        num = num + 1
    print('Waiting for all chapters to be fetched......')
    p.close()
    p.join()
    print('Generation finished ................')
    return
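
If the server still drops connections now and then, a small retry wrapper around urlopen can also help. A minimal sketch (the retry count and delay are my own arbitrary choices, not from the original script):

import time
import urllib.request

def fetch(url, retries=3, delay=2):
    # retry transient network failures with a short pause between attempts
    for attempt in range(retries):
        try:
            return urllib.request.urlopen(url).read()
        except OSError:
            if attempt == retries - 1:
                raise
            time.sleep(delay)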

Next, open a chapter page and take a look:
[Screenshot: a chapter page; the text is split across several pages]
Each chapter is split into several pages, so every page has to be fetched separately.
[Screenshot: the ad block that gets captured along with the chapter text]
The regex also captures an ad block on every page, so I use BeautifulSoup to locate it by its id (#p-hed) and strip it out:

def thread_getBook(chapter_url, chapter_title):
    chapter_html = urllib.request.urlopen(chapter_url).read()  # raw HTML of the chapter page
    chapter_html = chapter_html.decode("utf-8")  # chapter pages are UTF-8, unlike the GBK index
    chapter_reg = r'<div class="chapter">(.*?)</div>'
    chapter_reg = re.compile(chapter_reg, re.S)  # re.S lets . match newlines
    chapter_content = re.findall(chapter_reg, chapter_html)

    soup = BeautifulSoup(chapter_html, "html.parser")
    b = soup.select('#p-hed')  # the ad block, identified by its id

    file_name = str(chapter_title) + '.txt'
    print(file_name)
    with open(file_name, 'a') as f:
        for content in chapter_content:  # findall returns a one-element list, so unpack it
            a = Replace(content, str(b[0]))
        f.write(a + getMore(chapter_html))
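
For comparison, the same ad removal can be done entirely in BeautifulSoup, without the regex and string replacements. A sketch under the same assumptions about the .chapter class and #p-hed id:

from bs4 import BeautifulSoup

def extract_text(chapter_html):
    soup = BeautifulSoup(chapter_html, "html.parser")
    ad = soup.select_one('#p-hed')
    if ad is not None:
        ad.decompose()  # remove the ad block from the parse tree
    chapter = soup.select_one('.chapter')
    return chapter.get_text() if chapter is not None else ""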
def getMore(html):
    soup = BeautifulSoup(html, "html.parser")
    a = soup.select('#btnNext')  # the "next" button
    b = soup.select('#p-hed')    # the ad block
    if a and "下一页" in a[0].get_text():  # only follow "next page", never "next chapter"
        for i in a:
            url = i.attrs['href']  # attrs holds the tag's attributes
        chapter_url = "http://m.5k5m.com" + url
        chapter_html = urllib.request.urlopen(chapter_url).read()
        chapter_html = chapter_html.decode("utf-8")
        chapter_reg = r'<div class="chapter">(.*?)</div>'
        chapter_reg = re.compile(chapter_reg, re.S)
        chapter_content = re.findall(chapter_reg, chapter_html)
        for content in chapter_content:  # clean this page's content
            res = Replace(content, str(b[0]))
        return res + getMore(chapter_html)  # recurse until the "next page" button disappears
    else:
        return " "
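
The recursion works, but every extra page adds another nested call, so a long chapter nests deeply. The same pagination can be written as a plain loop; a sketch under the same assumptions, where Replace() is the cleanup helper from the complete code below:

import re
import urllib.request
from bs4 import BeautifulSoup

def get_chapter_text(first_url):
    # follow the "下一页" button iteratively, accumulating cleaned page text
    parts = []
    url = first_url
    while url:
        html = urllib.request.urlopen(url).read().decode("utf-8")
        soup = BeautifulSoup(html, "html.parser")
        ad = soup.select('#p-hed')
        for content in re.findall(r'<div class="chapter">(.*?)</div>', html, re.S):
            parts.append(Replace(content, str(ad[0])))
        nxt = soup.select('#btnNext')
        if nxt and "下一页" in nxt[0].get_text():
            url = "http://m.5k5m.com" + nxt[0].attrs['href']
        else:
            url = None
    return " ".join(parts)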

Complete code

import urllib.request
import multiprocessing
from bs4 import BeautifulSoup
import re



# Strip the ad markup, then turn the <p> tags into spaces
def Replace(content, junk):
    content = content.replace(junk, " ")
    content = content.replace("</p>\n<p> ", "")
    content = content.replace("</p>", " ")
    content = content.replace("<p>", " ")
    return content


def getMore(html):
    soup = BeautifulSoup(html, "html.parser")
    a = soup.select('#btnNext')  # the "next" button
    b = soup.select('#p-hed')    # the ad block
    if a and "下一页" in a[0].get_text():  # only follow "next page", never "next chapter"
        for i in a:
            url = i.attrs['href']  # attrs holds the tag's attributes
        chapter_url = "http://m.5k5m.com" + url
        chapter_html = urllib.request.urlopen(chapter_url).read()  # next page's HTML
        chapter_html = chapter_html.decode("utf-8")
        chapter_reg = r'<div class="chapter">(.*?)</div>'
        chapter_reg = re.compile(chapter_reg, re.S)
        chapter_content = re.findall(chapter_reg, chapter_html)
        for content in chapter_content:  # clean this page's content
            res = Replace(content, str(b[0]))
        return res + getMore(chapter_html)  # recurse until the "next page" button disappears
    else:
        return " "

def thread_getBook(chapter_url, chapter_title):
    chapter_html = urllib.request.urlopen(chapter_url).read()  # raw HTML of the chapter page
    chapter_html = chapter_html.decode("utf-8")  # chapter pages are UTF-8, unlike the GBK index
    chapter_reg = r'<div class="chapter">(.*?)</div>'
    chapter_reg = re.compile(chapter_reg, re.S)  # re.S lets . match newlines
    chapter_content = re.findall(chapter_reg, chapter_html)

    soup = BeautifulSoup(chapter_html, "html.parser")
    b = soup.select('#p-hed')  # the ad block, identified by its id

    file_name = str(chapter_title) + '.txt'
    print(file_name)
    with open(file_name, 'a') as f:
        for content in chapter_content:  # findall returns a one-element list, so unpack it
            a = Replace(content, str(b[0]))
        f.write(a + getMore(chapter_html))


def getNovelContent():
    html = urllib.request.urlopen("http://www.5k5m.com/book/0/426771/").read()
    html = html.decode("gbk")  # the index page is GBK-encoded
    reg = r'<dd><a href="(.*?)">(.*?)</a></dd>'  # capture href and chapter title
    reg = re.compile(reg)  # compiling is optional, but saves re-parsing the pattern
    urls = re.findall(reg, html)
    flag = 1
    while flag:
        if ' target=' in urls[0][0]:
            urls.pop(0)  # drop the duplicate "Latest Chapters" links at the top
        else:
            flag = 0
    charts_url = []
    for url in urls:
        chapter_title = url[1]  # chapter title (unused below; see the note after the code)
        chapter_url = "http://www.5k5m.com" + url[0]  # absolute URL of the chapter
        charts_url.append(chapter_url)
    p = multiprocessing.Pool()
    num = 1
    for i in charts_url:
        # num is passed as the title, so chapters are saved as 1.txt, 2.txt, ...
        p.apply_async(thread_getBook, args=(i, num))
        num = num + 1
    print('Waiting for all chapters to be fetched......')
    p.close()
    p.join()
    print('Generation finished ................')
    return


if __name__=='__main__':
    getNovelContent()
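
One loose end: chapter_title is extracted from the link text but never used, so the files come out as 1.txt, 2.txt, and so on. A sketch of the submission loop inside getNovelContent that uses the real titles instead (the sanitizing step is my addition, since titles can contain characters that are invalid in filenames):

    num = 1
    for href, title in urls:
        safe_title = re.sub(r'[\\/:*?"<>|]', '_', title)  # strip characters filesystems reject
        p.apply_async(thread_getBook, args=("http://www.5k5m.com" + href, '%03d_%s' % (num, safe_title)))
        num = num + 1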


I'm still a beginner and there is plenty of room for improvement; I hope to keep making progress.
