# coding=utf-8
import threading
import time
from multiprocessing import Pool

import requests
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter

# Shared session with retries, so transient connection errors do not abort a chapter.
rs = requests.Session()
rs.mount('http://', HTTPAdapter(max_retries=30))
rs.mount('https://', HTTPAdapter(max_retries=30))

# Chapter texts collected by the worker threads, keyed by chapter index.
threads_content = {}


def extraction_chapter(chapter_id, chapter_url):
    """Fetch one chapter page and store its cleaned text in threads_content."""
    res = rs.get(chapter_url, timeout=(5, 7))
    soup = BeautifulSoup(res.text, 'lxml')

    title = soup.select('#txtbox > h1')
    content = soup.select('#content')
    title_str = str(title[0])
    content_str = str(content[0])

    # Strip the HTML tags and turn paragraphs into indented lines.
    title_re = title_str.replace('<h1>', '')
    title_re = title_re.replace('</h1>', '\n')
    content_re = content_str.replace('<div id="content">', '')
    content_re = content_re.replace('<p>', '\n\t')
    content_re = content_re.replace('</p>', '')
    content_re = content_re.replace('</div>', '')

    # Decorative separator appended after every chapter.
    make_sign = "\n\n\t_____(ฅ>ω<*ฅ)喵呜~~~_____\n\n"
    threads_content[chapter_id] = title_re + content_re + make_sign


def extraction(novel_url):
    """Fetch a novel's index page and download every chapter, one thread per chapter."""
    res = rs.get(novel_url, timeout=(3, 5))
    soup = BeautifulSoup(res.text, 'lxml')

    # Book title.
    novel_title = str(soup.select('#bookinfo-right>h1')[0])
    novel_title = novel_title.replace('<h1>', '')
    novel_title = novel_title.replace('</h1>', '')
    print(novel_title)

    # <a> elements of the chapter list.
    chapter_all = soup.select('#list>ul>li>a')

    # Create (or truncate) the output file.
    file_name = novel_title + '.txt'
    with open(file_name, 'w', encoding='utf-8') as f:
        f.write('')

    threads = []
    chapter_id = 0
    # Build each chapter URL from the href attribute and fetch it in its own thread.
    for chapter in chapter_all:
        chapter = str(chapter.attrs["href"])
        chapter_url = novel_url + chapter
        t = threading.Thread(target=extraction_chapter, args=(chapter_id, chapter_url))
        threads.append(t)
        t.start()
        chapter_id += 1

    # Wait until every chapter thread has finished.
    for t in threads:
        t.join()

    # Write the chapters in order; the dict was filled in thread-completion order.
    with open(file_name, 'a', encoding='utf-8') as f:
        for i in sorted(threads_content):
            f.write(threads_content[i])
    # Pool worker processes are reused, so clear the buffer before the next book.
    threads_content.clear()


def end_book(end_url):
    """Fetch one page of the 'completed novels' index and scrape every book on it."""
    res = rs.get(end_url, timeout=(3, 5))
    soup = BeautifulSoup(res.text, 'lxml')

    # <a> elements of the book covers.
    novel_name = soup.select('.bookimg>a')

    po = Pool()  # one worker process per CPU core by default
    for name in novel_name:
        novel_url = name.attrs["href"]
        # Hand each book off to the process pool.
        po.apply_async(extraction, (novel_url,))
    po.close()
    po.join()


def book(index_url, start, end):
    """Walk the index pages from start to end (inclusive) and scrape each one."""
    for num in range(start, end + 1):
        # Page 1 has no /index_N.html suffix.
        index = '' if num == 1 else '/index_' + str(num) + '.html'
        index_con = index_url + index
        end_book(index_con)
        print(index_con)


if __name__ == '__main__':
    url = "https://www.yite.cc/quanben"  # root URL of the 'completed novels' listing
    page_start = 95  # first index page
    page_end = 96    # last index page

    start_time = time.time()
    book(url, page_start, page_end)
    end_time = time.time()

    elapsed = end_time - start_time
    print('Elapsed time: ' + str(elapsed))