简单的爬取小说的脚本
"""Simple novel scraper.

Site scraped: Dingdian Xiaoshuo ("Dingdian Novels"), https://www.booktxt.net
This script is for learning purposes only.

It downloads the index page of one novel, collects the chapter links found
inside the element with id="list", then fetches the first ``m`` chapters and
writes each chapter's <h1> title and id="content" text to
``<book_name>.txt`` in the current directory.
"""
import random
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

book_name = '5_5626'  # numeric id of the novel on the site
book_url = 'https://www.booktxt.net' + '/' + book_name + '/'  # index page of the novel
m = 5  # test limit: number of chapters to fetch (None fetches every chapter)
REQUEST_TIMEOUT = 10  # seconds; without a timeout requests.get() can block forever

response = requests.get(url=book_url, timeout=REQUEST_TIMEOUT)
response.raise_for_status()  # fail clearly on an HTTP error instead of parsing an error page
response.encoding = response.apparent_encoding  # decode with the detected charset
soup = BeautifulSoup(response.text, features='html.parser')

a = soup.find(id='list')
if a is None:
    raise RuntimeError('no element with id="list" found at %s' % book_url)
dd_all = a.find_all('dd')

http_all = []
for dd in dd_all:
    link = dd.find('a')
    href = link.attrs.get('href') if link is not None else None
    if href:
        # urljoin gives the same result as plain concatenation for bare
        # relative hrefs, and also handles root-relative / absolute hrefs.
        http_all.append(urljoin(book_url, href))

# Skip the leading entries of the list before the real chapter list starts.
# NOTE(review): the original comment mentioned 7 entries while the code
# skips 8 -- confirm the correct offset against the live page.
http_all = http_all[8:]

chapters = http_all[:m]

# Explicit UTF-8 so writing Chinese text does not depend on the platform's
# default locale encoding.
with open(book_name + '.txt', 'w', encoding='utf-8') as f:
    for n, chapter_url in enumerate(chapters, start=1):
        h = requests.get(url=chapter_url, timeout=REQUEST_TIMEOUT)
        h.raise_for_status()
        h.encoding = h.apparent_encoding
        hb = BeautifulSoup(h.text, features='html.parser')

        tar_t = hb.find(id='content')
        heading = hb.find('h1')
        if tar_t is None or heading is None:
            raise RuntimeError(
                'unexpected page layout (missing <h1> or id="content") at %s'
                % chapter_url
            )

        f.write(heading.text + '\n')
        # Iterate the direct children of the content element; <br/> tags
        # are dropped, every other child is written on its own line.
        for node in tar_t:
            text = str(node)
            if text != '<br/>':
                f.write(text.lstrip() + '\n')
        f.write('\n\n')
        print('第%d章写入完成!' % n)

        # Pause between requests to reduce the risk of an IP ban; there is
        # nothing left to fetch after the last chapter, so skip it there.
        if n < len(chapters):
            time.sleep(random.randint(3, 6))
来源:https://www.cnblogs.com/MMTTBD/p/10514261.html