The source code in this article comes from https://github.com/Holit/Web-Crawler-Framwork
I. The crawler framework code
import urllib.request
from bs4 import BeautifulSoup
import re
import time
import _thread

# Input your URL here ####################################
BaseURL = '127.0.0.1/'
#########################################################
TaxURL = ".html"

# Input your data-saving path ############################
SavePath = ""
#########################################################

# Input your thread count ################################
thread_count = 1
#########################################################

# Set how many pages each spider thread will crawl #######
thread_spy_count_ench = 5
#########################################################

def mkdir(path):
    # Create the directory if it does not already exist
    import os
    path = path.strip()
    path = path.rstrip("\\")
    if not os.path.exists(path):
        os.makedirs(path)
        return True
    else:
        return False

def download(start, count):
    # Spider main loop: crawl pages [start, start + count)
    for i in range(start, start + count):
        try:
            # DEBUG #################################################
            #print("[INFO] Connecting to page #" + str(i) + "...")
            #########################################################

            # Used to record elapsed time
            time_start = time.time()

            # Construct the URL.
            # This only works for URL schemes like
            #   https://127.0.0.1/articles/00001.html
            #   https://127.0.0.1/articles/00002.html
            #   https://127.0.0.1/articles/00003.html
            TargetURL = BaseURL + str(i) + TaxURL

            # Create the Request object
            req = urllib.request.Request(TargetURL)
            # Add general headers; suitable values can be captured with Fiddler(R) or Chrome(R) DevTools
            req.add_header('Host', '')            # Your Host, usually the base URL
            req.add_header('Referer', TargetURL)  # Your Referer, usually the target URL
            req.add_header('User-Agent', 'Mozilla/5.0 (Linux; Android 4.1.1; Nexus 7 Build/JRO03D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Safari/535.19')
            # Finished creating the Request object

            # Fetch the page
            res = urllib.request.urlopen(req)
            # Parse the HTML
            soup = BeautifulSoup(res, "html.parser")
            ##############################################################
            # Add your functions here...
            #
            # operate_data(data)
            #
            # Use soup.find(...) to pull the information you need out of
            # the parsed document; BeautifulSoup makes tag handling easy.
            ##############################################################

            # Change the saving path here
            savetarget = SavePath

            # Try to save the file
            try:
                # Create the directory if it doesn't exist
                mkdir(SavePath)
                # Open the target file for writing
                f = open(savetarget, 'w')

                # Edit this: write the extracted data
                f.write("data")
                f.close()

            except Exception as e:
                time_end = time.time()
                print(" [Failed] - #" + str(i) + " Error : " + str(e))
            else:
                time_end = time.time()
                print(" [Succeed] - #" + str(i) + " has been saved to path. (" + str(time_end - time_start) + "s)")

        except Exception as e:
            print(" [Global Failure] - #" + str(i) + " Error : " + str(e))


# if __name__ == '__main__':
try:
    # Multithreading
    print("Spidering website...")
    print("Current configuration :")
    print("--Will create " + str(thread_count) + " threads to access.")
    print("--Will save to " + SavePath)
    print("-------------START---------------------------")
    # Press any key to continue
    # (this won't work under Linux)
    import os
    os.system('pause')
    try:
        for i in range(0, thread_count):
            print("[Thread #" + str(i) + "] started successfully")
            _thread.start_new_thread(download, (thread_spy_count_ench * i, thread_spy_count_ench))
    except Exception as e:
        print("[Threading@" + str(i) + "] Error: " + str(e))
except Exception as e:
    print("[Global Failure] Error: " + str(e))

# Keep the main process alive so the worker threads can run
while 1:
    pass
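One detail worth noting: TargetURL = BaseURL + str(i) + TaxURL produces names such as 1.html, not the zero-padded 00001.html shown in the comment above. If the target site actually uses fixed-width page numbers (an assumption about the site, not something the framework checks), the index can be padded, for example:

# Sketch of a zero-padded variant of the URL construction above;
# zfill(5) left-pads the index to five digits (1 -> "00001").
TargetURL = BaseURL + str(i).zfill(5) + TaxURL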
II. Concrete implementations of the framework's features
1. Text extraction
Text extraction here means retrieving the content inside the page's <div class='content'>...</div>; this structure is assumed as a precondition and must be adapted if your pages differ.
(1) Approach
Parsing the HTML with BeautifulSoup yields a decoded document, for example
<div class="content" style="text-align: left">
Sample content
</div>
This block is then selected with soup.find:
(2) Basic code
passages_div = soup.find('div')
passages_set = passages_div.findAll(attrs={"class": "content"})
for passages in passages_set:
    article = str(passages)
    # Text cleanup
    article = article.replace('<div class="content" style="text-align: left">', '')
    article = article.replace(u'\ue505', u' ')  # Replace private-use Unicode spaces; GBK cannot encode them otherwise
    article = article.replace(u'\ue4c6', u' ')
    article = article.replace(u'\xa0', u' ')
    article = article.replace('<br/>', '\n')
    article = article.replace('</div>', '')
    savetarget = 'D:\\test\\test.txt'
    try:
        mkdir('D:\\test\\')
        f = open(savetarget, 'w')
        f.write(article)
        f.close()
    except Exception as e:
        print(" [Failed] - " + str(e))
    else:
        time_end = time.time()
        print(" [Succeed] - saved to path.")
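The three replace calls for \ue505, \ue4c6 and \xa0 are needed because open(savetarget, 'w') uses the platform default encoding (GBK on a Chinese-locale Windows system), which cannot encode those characters. A simpler alternative, assuming whatever reads the file later accepts UTF-8, is to pass the encoding explicitly:

# Sketch: write the article as UTF-8 so no character substitution is needed.
# The path is the same hypothetical D:\test location used above.
with open('D:\\test\\test.txt', 'w', encoding='utf-8') as f:
    f.write(article)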
2. Image retrieval
Image retrieval generally means downloading whatever the src attribute of an <img> tag on the page points to, for example <img src="127.0.0.1/png.png">.
Several approaches can be used for the download itself, such as urllib.request.urlretrieve, which is not covered in detail here.
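As a rough sketch of that approach (the image directory is hypothetical, and soup and TargetURL are assumed to come from the framework's download() loop), the src attributes can be collected from the parsed page and handed to urllib.request.urlretrieve:

import os
import urllib.request
from urllib.parse import urljoin

save_dir = 'D:\\test\\images'  # hypothetical save directory
if not os.path.exists(save_dir):
    os.makedirs(save_dir)

# soup and TargetURL come from the framework's download() loop
for img in soup.findAll('img'):
    src = img.get('src')
    if not src:
        continue
    # Resolve relative src values against the page URL, then download
    full_url = urljoin(TargetURL, src)
    filename = os.path.join(save_dir, os.path.basename(src))
    urllib.request.urlretrieve(full_url, filename)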
Source: oschina
Link: https://my.oschina.net/u/4368331/blog/3459732