The source code in this article comes from https://github.com/Holit/Web-Crawler-Framwork
I. The crawler framework code
import urllib.request
from bs4 import BeautifulSoup
import re
import time
import _thread

# Input your URL here ####################################
BaseURL = '127.0.0.1/'
#########################################################
TaxURL = ".html"

# Input your data-saving path ############################
SavePath = ""
#########################################################

# Input your thread count ################################
thread_count = 1
#########################################################

# Set how many pages each spider thread will crawl #######
thread_spy_count_ench = 5
#########################################################

def mkdir(path):
    # Create the directory if it does not already exist
    import os
    path = path.strip()
    path = path.rstrip("\\")
    if not os.path.exists(path):
        os.makedirs(path)
        return True
    else:
        return False

def download(start, count):
    # Spider main loop: crawl pages [start, start + count)
    for i in range(start, start + count):
        try:
            # DEBUG #################################################
            #print("[INFO] Connecting to page #" + str(i) + "...")
            #########################################################

            # Used to record elapsed time
            time_start = time.time()

            # Construct the URL.
            # This only works for URL schemes like
            #   https://127.0.0.1/articles/00001.html
            #   https://127.0.0.1/articles/00002.html
            #   https://127.0.0.1/articles/00003.html
            TargetURL = BaseURL + str(i) + TaxURL

            # Create the Request object
            req = urllib.request.Request(TargetURL)
            # Add general headers; suitable values can be captured with Fiddler(R) or Chrome(R) DevTools
            req.add_header('Host', '')            # Your Host, usually the base URL
            req.add_header('Referer', TargetURL)  # Your Referer, usually the target URL
            req.add_header('User-Agent', 'Mozilla/5.0 (Linux; Android 4.1.1; Nexus 7 Build/JRO03D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Safari/535.19')
            # Finished creating the Request object

            # Fetch the page
            res = urllib.request.urlopen(req)
            # Parse the HTML
            soup = BeautifulSoup(res, "html.parser")
            ##############################################################
            # Add your functions here...
            #
            # operate_data(data)
            #
            # Use soup.find(...) to pull the information you need out of
            # the parsed document; BeautifulSoup makes tag handling easy.
            ##############################################################

            # Change the saving path here
            savetarget = SavePath

            # Try to save the file
            try:
                # Create the directory if it doesn't exist
                mkdir(SavePath)
                # Open the target file for writing
                f = open(savetarget, 'w')

                # Edit this: write the extracted data
                f.write("data")
                f.close()

            except Exception as e:
                time_end = time.time()
                print(" [Failed] - #" + str(i) + " Error : " + str(e))
            else:
                time_end = time.time()
                print(" [Succeed] - #" + str(i) + " has been saved to path. (" + str(time_end - time_start) + "s)")

        except Exception as e:
            print(" [Global Failure] - #" + str(i) + " Error : " + str(e))


# if __name__ == '__main__':
try:
    # Multithreading
    print("Spidering website...")
    print("Current configuration :")
    print("--Will create " + str(thread_count) + " threads to access.")
    print("--Will save to " + SavePath)
    print("-------------START---------------------------")
    # Press any key to continue
    # (this won't work under Linux)
    import os
    os.system('pause')
    try:
        for i in range(0, thread_count):
            print("[Thread #" + str(i) + "] started successfully")
            _thread.start_new_thread(download, (thread_spy_count_ench * i, thread_spy_count_ench))
    except Exception as e:
        print("[Threading@" + str(i) + "] Error: " + str(e))
except Exception as e:
    print("[Global Failure] Error: " + str(e))

# Keep the main process alive so the worker threads can run
while 1:
    pass
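One detail worth noting: TargetURL = BaseURL + str(i) + TaxURL produces names such as 1.html, not the zero-padded 00001.html shown in the comment above. If the target site actually uses fixed-width page numbers (an assumption about the site, not something the framework checks), the index can be padded, for example:

# Sketch of a zero-padded variant of the URL construction above;
# zfill(5) left-pads the index to five digits (1 -> "00001").
TargetURL = BaseURL + str(i).zfill(5) + TaxURL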
II. Concrete implementations of the framework's features
1. Text extraction
Text extraction here means retrieving the content inside the page's <div class='content'>...</div>; this structure is assumed as a precondition and must be adapted if your pages differ.
(1) Approach
Parsing the HTML with BeautifulSoup yields a decoded document, for example
<div class="content" style="text-align: left">
Sample content
</div>
This block is then selected with soup.find:
(2) Basic code
passages_div = soup.find('div')
passages_set = passages_div.findAll(attrs={"class": "content"})
for passages in passages_set:
    article = str(passages)
    # Text cleanup
    article = article.replace('<div class="content" style="text-align: left">', '')
    article = article.replace(u'\ue505', u' ')  # Replace private-use Unicode spaces; GBK cannot encode them otherwise
    article = article.replace(u'\ue4c6', u' ')
    article = article.replace(u'\xa0', u' ')
    article = article.replace('<br/>', '\n')
    article = article.replace('</div>', '')
    savetarget = 'D:\\test\\test.txt'
    try:
        mkdir('D:\\test\\')
        f = open(savetarget, 'w')
        f.write(article)
        f.close()
    except Exception as e:
        print(" [Failed] - " + str(e))
    else:
        time_end = time.time()
        print(" [Succeed] - saved to path.")
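The three replace calls for \ue505, \ue4c6 and \xa0 are needed because open(savetarget, 'w') uses the platform default encoding (GBK on a Chinese-locale Windows system), which cannot encode those characters. A simpler alternative, assuming whatever reads the file later accepts UTF-8, is to pass the encoding explicitly:

# Sketch: write the article as UTF-8 so no character substitution is needed.
# The path is the same hypothetical D:\test location used above.
with open('D:\\test\\test.txt', 'w', encoding='utf-8') as f:
    f.write(article)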
2. Image retrieval
Image retrieval generally means downloading whatever the src attribute of an <img> tag on the page points to, for example <img src="127.0.0.1/png.png">.
Several approaches can be used for the download itself, such as urllib.request.urlretrieve, which is not covered in detail here.
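As a rough sketch of that approach (the image directory is hypothetical, and soup and TargetURL are assumed to come from the framework's download() loop), the src attributes can be collected from the parsed page and handed to urllib.request.urlretrieve:

import os
import urllib.request
from urllib.parse import urljoin

save_dir = 'D:\\test\\images'  # hypothetical save directory
if not os.path.exists(save_dir):
    os.makedirs(save_dir)

# soup and TargetURL come from the framework's download() loop
for img in soup.findAll('img'):
    src = img.get('src')
    if not src:
        continue
    # Resolve relative src values against the page URL, then download
    full_url = urljoin(TargetURL, src)
    filename = os.path.join(save_dir, os.path.basename(src))
    urllib.request.urlretrieve(full_url, filename)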
Source: oschina
Link: https://my.oschina.net/u/4368331/blog/3459732