项目1:实现豆瓣电影TOP250标题爬取:
import re
from urllib.request import urlopen


class doubanSpider():
    """Collect movie titles from the Douban Top 250 listing, one page at a time.

    Typical use is ``doubanSpider().claw()``, which downloads every page,
    accumulates the titles in ``self.datas`` and prints them as a ranked list.
    """

    # Number of entries per listing page (the second page starts at start=25).
    PAGE_SIZE = 25
    # Number of listing pages to visit.
    MAX_PAGES = 10

    def __init__(self):
        """Initialise the page counter, the current URL and the result list."""
        # Zero-based index of the page that will be downloaded next.
        self.page = 0
        # URL of the first page; the second page uses "start=25", and so on.
        self.cur_url = "http://movie.douban.com/top250?start=0&filter=&type="
        # Titles collected so far, in the order they were encountered.
        self.datas = []

    def claw(self):
        """Download all pages in order, then print the collected titles."""
        while self.page < self.MAX_PAGES:
            self.downloadURL()
            self.updateURL()
        self.output()

    def updateURL(self):
        """Advance to the next page and point ``self.cur_url`` at it.

        The ``start=<n>`` query parameter is rewritten to
        ``page * PAGE_SIZE``. The result is assigned back because strings
        are immutable, and the pattern matches any current offset (not only
        ``start=0``) so every call moves the URL forward.
        """
        self.page += 1
        offset = self.page * self.PAGE_SIZE
        self.cur_url = re.sub(
            r"start=\d+", "start={}".format(offset), self.cur_url, count=1
        )

    def downloadURL(self):
        """Fetch ``self.cur_url`` and append its movie titles to ``self.datas``.

        Raises:
            urllib.error.URLError: propagated from ``urlopen`` when the page
                cannot be retrieved.
        """
        # Imported here so the URL/formatting logic of this class can be
        # loaded without the third-party bs4 package being installed.
        from bs4 import BeautifulSoup

        # NOTE(review): some sites reject urllib's default User-Agent; if this
        # request fails with an HTTP error, send a Request with explicit headers.
        # The context manager guarantees the HTTP response is closed.
        with urlopen(self.cur_url) as response:
            bsObj = BeautifulSoup(response, "html.parser")

        for span in bsObj.find_all("span", {"class": "title"}):
            text = span.get_text()
            # Spans whose text contains NBSP "/" NBSP are skipped -- presumably
            # these hold a movie's alternative title; verify against the page.
            if "\xa0/\xa0" not in text:
                self.datas.append(text)

    def output(self):
        """Print every collected title prefixed with its 1-based rank."""
        for num, data in enumerate(self.datas, start=1):
            print("TOP" + str(num) + ": " + data)


if __name__ == "__main__":
    print("豆瓣电影TOP250:python抓取")
    myspider = doubanSpider()
    myspider.claw()
来源:https://www.cnblogs.com/chenjz1993/p/7252057.html