# Scrape the Douban Books home page with requests + regular expressions.

import re

import requests

# Browser-like User-Agent sent with every request.
_HEADERS = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.75 Safari/537.36 LBBROWSER'}

# One match per <li> book entry; the four groups capture, in order, the
# href value, the title attribute, the author span text and the year span
# text.  re.S lets '.' cross newlines because each entry spans many lines.
# Compiled once at import time instead of on every get_books() call.
_BOOK_PATTERN = re.compile(
    r'<li.*?cover.*?href="(.*?)".*?title="(.*?)"'
    r'.*?more-meta.*?author">(.*?)</span>'
    r'.*?year">(.*?)</span>.*?</li>',
    re.S,
)


def get_html(url, timeout=10):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the script forever.

    Returns:
        The decoded response body (``response.text``).

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status (4xx/5xx).
    """
    response = requests.get(url, headers=_HEADERS, timeout=timeout)
    # Fail loudly on an error page instead of handing it to the parser,
    # where it would silently produce no matches.
    response.raise_for_status()
    return response.text


def _parse_books(html):
    """Return a list of (link, title, author, year) tuples found in *html*."""
    books = []
    for link, title, author, year in _BOOK_PATTERN.findall(html):
        # re.sub removes every whitespace character (including newlines)
        # from the title; strip() only trims the ends of author and year.
        title = re.sub(r'\s', '', title)
        books.append((link, title, author.strip(), year.strip()))
    return books


def get_books(url):
    """Fetch *url* and print one line per book: link, title, author, year."""
    html = get_html(url)
    for link, title, author, year in _parse_books(html):
        print(link, title, author, year)


if __name__ == '__main__':
    url = 'https://book.douban.com/'
    get_books(url)
# Source: https://www.cnblogs.com/themost/p/6851736.html