前言
爬取内容和策略
这里先从简单开始,先爬取某个大V的微博。举例:爬取微博名为"思想聚焦"(微博ID: 1742566624),微博内容都是心灵鸡汤,还是挺受用的~
- 基于模拟登录来保存cookie,抓取指定web内容。 未登录新浪微博的情况下,是可以通过网址查看一个用户的首页,但是不能进一步查看该用户的关注和粉丝等信息,如果点击关注和粉丝,就会重定向回到登录页面。因此通过selenium模拟登录保存cookie,在请求抓取其他页面时在headers中加入cookie信息,就可以抓取了。
这里需要通过selenium webdriver启动浏览器Firefox,因此系统需要安装Firefox,另外需要下载驱动geckodriver,下载地址为: https://github.com/mozilla/geckodriver/releases/
import os
import time
import pickle

from tqdm import tqdm
from selenium import webdriver

from web_crawl.weibo_crawl.settings import config


def count_time():
    """Block for about 20 seconds (40 ticks of 0.5 s) while showing a progress bar.

    Used to give the login page time to load and render before any element
    lookup is attempted.
    """
    for _ in tqdm(range(40)):
        time.sleep(0.5)


# The backslashes of the Windows path were missing in the original string, which
# made it a drive-relative file name; a raw string keeps them literal.
# NOTE(review): confirm this is where geckodriver.exe is actually installed.
driver = webdriver.Firefox(executable_path=r"C:\driver\geckodriver.exe")
driver.set_window_size(1640, 688)
driver.get(config.LOGIN_URL)

# Wait for the page to render before looking up the form elements.
count_time()

# Fill in the login form and submit it.
driver.find_element_by_xpath('//input[@id="loginName"]').send_keys(config.ACCOUNT_ID)
driver.find_element_by_xpath('//input[@id="loginPassword"]').send_keys(config.ACCOUNT_PASSWORD)
# Only the account id is echoed; the password is deliberately never printed.
print('account id: {}'.format(config.ACCOUNT_ID))
driver.find_element_by_xpath('//a[@id="loginAction"]').click()


def save_cookile():
    """Read the browser session's cookies and pickle them to ``config.COOKIE_SAVE_PATH``.

    The cookies are flattened into a single ``name=value;`` string. The string
    is saved only when it contains ``SSOLoginState``; it is stored in a dict
    keyed by ``config.ACCOUNT_ID``. Any exception is printed, not raised.
    """
    try:
        cookie_list = driver.get_cookies()
        print(cookie_list)
        cookie_string = ''.join(
            cookie['name'] + '=' + cookie['value'] + ';'
            for cookie in cookie_list
            if 'name' in cookie and 'value' in cookie
        )
        print(cookie_string)

        if 'SSOLoginState' not in cookie_string:
            # Previously this case was silent, leaving no hint that nothing was saved.
            print("未获取到登录cookie(缺少SSOLoginState),未保存。")
            return

        print("成功获取cookie! {}".format(cookie_string))
        if os.path.exists(config.COOKIE_SAVE_PATH):
            os.remove(config.COOKIE_SAVE_PATH)
        cookie_dict = {config.ACCOUNT_ID: cookie_string}
        with open(config.COOKIE_SAVE_PATH, "wb") as f:
            pickle.dump(cookie_dict, f)
        print("成功保存cookie到文件{} ".format(config.COOKIE_SAVE_PATH))
    except Exception as e:
        print(e)


if __name__ == '__main__':
    try:
        save_cookile()
    finally:
        # Close the browser so the Firefox/geckodriver processes are not leaked.
        driver.quit()
- 确定爬取内容
- 1)爬取微博的基本信息,获取微博名、关注数、总微博数、粉丝数以及页数
- 注意爬取xpath路径: 例如爬取微博名: //div/table//div/span[@class='ctt']/text()
# Fetch the basic profile information of the account being scraped.
def get_weibo_baisc_info(self):
    """Scrape the first profile page and store the basic account figures on ``self``.

    Sets ``self.user_name``, ``self.weibo_num`` (total posts), ``self.gz_num``
    (following), ``self.fs_num`` (followers) and ``self.page_num``.

    Returns:
        The tuple ``(user_name, weibo_num, gz_num, fs_num, page_num)``, or
        ``None`` when parsing fails (the exception is logged, not raised).
    """
    crawl_url = 'http://weibo.cn/%s?filter=%s&page=1' % (self.scrap_id, self.filter_flag)
    print("抓取的页面是: {}".format(crawl_url))
    html = requests.get(crawl_url, cookies=self.cookie, headers=self.headers).content

    print(" " + "-" * 30)
    print("准备获取微博内容:")
    selector = etree.HTML(html)
    try:
        # Account display name.
        self.user_name = selector.xpath("//div/table//div/span[@class='ctt']/text()")[0]

        # Total number of posts.
        weibo_num = selector.xpath("//div/span[@class='tc']/text()")[0]
        self.weibo_num = pattern.findall(weibo_num)[0]

        # The first two links of the 'tip2' block carry the following and
        # follower counts, in that order.
        tip_links = selector.xpath("//div[@class='tip2']/a/text()")
        self.gz_num = pattern.findall(tip_links[0])[0]
        self.fs_num = pattern.findall(tip_links[1])[0]

        print('当前新浪微博用户{}已经发布的微博数为{}, 他目前关注{}了微博用户, 粉丝数有 {}'.format(self.user_name, self.weibo_num, self.gz_num, self.fs_num))

        # xpath() returns a list (never None): an empty list means the page has
        # no pager, i.e. the account has a single page of posts.
        page_inputs = selector.xpath("//*[@id='pagelist']/form/div/input[1]")
        if not page_inputs:
            self.page_num = 1
        else:
            # The attribute value is a string, so convert it explicitly.
            self.page_num = int(page_inputs[0].attrib["value"])
        print("总共的微博页数: ", self.page_num)
        return self.user_name, self.weibo_num, self.gz_num, self.fs_num, self.page_num
    except Exception as e:
        logging.error(e)
- 爬取微博内容
- 注意到爬取微博内容,会遇到"全文"展开问题,做法是先去检测每条微博是否包含"全文",如果有,则对应抓取链接信息,然后基于链接,再抓取全文内容。 注意在遇到“全文”展开爬取xpath路径: 例如如下微博内容:
- 对应的DOM为:
- 做法:
- 1) 获取包含“全文”的xpath路径,返回list
- "div/span[@class='ctt']/a/text()"
- 2) 判断是否包含“全文”,若包含,则提取全文对应的链接,然后再通过链接去解析获取微博内容。
if "全文" in quanwen_string: index = quanwen_string.index("全文") # print(index) quanwen_url = content[i].xpath("div/span[@class='ctt']/a[%d]/@href" % (index+1))[0] # print(quanwen_url) quanwen_url = "https://weibo.cn" + quanwen_url # print(quanwen_url) html_quanwen = requests.get(url=quanwen_url, cookies=self.cookie, headers=self.headers).content selector_quanwen = etree.HTML(html_quanwen) weibo_text = selector_quanwen.xpath("//div/div/span[@class='ctt']")[0] # weibo_text = weibo_text.xpath("text()")[0] weibo_text = "".join(weibo_text.xpath("text()"))
具体的爬取微博内容的整体代码如下:
def get_weibo_content(self):
    """Scrape the text and like/forward/comment counts of every post, page by page.

    Reads ``self.page_num`` for the number of pages. For each post it appends
    to ``self.weibo_content``, ``self.weibo_num_zan_list``,
    ``self.weibo_num_forward_list``, ``self.weibo_num_comment_list`` and
    ``self.weibo_comment_detail_urls``, and increments ``self.weibo_scraped``.
    Progress is handed to ``self.save_weibo_content`` after each page and
    again from the error handlers. ``self.weibo_content`` is deleted once the
    run finishes.

    Returns:
        None. Errors raised while scraping are logged and printed, not raised.
    """
    total_page_num = self.page_num
    # Bound before anything can fail so every handler below can use them.
    start_page = 0
    page_num = total_page_num
    page = 0
    try:
        # If an earlier run left a checkpoint, see whether it covers this account.
        if os.path.exists(self.weibo_content_save_file):
            # NOTE: pickle is only safe on files this crawler wrote itself.
            with open(self.weibo_content_save_file, "rb") as f:
                content_dict = pickle.load(f)
            if self.scrap_id in content_dict:
                start_page = content_dict[self.scrap_id]['last_scrap_page']
                if total_page_num >= start_page:
                    print("之前已经抓取过,现在开始增量抓取。。。")
                    # NOTE(review): the original reset start_page to 0 *before*
                    # computing total_page_num - start_page, so every page is
                    # re-scraped. That behavior is kept here; scraping only the
                    # newest (total - last) pages requires knowing how
                    # save_weibo_content merges results -- confirm before changing.
                    start_page = 0

        # Start scraping.
        try:
            for page in range(start_page + 1, page_num + 1):
                url = 'http://weibo.cn/%s?filter=%s&page=%s' % (str(self.scrap_id), str(self.filter_flag), str(page))
                html_other = requests.get(url=url, cookies=self.cookie, headers=self.headers).content
                selector_other = etree.HTML(html_other)
                content = selector_other.xpath("//div[@class='c']")
                print("***************************************************")
                print("当前解析的是第{}页,总共{}页".format(page, page_num))

                # Pause every 5 pages to reduce the risk of being blocked.
                if page % 5 == 0:
                    print("等待{}s,以免微博被禁!".format(self.rest_time))
                    time.sleep(self.rest_time)

                # The last two matched nodes are not posts, so they are skipped.
                if len(content) > 3:
                    for item in content[:-2]:
                        self.weibo_scraped += 1

                        # The node id carries the post id used by the comment URL.
                        detail = item.xpath("@id")[0]
                        comment_url = 'http://weibo.cn/comment/{}?uid={}&rl=0'.format(detail.split('_')[-1], self.scrap_id)
                        self.weibo_comment_detail_urls.append(comment_url)

                        # Link texts end with e.g. ['赞[15]', '转发[4]', '评论[8]', '收藏'].
                        action_texts = item.xpath('div/a/text()')
                        self.weibo_num_zan_list.append(pattern.findall(action_texts[-4])[0])
                        self.weibo_num_forward_list.append(pattern.findall(action_texts[-3])[0])
                        self.weibo_num_comment_list.append(pattern.findall(action_texts[-2])[0])

                        # A "全文" link means the post is truncated on this page.
                        link_texts = item.xpath("div/span[@class='ctt']/a/text()")
                        if "全文" in link_texts:
                            # XPath positions are 1-based.
                            index = link_texts.index("全文")
                            quanwen_url = item.xpath("div/span[@class='ctt']/a[%d]/@href" % (index + 1))[0]
                            quanwen_url = "https://weibo.cn" + quanwen_url
                            html_quanwen = requests.get(url=quanwen_url, cookies=self.cookie, headers=self.headers).content
                            selector_quanwen = etree.HTML(html_quanwen)
                            text_node = selector_quanwen.xpath("//div/div/span[@class='ctt']")[0]
                            weibo_text = "".join(text_node.xpath("text()"))
                        else:
                            text_node = item.xpath("div/span[@class='ctt']")[0]
                            # string(.) yields the full text of the node.
                            weibo_text = text_node.xpath("string(.)")
                        self.weibo_content.append(weibo_text)
                        print("weibo_text: ", weibo_text)

                # Checkpoint after every page.
                self.save_weibo_content(self.user_name, self.weibo_content, page)
        except etree.XMLSyntaxError as e:
            print("*" * 20)
            print('=' * 20)
            print("微博用户{}的所有微博已经爬取!".format(self.user_name))
            self._print_scrape_summary()
            self.save_weibo_content(self.user_name, self.weibo_content, total_page_num)
        except Exception as e:
            logging.error(e)
            print(' ' * 2)
            print('=' * 20)
            print('微博用户 {} 出现内容抓取错误 {}.'.format(self.user_name, e))
            self._print_scrape_summary()
            print('现在尝试保存微博内容...')
            self.save_weibo_content(self.user_name, self.weibo_content, page)

        print(' ' * 2)
        print('=' * 20)
        self._print_scrape_summary()
        print('尝试保存微博内容...')
        del self.weibo_content
        if self.filter_flag == 0:
            print('共' + str(self.weibo_scraped) + '条微博')
        else:
            print('共' + str(self.weibo_num) + '条微博,其中' + str(self.weibo_scraped) + '条为原创微博')
    except IndexError as e:
        print('已经获取完微博信息, 当前微博用户{}还没有发布微博.'.format(self.scrap_id))
    except KeyboardInterrupt:
        print('手动中止... 现在保存微博内容!')
        # The interrupted page is incomplete, so record the one before it.
        self.save_weibo_content(self.user_name, self.weibo_content, max(page - 1, 0))


def _print_scrape_summary(self):
    """Print the number of collected posts and the like/forward/comment totals."""
    # The last figure is the comment total; the original message mislabelled
    # it as "收藏数" (favourites).
    # NOTE(review): the lists hold regex matches; if those are str rather than
    # numbers, np.sum will fail -- confirm against the definition of `pattern`.
    print("总共发了{}条微博,总的点赞数{},总的转发数{},总的评论数{}".format(
        len(self.weibo_content),
        np.sum(self.weibo_num_zan_list),
        np.sum(self.weibo_num_forward_list),
        np.sum(self.weibo_num_comment_list)))
爬取微博内容的结果如下:
- 爬取微博内容和评论
# Scrape post bodies together with their comments and store them in MongoDB.
def get_content_and_comment_to_db(self, limit=10):
    """Scrape every post with its comments and insert one document per post.

    For each post a dict with the keys ``content``, ``comment`` (a list of
    ``'<user>: text'`` strings), ``url`` and, when any comment was read,
    ``last_idx`` is passed to ``mongodb.insert``. Posts whose comment page has
    no pager are skipped. Previously scraped data is not taken into account.

    Args:
        limit: Upper bound applied to the comment-page range of each post.

    Returns:
        None. Any exception ends the crawl and is logged, not raised.
    """
    start_page = 0
    try:
        for page in range(start_page + 1, self.page_num + 1):
            url = 'http://weibo.cn/%s?filter=%s&page=%s' % (str(self.scrap_id), str(self.filter_flag), str(page))
            html_other = requests.get(url=url, cookies=self.cookie, headers=self.headers).content
            selector_other = etree.HTML(html_other)
            content = selector_other.xpath("//div[@class='c']")
            print("***************************************************")
            print("当前解析的是第{}页,总共{}页".format(page, self.page_num))

            # Pause every 5 pages to reduce the risk of being blocked.
            if page % 5 == 0:
                print("等待{}s,以免微博被禁!".format(self.rest_time))
                time.sleep(self.rest_time)

            # The last two matched nodes are not posts, so they are skipped.
            if len(content) <= 3:
                continue
            for item in content[:-2]:
                self.weibo_scraped += 1

                # The node id carries the post id used by the comment URL.
                detail = item.xpath("@id")[0]
                comment_url = 'http://weibo.cn/comment/{}?uid={}&rl=0'.format(detail.split('_')[-1], self.scrap_id)
                self.weibo_comment_detail_urls.append(comment_url)

                # The last four link texts are like / forward / comment / favourite.
                action_texts = item.xpath('div/a/text()')
                self.weibo_num_zan_list.append(pattern.findall(action_texts[-4])[0])
                self.weibo_num_forward_list.append(pattern.findall(action_texts[-3])[0])
                self.weibo_num_comment_list.append(pattern.findall(action_texts[-2])[0])

                # A "全文" link means the post is truncated on this page.
                link_texts = item.xpath("div/span[@class='ctt']/a/text()")
                if "全文" in link_texts:
                    # XPath positions are 1-based.
                    index = link_texts.index("全文")
                    quanwen_url = item.xpath("div/span[@class='ctt']/a[%d]/@href" % (index + 1))[0]
                    quanwen_url = "https://weibo.cn" + quanwen_url
                    html_quanwen = requests.get(url=quanwen_url, cookies=self.cookie, headers=self.headers).content
                    selector_quanwen = etree.HTML(html_quanwen)
                    text_node = selector_quanwen.xpath("//div/div/span[@class='ctt']")[0]
                    weibo_text = "".join(text_node.xpath("text()"))
                else:
                    text_node = item.xpath("div/span[@class='ctt']")[0]
                    # string(.) yields the full text of the node.
                    weibo_text = text_node.xpath("string(.)")
                self.weibo_content.append(weibo_text)

                # Fetch the comment data of this post.
                print("正在获取对应的评论数据。。。")
                print("开始从{}解析微博评论:".format(comment_url))
                html_detail = requests.get(comment_url, cookies=self.cookie, headers=self.headers).content
                selector_detail = etree.HTML(html_detail)

                # xpath() returns a list (never None); an empty list means the
                # comment page has no pager, so the post is skipped instead of
                # raising IndexError and aborting the whole crawl.
                page_values = selector_detail.xpath("//div[@id='pagelist']//div/input[1]/@value")
                if not page_values:
                    continue
                all_comment_pages = int(page_values[0])
                print(all_comment_pages)
                print('这是{}的微博:'.format(self.user_name))
                print('微博内容: {}'.format(weibo_text))
                print('接下来是下面的评论: ')

                content_and_comment_dict = {
                    "content": weibo_text,
                    "comment": [],
                    "url": comment_url,
                }

                # Cap the number of comment pages that are read.
                end_idx = min(all_comment_pages - 2, limit)
                # A separate name keeps the outer post-page counter untouched.
                for comment_page in range(1, end_idx):
                    print("当前解析的页面是{}, 总页面{}。".format(comment_page, end_idx))
                    # Pause for a random interval every 5 pages.
                    if comment_page % 5 == 0:
                        rest_time = np.random.randint(self.rest_min_time, self.rest_max_time)
                        time.sleep(rest_time)

                    # Reading starts at the second page; the first one is noisy.
                    detail_comment_url = comment_url + "&page=" + str(comment_page + 1)
                    print(detail_comment_url)

                    html_detail_page = requests.get(url=detail_comment_url, cookies=self.cookie, headers=self.headers).content
                    selector_comment_detail = etree.HTML(html_detail_page)
                    # starts-with matches nodes whose id begins with 'C_'.
                    comment_list = selector_comment_detail.xpath("//div[starts-with(@id, 'C_')]")

                    for comment in comment_list:
                        full_single_comment = self._format_comment(comment)
                        print(full_single_comment)
                        content_and_comment_dict['comment'].append(full_single_comment)
                        content_and_comment_dict['last_idx'] = comment_page

                mongodb.insert(content_and_comment_dict)
    except Exception as e:
        # %s placeholder added: without it logging fails to format the record.
        logging.error('在获取微博内容和评论的过程中抛出异常, error: %s', e)
        print(' ' * 2)
        print('=' * 20)


def _format_comment(self, comment):
    """Return one comment node rendered as ``'<user>: text'``."""
    single_comment_user_name = comment.xpath("a[1]/text()")[0]

    # count(*)=0 selects a first span without child elements (plain text only).
    plain_spans = comment.xpath('span[1][count(*)=0]')
    if plain_spans:
        single_comment_content = plain_spans[0].text or ''
    else:
        span_element = comment.xpath('span[1]')[0]
        mentioned = span_element.xpath('a/text()')
        if mentioned:
            at_user_name = '$' + mentioned[0].split('@')[-1] + '$'
            # Relative 'text()' is required: the original absolute '/text()'
            # is evaluated from the document root and matched nothing.
            parts = span_element.xpath('text()')
            parts.insert(1, at_user_name)
            single_comment_content = ' '.join(parts)
        else:
            # Child elements but no link: fall back to the span's full text.
            single_comment_content = span_element.xpath('string(.)')

    return '<' + single_comment_user_name + '>' + ': ' + single_comment_content
爬取微博内容和评论的结果如下:
准备获取微博内容:当前新浪微博用户思想聚焦已经发布的微博数为73166, 他目前关注1702了微博用户, 粉丝数有 25231899总共的微博页数: 7361***************************************************当前解析的是第1页,总共7361页正在获取对应的评论数据。。。开始从http://weibo.cn/comment/EwLwbivqE?uid=1742566624&rl=0解析微博评论:3891这是思想聚焦的微博:微博内容: 嗯 夏天结束了接下来是下面的评论:当前解析的页面是1, 总页面10。http://weibo.cn/comment/GxUMXmsCT?uid=1742566624&rl=0&page=2<南风喵喵_>: 就是说 春天快来了么[喵喵]<千诱于野>: 今晚月色真美<假如你也是一只猫>: 嗯 夏天结束了<qiayihun>: 但是秋天的温度真的舒服 秋天说不定恋爱才刚刚开始[doge]<Smores_>: 戏足<七月的流萤照冷>: 那冬天结束了的意思就是要死了?
mongodb的数据存储如下:
总结: 目前抓取指定id的7350多页的微博内容没有遇到任何问题。后面会基于爬取的内容构建高质量的聊天语料。