Project Overview
The project consists of three parts:
- Crawling news from Toutiao (今日头条)
- Running entity analysis on the crawled article bodies and visualizing the results
- Using the Storm framework to write the crawled news into MySQL
This post mainly covers the Toutiao crawling part; the overall layout of the project is given below.
Since Kafka-related pieces come up later, you can refer to this post for that background: Setting up the stream-processing platform
For the entity-analysis part, see: Entity analysis
For the Storm stream-processing part, see: Storm stream processing
Project download: Toutiao crawling + entity analysis + Storm stream processing
Code Walkthrough
main.py
This is the program's entry point. An infinite loop is used so the crawler keeps running round after round, sleeping five minutes between rounds.
# -*- coding: utf-8 -*-
from get_page_index import get_page_index
import time

def main():
    get_page_index()

if __name__ == "__main__":
    a = 1
    while True:
        print('<---------------------- Round ' + str(a) + ' started ---------------------->\n')
        main()
        print('<---------------------- Round ' + str(a) + ' finished ---------------------->\n')
        print('<---------------------- Sleeping ---------------------->\n')
        time.sleep(300)  # wait five minutes before the next round
        a += 1
get_page_index.py
This function is the glue between the other modules: it fetches the list of articles to crawl, grabs a batch of proxy IPs, and then crawls each article in turn.
import time
import random
from paser_page_index import paser_page_index
from get_ip_index import get_ip_index
from get_page_detail import get_page_detail

def get_page_index():
    print('----------------- Fetching the article list -----------------')
    id_list = paser_page_index()
    print('----------------- Article list fetched -----------------\n')
    print('----------------- Fetching proxy IPs -----------------')
    ip_list = get_ip_index()
    print('----------------- Crawling pages -----------------')
    for i in id_list:
        url = 'https://www.toutiao.com/a' + i
        a = get_page_detail(url, ip_list)
        time.sleep(random.randint(3, 5))  # random pause between articles
        # if a == 0:
        #     print('This is an ad, or the page could not be parsed')
        # if a == 1:
        #     print('This article is a Q&A post')
        # if a == 2:
        #     print('This is an image-gallery article')
    print('----------------- Crawling finished -----------------\n')
paser_page_index.py
This function collects the list of article IDs we want to crawl by querying Toutiao's PC feed API for each news category, so that the pages in the list can then be crawled one by one.
import time
import requests
from bs4 import BeautifulSoup
import hashlib

# Compute the result of Toutiao's as/cp signing algorithm
def get_as_cp_args():
    zz = {}
    now = round(time.time())
    e = hex(int(now)).upper()[2:]  # hex() converts an integer to its hexadecimal string representation
    i = hashlib.md5(str(int(now)).encode("utf8")).hexdigest().upper()  # MD5 of the timestamp as an uppercase hex digest
    if len(e) != 8:
        zz = {'as': "479BB4B7254C150",
              'cp': "7E0AC8874BB0985"}
        return zz
    n = i[:5]
    a = i[-5:]
    r = ""
    s = ""
    for i in range(5):
        s = s + n[i] + e[i]
    for j in range(5):
        r = r + e[j + 3] + a[j]
    zz = {
        'as': "A1" + s + e[-3:],
        'cp': e[0:3] + r + "E1"
    }
    return zz

def paser_page_index():
    url1 = [
        'https://www.toutiao.com/api/pc/feed/?category=news_hot',
        'https://www.toutiao.com/api/pc/feed/?category=news_tech',
        'https://www.toutiao.com/api/pc/feed/?category=news_entertainment',
        'https://www.toutiao.com/api/pc/feed/?category=news_game',
        'https://www.toutiao.com/api/pc/feed/?category=news_sports',
        'https://www.toutiao.com/api/pc/feed/?category=news_car',
        'https://www.toutiao.com/api/pc/feed/?category=news_finance',
        'https://www.toutiao.com/api/pc/feed/?category=funny',
        'https://www.toutiao.com/api/pc/feed/?category=news_military',
        'https://www.toutiao.com/api/pc/feed/?category=news_world',
        'https://www.toutiao.com/api/pc/feed/?category=news_fashion',
        'https://www.toutiao.com/api/pc/feed/?category=news_travel',
        'https://www.toutiao.com/api/pc/feed/?category=news_discovery',
        'https://www.toutiao.com/api/pc/feed/?category=news_baby',
        'https://www.toutiao.com/api/pc/feed/?category=news_regimen',
        'https://www.toutiao.com/api/pc/feed/?category=news_essay',
        'https://www.toutiao.com/api/pc/feed/?category=news_history',
        'https://www.toutiao.com/api/pc/feed/?category=news_food'
    ]
    id_list = []
    for i in url1:
        user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
        cookie = 'tt_webid=6762050087801406989; tt_webid=6762050087801406989; csrftoken=be4be279678742cea85ca2bfc0b308c8; WEATHER_CITY=%E5%8C%97%E4%BA%AC; s_v_web_id=c05bab65d1f25e1c6b72817b6f34f92a; __tasessionId=lgnzbs4ah1578017571495'
        headers = {'user-agent': user_agent, 'cookie': cookie, 'referer': i}
        as_cp = get_as_cp_args()
        url2 = '&utm_source=toutiao&widen=1&max_behot_time=0&max_behot_time_tmp=0&tadrequire=true&as=' + as_cp['as'] + '&cp=' + as_cp['cp']
        respond = requests.get(i + url2, headers=headers)
        # soup = BeautifulSoup(respond.text, 'html.parser')  # debugging: inspect the raw response
        # print(soup)
        try:
            if respond.status_code == 200:
                dict1 = respond.json()
                for item in dict1['data']:
                    id_list.append(item['group_id'])
        except:
            pass
    return id_list
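For reference, the parsing above only relies on the feed response being JSON with a data list whose items carry a group_id field. A heavily trimmed stand-in (the IDs below are invented) looks roughly like this:
# Invented, heavily trimmed stand-in for the feed API response; only the fields
# that paser_page_index() actually reads ('data' and each item's 'group_id') are shown.
sample_response = {
    "data": [
        {"group_id": "6781234567890123456"},
        {"group_id": "6787654321098765432"},
    ],
}
group_ids = [item["group_id"] for item in sample_response["data"]]
print(group_ids)  # each id later becomes 'https://www.toutiao.com/a' + group_id in get_page_index()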
get_ip_index.py
This module crawls free proxy IPs and returns a list of usable proxies. Because we keep crawling news for long stretches of time, a single IP is easily banned, so proxies are needed.
import requests
from bs4 import BeautifulSoup
import random

def get_ip_index():
    randomlist = ['/nn/', '/wn/', '/wt/']
    url = 'https://www.xicidaili.com' + random.choice(randomlist) + str(random.randint(1, 3))
    print('Proxy list source page:', url)
    ip_list = []
    proxies = {}
    start = random.randint(1, 40)
    end = random.randint(50, 90)
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'}
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, 'html.parser')
    tag = soup.find_all('tr')
    for j in tag[start:end]:
        tag1 = j.find_all('td')
        ip_list.append(tag1[1].text + ':' + tag1[2].text)  # IP:port
    # This part checks whether each proxy actually works; it is optional
    # for i in ip_list:
    #     try:
    #         ip = "https://" + i
    #         # print(ip)
    #         proxies['https'] = ip
    #         r = requests.get('https://www.baidu.com', headers=headers, proxies=proxies, timeout=(3, 7))
    #     except:
    #         ip_list.remove(i)
    print('----------------- Got ' + str(len(ip_list)) + ' proxies -----------------\n')
    return ip_list
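If you do want the optional liveness check, a minimal sketch is shown below (the helper name filter_alive_proxies is made up here; the probe against https://www.baidu.com mirrors the commented-out block above). Unlike that block, it builds a new list instead of removing items from the list it is iterating over:
import requests

def filter_alive_proxies(ip_list, timeout=(3, 7)):
    # Hypothetical helper: keep only the proxies that answer a simple HTTPS probe.
    headers = {'User-Agent': 'Mozilla/5.0'}
    alive = []
    for ip in ip_list:
        proxies = {'https': 'https://' + ip}
        try:
            requests.get('https://www.baidu.com', headers=headers, proxies=proxies, timeout=timeout)
            alive.append(ip)
        except requests.RequestException:
            pass  # drop proxies that time out or error out
    return alive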
get_page_detail.py
This is the main function that crawls an individual Toutiao article page.
import requests
from bs4 import BeautifulSoup
import re
from my_kafka import kafka_produce
from get_ip_index import get_ip_index
from get_article import get_article
from text_grapher import Entity_extraction

def get_page_detail(url, ip_list):
    proxies = {}
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
    headers = {'user-agent': user_agent, 'x-requested-with': 'XMLHttpRequest'}
    print('Currently fetching:', url)
    while True:
        if proxies:
            try:
                r = requests.get(url, headers=headers, allow_redirects=False, proxies=proxies, timeout=(3, 7))
                if r.status_code == 200:
                    break
                # non-200 response through this proxy: rotate to the next one as well
                proxies['https'] = 'https://' + ip_list[0]
                ip_list.pop(0)
            except:
                proxies['https'] = 'https://' + ip_list[0]
                ip_list.pop(0)
            if ip_list == []:
                ip_list = get_ip_index()
        else:
            r = requests.get(url, headers=headers, allow_redirects=False, timeout=(3, 7))
            if r.status_code == 200:
                break
            else:
                # the direct request failed, switch to a proxy
                proxies['https'] = 'https://' + ip_list[0]
                ip_list.pop(0)
    r.encoding = 'utf-8'
    article = {}
    article['url'] = url
    soup = BeautifulSoup(r.text, 'html.parser')
    # print(soup.prettify())
    Str = soup.text
    try:
        type = re.findall(re.compile(r'chineseTag: \'(.*?)\'', re.S), Str)
    except:
        return 0
    if type == [] or type == ['']:
        return 0
    if type == ['问答']:   # Q&A post
        return 1
    if type == ['图片']:   # image-gallery post
        return 2
    # category
    article['type'] = re.findall(re.compile(r'chineseTag: \'(.*?)\'', re.S), Str)[0]
    # title (strip characters that are illegal in file names)
    title_result = re.findall(re.compile(r'title: \'(.*?)\'', re.S), Str)[0]
    title = re.sub(r'[\\/:*?"<>|]', '', title_result)
    article['title'] = title
    # publish time, comment count, source, cover image, keywords
    article['time'] = re.findall(re.compile(r'time: \'(.*?)\'', re.S), Str)[0]
    article['comments_count'] = re.findall(re.compile(r'comments_count: (.*?),', re.S), Str)[0]
    article['source'] = re.findall(re.compile(r'source: \'(.*?)\'', re.S), Str)[0]
    article['coverImg'] = re.findall(re.compile(r'coverImg: \'(.*?)\'', re.S), Str)[0]
    article['keywords'] = re.findall(re.compile(r'{"name":\"(.*?)\"}', re.S), Str)
    keywords = ''
    for i in re.findall(re.compile(r'{"name":\"(.*?)\"}', re.S), Str):
        keywords = keywords + i + '\t'
    # body text
    text = get_article(r)
    article['news'] = text
    # send the article to Kafka (double quotes are stripped so the stringified dict parses cleanly downstream)
    kafka_produce(str([article]).replace('"', ''), url)
    # entity analysis
    Entity_extraction(text, title.replace('"', ''))
    # print(article)
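To make the field regexes above concrete, here is a small standalone check against a made-up fragment of the inline script (the values are invented; real pages embed the same keys inside a JavaScript block):
import re

# Invented fragment imitating the inline script that get_page_detail() scans via soup.text.
sample = """
    chineseTag: '科技',
    title: '某公司发布新产品',
    time: '2020-01-03 10:00:00',
    comments_count: 128,
"""

print(re.findall(r'chineseTag: \'(.*?)\'', sample))   # ['科技']
print(re.findall(r'title: \'(.*?)\'', sample))        # ['某公司发布新产品']
print(re.findall(r'comments_count: (.*?),', sample))  # ['128']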
get_article.py
This function extracts the body text of a Toutiao article. The body embedded in the page is littered with escaped HTML tags that have to be stripped, and we also want to keep the text and the inline image URLs in their original order, which makes the implementation a bit fiddly.
from bs4 import BeautifulSoup
import re

def get_article(response):
    string = ''
    soup = BeautifulSoup(response.text, features="lxml")
    # .encode('utf-8').decode("unicode-escape")
    body = soup.find('body')
    script4 = body.find_all('script')
    # the article body sits inside the fourth <script> block, between 'articleInfo:' and 'tagInfo:'
    rawMeterial = re.findall("articleInfo:([\s\S]*)tagInfo:", str(script4[3])[23:][:-10])[0]
    pipeiArticle = "content: '([\s\S]*)groupId:"
    Article = re.findall(pipeiArticle, rawMeterial)
    # print(Article)
    a = Article[0].strip()
    b = a.split(r'\u003Cp\u003E')  # split on the escaped '<p>' so text and images keep their original order
    for each in b:
        # turn the escaped angle brackets and slashes back into real characters
        each2 = each.replace(r'\u003C', '<').replace(r'p\u003E', 'p>').replace(r'\u002F', '/').replace(r'\u003E', '>')
        if '</p>' in each2:
            # the text before the closing </p> is the paragraph body; strip any remaining tags
            each3 = each2[:each2.index('</p>')].strip()
            each4 = re.sub(re.compile("<(.*?)>"), "", each3)
            string = string + each4 + '\n'
            # print(each4)
        # pull the image URL out of any <img> tag in this chunk, one URL per line
        pipeiSource = "<img src([\s\S]*)\" img_width"
        pipeiSource2 = "http:([\s\S]*)"
        source2 = re.findall(pipeiSource, each2)
        # print(each2)
        # print(source2)
        if source2 != []:
            source3 = source2[0].split('\" img_width')
            # print(source3)
            for piece in source3:
                source4 = re.findall(pipeiSource2, piece)
                # print('http:' + source4[0])
                string = string + str('http:' + source4[0]).strip() + '\n'
            # print(source2[0][13:][:-1].strip())
        # pipeiSource = "<img src([\s\S]*)\" img_width"
        # source2 = re.findall(pipeiSource, each2)
        # if source2 != []:
        #     string = string + source2[0][13:][:-1].strip() + '\n'
        #     # print(source2[0][13:][:-1].strip())
    return string.replace('"', '')
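To see what the decode-and-strip loop produces, here is a tiny self-contained demo on an invented, heavily shortened content string (real pages use the same \u003C / \u002F / \u003E escapes but much longer bodies):
import re

# Invented stand-in for the escaped `content` string pulled out of articleInfo above.
content = r'\u003Cp\u003EFirst paragraph of the article.\u003C\u002Fp\u003E\u003Cp\u003E\u003Cimg src="http:\u002F\u002Fp1.pstatp.com\u002Flarge\u002Fabc.jpg" img_width="640"\u003E\u003C\u002Fp\u003E'

out = ''
for chunk in content.split(r'\u003Cp\u003E'):
    decoded = (chunk.replace(r'\u003C', '<').replace(r'p\u003E', 'p>')
                    .replace(r'\u002F', '/').replace(r'\u003E', '>'))
    if '</p>' in decoded:
        text = re.sub(r'<(.*?)>', '', decoded[:decoded.index('</p>')]).strip()
        if text:
            out += text + '\n'
    img = re.findall(r'<img src([\s\S]*)" img_width', decoded)
    if img:
        out += 'http:' + re.findall(r'http:([\s\S]*)', img[0])[0].strip() + '\n'
print(out)
# First paragraph of the article.
# http://p1.pstatp.com/large/abc.jpg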
my_kafka.py
This module connects to Kafka and is used to send the crawled news on to Storm.
# -*- coding: utf-8 -*-
from kafka import KafkaProducer
from kafka.errors import KafkaError

KAFAKA_HOST = "192.168.161.100"
KAFAKA_PORT = 9092  # broker port
KAFAKA_TOPIC = "today_news"  # topic name

class Kafka_producer():
    def __init__(self, kafkahost, kafkaport, kafkatopic):
        self.kafkaHost = kafkahost
        self.kafkaPort = kafkaport
        self.kafkatopic = kafkatopic
        self.producer = KafkaProducer(bootstrap_servers='{kafka_host}:{kafka_port}'.format(
            kafka_host=self.kafkaHost,
            kafka_port=self.kafkaPort)
        )

    def sendjsondata(self, params):
        try:
            parmas_message = params  # the payload is already a string here, so no json.dumps is needed
            producer = self.producer
            producer.send(self.kafkatopic, value=parmas_message.encode('utf-8'))
            producer.flush()
        except KafkaError as e:
            print(e)

def kafka_produce(params, url):
    # producer side: one message per crawled article
    producer = Kafka_producer(KAFAKA_HOST, KAFAKA_PORT, KAFAKA_TOPIC)
    print("======> producer:", url, '\n')
    producer.sendjsondata(params)
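To verify that the articles actually arrive on the topic before wiring up the Storm side, a minimal consumer sketch along these lines can be used (it assumes the same broker address and topic as above and uses kafka-python's KafkaConsumer):
# -*- coding: utf-8 -*-
from kafka import KafkaConsumer

KAFAKA_HOST = "192.168.161.100"   # same broker and topic as my_kafka.py above
KAFAKA_PORT = 9092
KAFAKA_TOPIC = "today_news"

consumer = KafkaConsumer(
    KAFAKA_TOPIC,
    bootstrap_servers='{}:{}'.format(KAFAKA_HOST, KAFAKA_PORT),
    auto_offset_reset='earliest',  # start from the beginning of the topic on first run
)
for message in consumer:
    # each value is the stringified article list sent by kafka_produce()
    print(message.topic, message.offset, message.value.decode('utf-8')[:200])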
The code of the entity-analysis module is not covered in detail here.
Summary
Crawling Toutiao is not particularly difficult overall. The main wrinkle is that Toutiao recently upgraded its API again: requests to it must now carry the two parameters as and cp, and the code that reproduces that signing scheme was adapted from someone else's work. The whole thing works quite well in practice. Next I will cover the other two parts of the project; feel free to keep following if you are interested.
Source: CSDN
Author: 龙之焱影
Link: https://blog.csdn.net/s863222424/article/details/103817795