Python is a powerful and convenient tool for data analysis. We can scrape almost any data we want from the internet and analyze it: Weibo comments, Zhihu followers, Qidian novels, even QQ Music, Taobao, Baidu... whatever data you want, there is a way to fetch it quickly from the web. Today we will use a job-listing site to look at the current job market for deep learning. The site is Lagou (拉勾网): we first scrape the deep learning job postings from Lagou, then visualize them with Python. This mainly involves web scraping and data visualization.
The scraping part
First we grab the data from Lagou with Python, using the simple and handy requests module. The main thing to note is that Lagou loads its listings dynamically, so we use the browser's F12 developer tools to capture the network traffic. The capture shows that the listings actually come from a POST request, so form data has to be submitted. Two of its fields matter here: kd is the search keyword and pn is the page number, which is what makes pagination possible.
Code implementation:
# Import the required modules
import requests  # HTTP requests
import re        # regex for extracting fields from the response
import pandas as pd
import time
import random
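# Ajax endpoint for Shenzhen listings; city=%E6%B7%B1%E5%9C%B3 is the URL-encoded form of 深圳 (Shenzhen)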
url = 'https://www.lagou.com/jobs/positionAjax.json?city=%E6%B7%B1%E5%9C%B3&needAddtionalResult=false&isSchoolJob=0'
# Request headers copied from the browser, to get past Lagou's anti-scraping checks
header = {
'Accept':'application/json, text/javascript, */*; q=0.01',
'Accept-Encoding':'gzip, deflate, br',
'Accept-Language':'zh-CN,zh;q=0.9',
'Connection':'keep-alive',
# Note: requests computes Content-Length itself; the hard-coded value below is
# only correct while pn is a single digit, so it is safer to omit this header.
'Content-Length':'56',
'Content-Type':'application/x-www-form-urlencoded; charset=UTF-8',
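# The Cookie below is the author's own (now expired) session; replace it with
# one copied from your browser's developer tools.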
'Cookie':'WEBTJ-ID=20180320203834-162436b1831df-0c9164dad219f2-32677b04-1296000-162436b1832472; _ga=GA1.2.60880077.1521549515; _gid=GA1.2.417605192.1521549515; user_trace_token=20180320203835-9b6a26df-2c3b-11e8-b549-5254005c3644; PRE_HOST=www.baidu.com; LGUID=20180320203835-9b6a2ad4-2c3b-11e8-b549-5254005c3644; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1521549515,1521549537; LGSID=20180320203857-a85c2fb9-2c3b-11e8-902b-525400f775ce; PRE_UTM=m_cf_cpc_baidu_pc; PRE_SITE=https%3A%2F%2Fwww.baidu.com%2Fbaidu.php%3Fsc.a000000pGFTBfqUxhkXqJ0Zh8InHgRfUymMRWkYbaeeB5vo11AfXJ_pVu_qmI2exROnDQLds6xrXHwUMbfozjbHqncWybisdm-PZQ8oHXxZEWTb7mWrcHB0h7uAkSfVXRNfZdyVedfjPsdrnTDXoQABfHdlgx5gyEozceuAAP2MKKhIJsf.7D_NR2Ar5Od663rj6tJQrGvKD7ZZKNfYYmcgpIQC8xxKfYt_U_DY2yP5Qjo4mTT5QX1BsT8rZoG4XL6mEukmryZZjzkt52h881gE4U_-xHbHz3x5Gse5gj_L3x5I9vX8Zdtt5M33xg4mIqpknUoQQQQn-xQRze-kl-9h9menMHE_R0.U1Yk0ZDqs2v4VnL30ZKGm1Yk0Zfqs2v4VnL30A-V5HcsP0KM5y9-TZns0ZNG5yF9pywd0ZKGujYY0APGujY4nsKVIjYknjDLg1DsnH-xnW0vn-tknjc1g1nvnjD0pvbqn0KzIjYdPW00uy-b5fKBpHYznjf0UynqP1c1njcdrHn3g1Tzn1RYrHczn7t1PjTsPHTzrHmzg17xn0KkTA-b5H00TyPGujYs0ZFMIA7M5H00mycqn7ts0ANzu1Ys0ZKs5H00UMus5H08nj0snj0snj00Ugws5H00uAwETjYs0ZFJ5H00uANv5gKW0AuY5H00TA6qn0KET1Ys0AFL5HDs0A4Y5H00TLCq0ZwdT1YdPjmvn103Pjc1P161rj64PWcs0ZF-TgfqnHRznHRYrHRknHDkPsK1pyfqrjTYmWD3uWnsnj0Ymym4msKWTvYqnHbdn1uArjm4wjFArj6YnsK9m1Yk0ZK85H00TydY5H00Tyd15H00XMfqn0KVmdqhThqV5HKxn7tsg1Kxn0Kbmy4dmhNxTAk9Uh-bT1Ysg1Kxn7tsg100TA7Ygvu_myTqn0Kbmv-b5Hcvrjf1PHfdP6K-IA-b5iYk0A71TAPW5H00IgKGUhPW5H00Tydh5H00uhPdIjYs0AulpjYs0Au9IjYs0ZGsUZN15H00mywhUA7M5HD0UAuW5H00mLFW5HR4PjcY%26ck%3D4736.1.103.217.567.222.564.653%26shh%3Dwww.baidu.com%26sht%3D50000021_hao_pg%26us%3D1.0.2.0.0.0.0%26ie%3Dutf-8%26f%3D8%26srcqid%3D1849000187550204706%26tn%3D50000021_hao_pg%26wd%3D%25E6%258B%2589%25E5%258B%25BE%25E7%25BD%2591%26oq%3D%25E6%258B%2589%25E5%258B%25BE%25E7%25BD%2591%26rqlang%3Dcn%26sc%3DUWd1pgw-pA7EnHc1FMfqnHRznHfvnWc3PWcsnzuW5y99U1Dznzu9m1Y1nWDYP1mdnj6%26ssl_sample%3Ds_4%26bc%3D110101; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2Flp%2Fhtml%2Fcommon.html%3Futm_source%3Dm_cf_cpc_baidu_pc%26m_kw%3Dbaidu_cpc_sz_e110f9_d2162e_%25E6%258B%2589%25E5%258B%25BE%25E7%25BD%2591; X_HTTP_TOKEN=6a13991d26c497c534f7525071e2f2f7; ab_test_random_num=0; _putrc=05603F16025450A9123F89F2B170EADC; login=true; unick=%E6%8B%89%E5%8B%BE%E7%94%A8%E6%88%B79375; JSESSIONID=ABAAABAAAIAACBI9E8DA2EBAFAF29B4573467BDB7BC16A6; hasDeliver=0; gate_login_token=7e648bb18be869b44f4094c6b2aa58f7085253d309f60329a51e38a80e1225c7; index_location_city=%E6%B7%B1%E5%9C%B3; hideSliderBanner20180305WithTopBannerC=1; TG-TRACK-CODE=index_search; SEARCH_ID=a64678b0a3324aebb0a003b941b6609c; LGRID=20180320204245-3010db62-2c3c-11e8-b549-5254005c3644; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1521549765',
'Host':'www.lagou.com',
'Origin':'https://www.lagou.com',
'Referer':'https://www.lagou.com/jobs/list_%E6%B7%B1%E5%BA%A6%E5%AD%A6%E4%B9%A0?labelWords=&fromSearch=true&suginput=',
'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36',
'X-Anit-Forge-Code':'0',
'X-Anit-Forge-Token':'None',
'X-Requested-With':'XMLHttpRequest'
}
for n in range(30):
    # Note: do not crawl too fast unless you have other anti-scraping
    # countermeasures in place, such as rotating IPs. No login is needed;
    # the time module is used here to throttle the request rate.
    time.sleep(random.randint(2,5))
    # Form data (sent as the POST body)
    form = {
        'first':'false',
        'pn':str(n),
        'kd':'深度学习'
    }
    # Request the data
    html = requests.post(url,data=form,headers = header)
    data = re.findall('{"companyId":.*?,"positionName":"(.*?)","workYear":"(.*?)","education":"(.*?)","jobNature":"(.*?)","financeStage":"(.*?)","companyLogo":".*?","industryField":".*?","city":"(.*?)","salary":"(.*?)","positionId":.*?,"positionAdvantage":"(.*?)","companyShortName":"(.*?)","district"',html.text)
    data2 = pd.DataFrame(data)
    # Append each page of results to the CSV file
    data2.to_csv(r'/Users/gavin/PycharmProjects/jupyter_test/lagou_data.csv',header=False,index=False,mode='a+')
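Since the endpoint returns JSON, a more robust alternative to the regex above is to parse the response directly. A minimal sketch, assuming the response layout implied by the keys the regex matches (content -> positionResult -> result); the key names may change if Lagou alters its API:

# Parse the JSON response instead of regex-matching the raw text
result = html.json()
positions = result['content']['positionResult']['result']
rows = [(p['positionName'], p['workYear'], p['education'], p['jobNature'],
         p['financeStage'], p['city'], p['salary'], p['positionAdvantage'],
         p['companyShortName']) for p in positions]
data2 = pd.DataFrame(rows)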
Data visualization
The downloaded data is a plain CSV, one row per posting with the nine captured fields.
Import modules and configure the plotting style
import pandas as pd # dataframe operations
import numpy as np
import matplotlib.pyplot as plt # plotting
import jieba # Chinese word segmentation
from wordcloud import WordCloud # word cloud visualization
import matplotlib as mpl # font configuration
from pyecharts import Geo # geographic map
mpl.rcParams["font.sans-serif"] = ["Microsoft YaHei"]
# Plot style configuration
plt.rcParams["axes.labelsize"] = 16.
plt.rcParams["xtick.labelsize"] = 14.
plt.rcParams["ytick.labelsize"] = 14.
plt.rcParams["legend.fontsize"] = 12.
plt.rcParams["figure.figsize"] = [15., 15.]
Data preview
# Load the data
data = pd.read_csv(r'/Users/gavin/PycharmProjects/jupyter_test/lagou_data.csv',encoding='utf-8')
data.head()
Note: the column names shown above are ones I added myself (the CSV was written without a header row).
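If you prefer to assign the names in code rather than editing the CSV, pd.read_csv accepts a names argument. A minimal sketch; the Chinese column names here are hypothetical (mapped from the nine regex fields) and must match whatever the rest of the analysis uses, e.g. 岗位名称, 工作经验, 学历要求:

# Hypothetical column names matching the nine fields captured by the regex
cols = ['岗位名称','工作经验','学历要求','职位性质','融资阶段','城市','薪资','职位优势','公司简称']
data = pd.read_csv(r'/Users/gavin/PycharmProjects/jupyter_test/lagou_data.csv',
                   encoding='utf-8', names=cols, header=None)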
Education requirements
data['学历要求'].value_counts().plot(kind='barh',rot=0)
plt.show()
Work experience
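The original post shows this section as a chart only. Assuming the work-experience column is named 工作经验 like the others, the same value_counts pattern as above applies:

# Sketch: same bar-chart pattern as the education plot (column name assumed)
data['工作经验'].value_counts().plot(kind='barh',rot=0)
plt.show()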
Popular Python positions
For this part I collected a fresh dataset, this time searching for Python-related positions.
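The post does not show the collection code for this dataset; presumably only the kd keyword in the form data changes, roughly like this:

# Same scraping loop as before, with the search keyword swapped (assumption)
form = {
    'first':'false',
    'pn':str(n),
    'kd':'python'
}

The job titles from that dataset are then segmented and rendered as a word cloud: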
final = ''
stopwords = ['python','Python','工程师','(',')','/'] # stop words to filter out
for n in range(data.shape[0]):
    seg_list = list(jieba.cut(data['岗位名称'][n]))
    for seg in seg_list:
        if seg not in stopwords:
            final = final + seg + ' '  # join words with spaces so WordCloud can split them
my_wordcloud = WordCloud(font_path=r'/Library/Fonts/SimHei.ttf',width=1000,height=600).generate(final)
plt.imshow(my_wordcloud)
plt.axis('off')
plt.show()
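To keep the result, the WordCloud object can also be written straight to disk (the output filename is illustrative):

# Save the rendered word cloud as an image file
my_wordcloud.to_file('python_jobs_wordcloud.png')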
Source: CSDN
Author: 山水之间2018
Link: https://blog.csdn.net/Gavinmiaoc/article/details/79646036