Scraping 51job (前程无忧) data for big-data positions
# -*- coding: utf-8 -*-
"""
Created on Wed Nov 1 14:47:27 2019

@author: loo
"""

import scrapy
import csv
from scrapy.crawler import CrawlerProcess


class MySpider(scrapy.Spider):
    name = "spider"

    def __init__(self):
        # Open the CSV file that will hold the scraped rows
        self.f = open('crawl_51jobs.csv', 'wt', newline='', encoding='GBK', errors='ignore')
        self.writer = csv.writer(self.f)
        # Header: title, locality, salary, companyName, releaseTime
        self.writer.writerow(('职位', '公司地区', '薪资', '公司名称', '发布时间'))

        # List of result pages to crawl
        self.urls = []
        # Search keyword
        key = '大数据'
        print("关键字:", key)
        # Build the URL of each search-result page
        for i in range(1, 200):
            f_url = ('https://search.51job.com/list/000000,000000,0000,00,9,99,'
                     + key + ',2,' + str(i) + '.html?lang=c&stype=1&postchannel=0000'
                     '&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99'
                     '&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType='
                     '&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=')
            self.urls.append(f_url)

    def start_requests(self):
        for url in self.urls:
            # parse() is called as each response arrives
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        # Extract the list of jobs on this page
        jobs = response.xpath('//*[@id="resultList"]/div[@class="el"]')

        for job in jobs:
            # Job title
            title = job.xpath('p/span/a/text()').extract_first().strip()
            # Location
            locality = job.xpath('span[2]/text()').extract_first()
            # Salary
            salary = job.xpath('span[3]/text()').extract_first()
            # Company name
            companyName = job.xpath('span[1]/a/text()').extract_first().strip()
            # Posting date
            releaseTime = job.xpath('span[4]/text()').extract_first()

            print(title, locality, salary, companyName, releaseTime)
            # Save the row
            self.writer.writerow((title, locality, salary, companyName, releaseTime))


def main():
    process = CrawlerProcess({
        'USER_AGENT': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
    })

    process.crawl(MySpider)
    process.start()  # start the whole crawl; Scrapy prints copious log output, which can be ignored


if __name__ == '__main__':
    main()
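Before launching the full 199-page crawl, it is worth spot-checking the XPath selectors against a single result page. A minimal sketch, assuming the requests package is installed; the trimmed URL below is illustrative, and the page layout may have changed since this post was written:

# Spot-check the job-list XPath on one search page before the full crawl.
import requests
from scrapy.selector import Selector

# Illustrative: page 1 of the "大数据" search, with the long query string trimmed.
url = 'https://search.51job.com/list/000000,000000,0000,00,9,99,大数据,2,1.html'
headers = {'User-Agent': 'Mozilla/5.0'}

resp = requests.get(url, headers=headers)
resp.encoding = 'gbk'  # 51job pages are GBK-encoded
sel = Selector(text=resp.text)

jobs = sel.xpath('//*[@id="resultList"]/div[@class="el"]')
print(len(jobs), 'job rows found')  # 0 means the selector or the page layout is off
if jobs:
    print(jobs[0].xpath('p/span/a/text()').extract_first())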
The scraped data is saved to crawl_51jobs.csv (the original post shows a screenshot of the file here).
Looking at the file reveals several characteristics of the data (a quick programmatic check is sketched after this list):
- Salary units are inconsistent
- Location formats are inconsistent (some are just a city, others are city-district)
- Some job postings have blank fields
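These issues can also be confirmed programmatically; a minimal sketch, assuming pandas is installed:

# Peek at the raw CSV to confirm the issues listed above.
import pandas as pd

df = pd.read_csv('crawl_51jobs.csv', encoding='gbk')
print(df.head(10))                        # sample rows
print(df['薪资'].str[-3:].value_counts())  # rough view of the mix of salary units
print(df.isnull().sum())                  # blank fields per column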
Cleaning the data
Clean the data according to the characteristics observed in the CSV file:
- Reduce the company location from "city-district" to just the city: keep the city and drop the district, e.g. "上海-浦东" becomes "上海".
- Normalize salaries (the raw data mixes units such as 千/月 and 万/月): convert everything to thousand CNY per month, and split each salary range into a minimum and a maximum, e.g. "4-6千/月" becomes minimum 4 and maximum 6.
- Drop rows containing blank values (a posting's location, salary, etc. may be empty, which would get in the way of the later analysis) and rows whose location is "异地招聘" (hiring in another region).
# -*- coding: utf-8 -*-
"""
Created on Wed Nov 1 14:47:27 2019

@author: loo
"""

import re
import csv
import numpy as np


def salaryCleaning(salary):
    """
    Normalize salary units to thousand CNY per month and
    split each salary range into a minimum and a maximum.
    """
    minSa, maxSa = [], []
    num_pat = re.compile(r'(\d*\.?\d+)')
    for sa in salary:
        if sa:
            if '-' in sa:  # ranges such as "1-2万/月" or "10-20万/年"
                nums = num_pat.findall(sa)
                minSalary, maxSalary = nums[0], nums[1]
                if u'万' in sa and u'年' in sa:  # convert to thousand/month
                    minSalary = float(minSalary) / 12 * 10
                    maxSalary = float(maxSalary) / 12 * 10
                elif u'万' in sa and u'月' in sa:
                    minSalary = float(minSalary) * 10
                    maxSalary = float(maxSalary) * 10
            else:  # values such as "20万以上/年" or "100元/天": only a minimum, no maximum
                minSalary = num_pat.findall(sa)[0]
                maxSalary = ""
                if u'万' in sa and u'年' in sa:  # convert to thousand/month
                    minSalary = float(minSalary) / 12 * 10
                elif u'万' in sa and u'月' in sa:
                    minSalary = float(minSalary) * 10
                elif u'元' in sa and u'天' in sa:
                    minSalary = float(minSalary) / 1000 * 21  # 21 working days per month
        else:
            minSalary, maxSalary = "", ""

        minSa.append(minSalary)
        maxSa.append(maxSalary)
    return minSa, maxSa


def locFormat(locality):
    """
    Reduce "city-district" to just "city".
    """
    newLocality = []
    for loc in locality:
        if '-' in loc:  # has a district part
            newLoc = re.findall(re.compile(r'(\w*)-'), loc)[0]
        else:  # city only
            newLoc = loc
        newLocality.append(newLoc)
    return newLocality


def readFile():
    """
    Read the raw CSV file.
    """
    data = []
    with open("crawl_51jobs.csv", encoding='gbk') as f:
        csv_reader = csv.reader(f)      # read the file with csv.reader
        data_header = next(csv_reader)  # skip the header row
        for row in csv_reader:          # collect the data rows
            data.append(row)

    nd_data = np.array(data)  # convert the list to an array for easy column slicing
    jobName = nd_data[:, 0]
    locality = nd_data[:, 1]
    salary = nd_data[:, 2]
    companyName = nd_data[:, 3]
    releaseTime = nd_data[:, 4]
    return jobName, locality, salary, companyName, releaseTime


def saveNewFile(jobName, newLocality, minSa, maxSa, companyName, releaseTime):
    """
    Write the cleaned data to a new file.
    """
    new_f = open('cleaned_51jobs.csv', 'wt', newline='', encoding='GBK', errors='ignore')
    writer = csv.writer(new_f)
    writer.writerow(('职位', '公司城市', '最低薪资(千/月)', '最高薪资(千/月)', '公司名称', '发布时间'))

    for num in range(len(jobName)):
        # skip rows with blank fields or with "异地招聘" as the location
        if newLocality[num] and minSa[num] and maxSa[num] and companyName[num] and newLocality[num] != "异地招聘":
            writer.writerow((jobName[num], newLocality[num], minSa[num], maxSa[num], companyName[num], releaseTime[num]))
    new_f.close()


def main():
    """
    Main entry point.
    """
    # read the raw data
    jobName, locality, salary, companyName, releaseTime = readFile()

    # clean the location and salary columns
    newLocality = locFormat(locality)
    minSa, maxSa = salaryCleaning(salary)

    # write the cleaned data to a CSV file
    saveNewFile(jobName, newLocality, minSa, maxSa, companyName, releaseTime)


if __name__ == '__main__':
    main()
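A quick sanity check of the two helpers on the sample values quoted in the rules above (run in the same session as the script, so that salaryCleaning and locFormat are in scope; the expected outputs follow from the conversion rules):

# Sanity-check the cleaning helpers on the sample values from the rules above.
print(salaryCleaning(['4-6千/月', '1-2万/月', '20万以上/年', '100元/天']))
# -> (['4', 10.0, 16.666..., 2.1], ['6', 20.0, '', ''])

print(locFormat(['上海-浦东', '北京']))
# -> ['上海', '北京']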
Visualizing and analyzing the data
- Top 20 cities by number of positions, and top 20 cities by average salary
- Job titles of big-data positions
- Distribution of big-data positions across cities
# -*- coding: utf-8 -*-
"""
Created on Wed Nov 1 20:15:56 2019

@author: loo
"""

import matplotlib.pyplot as plt
import csv
import numpy as np
import re
from wordcloud import WordCloud, STOPWORDS


def readFile():
    """
    Read the cleaned CSV file.
    """
    data = []
    with open("cleaned_51jobs.csv", encoding='gbk') as f:
        csv_reader = csv.reader(f)      # read the file with csv.reader
        data_header = next(csv_reader)  # skip the header row
        for row in csv_reader:          # collect the data rows
            data.append(row)

    nd_data = np.array(data)  # convert the list to an array for easy column slicing
    jobName = nd_data[:, 0]
    locality = nd_data[:, 1]
    minSalary = nd_data[:, 2]
    maxSalary = nd_data[:, 3]
    return data, jobName, locality, minSalary, maxSalary


def salary_locality(data):
    """
    Compute the number of positions and the average salary per city, then plot both.
    """
    city_num = dict()

    for job in data:
        loc, minSa, maxSa = job[1], float(job[2]), float(job[3])
        if loc not in city_num:
            avg_salary = (minSa + maxSa) / 2  # midpoint of the salary range
            city_num[loc] = (1, avg_salary)
        else:
            num = city_num[loc][0]
            # running mean over all positions seen so far in this city
            avg_salary = ((minSa + maxSa) / 2 + num * city_num[loc][1]) / (num + 1)
            city_num[loc] = (num + 1, avg_salary)

    # sort by number of positions, descending
    title_sorted = dict(sorted(city_num.items(), key=lambda x: x[1][0], reverse=True))

    # sort by average salary, descending
    salary_sorted = dict(sorted(city_num.items(), key=lambda x: x[1][1], reverse=True))

    allCity1, allCity2, allNum, allAvg = [], [], [], []
    # top 20 cities by number of positions
    for city in list(title_sorted)[:20]:
        allCity1.append(city)
        allNum.append(title_sorted[city][0])

    # top 20 cities by average salary
    for city in list(salary_sorted)[:20]:
        allCity2.append(city)
        allAvg.append(salary_sorted[city][1])

    # make Chinese labels render correctly
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False

    # bar positions on the x axis
    x = np.arange(20)

    # figure size
    plt.figure(figsize=(13, 11))

    # the data to plot; each list is as long as x
    y1 = allNum
    y2 = allAvg

    bar_width = 0.8  # bar width
    tick_label1 = allCity1
    tick_label2 = allCity2

    # draw the bar charts
    plt.subplot(211)
    plt.title('51job——大数据职位数前20名城市')
    plt.xlabel(u"城市")
    plt.ylabel(u"职位数")
    plt.xticks(x, tick_label1)  # x-axis tick labels
    plt.bar(x, y1, bar_width, color='salmon')

    plt.subplot(212)
    plt.title('51job——大数据职位平均薪资的前20名城市')
    plt.xlabel(u"城市")
    plt.ylabel(u"平均薪资(千元/月)")
    plt.xticks(x, tick_label2)  # x-axis tick labels
    plt.bar(x, y2, bar_width, color='orchid')

    # plt.savefig('city.jpg', dpi=500)  # save at a fixed resolution
    plt.show()


def jobTitle(jobName):
    """
    Word cloud of the job titles.
    """
    word = "".join(jobName)

    # image mask template (optional) and a font that supports Chinese
    # image = np.array(Image.open('model.jpg'))
    font = 'simkai.ttf'

    # strip letters, digits and punctuation, keeping the Chinese text
    resultword = re.sub(r"[A-Za-z0-9\[\`\~\!\@\#\$\^\&\*\(\)\=\|\{\}\'\:\;\'\,\[\]\.\<\>\/\?\~\。\@\#\\\&\*\%\-]", " ", word)
    wl_space_split = resultword

    # stop words: generic recruiting terms and city names
    sw = set(STOPWORDS)
    sw.update(["高提成", "底薪", "五险", "双休", "五险一金", "社保", "上海", "广州",
               "无责底薪", "月薪", "急聘", "急招", "资深", "包吃住", "周末双休", "代招",
               "高薪", "高底薪", "校招", "月均", "可实习", "年薪", "北京", "经理",
               "包住", "应届生", "南京", "专员", "提成", "方向"])

    # the key step: generate the word cloud
    my_wordcloud = WordCloud(font_path=font, stopwords=sw, scale=4, background_color='white',
                             max_words=100, max_font_size=60, random_state=20).generate(wl_space_split)
    # display it
    plt.imshow(my_wordcloud)
    plt.axis("off")
    plt.show()

    # save the image
    # my_wordcloud.to_file('title.jpg')


def localityWordCloud(locality):
    """
    Word cloud of the cities.
    """
    font = 'simkai.ttf'
    locality = " ".join(locality)

    # the key step: generate the word cloud
    my_wordcloud = WordCloud(font_path=font, scale=4, background_color='white',
                             max_words=100, max_font_size=60, random_state=20).generate(locality)

    # display it
    plt.imshow(my_wordcloud)
    plt.axis("off")
    plt.show()

    # save the image
    # my_wordcloud.to_file('place.jpg')


def main():
    # load the cleaned data
    data, jobName, locality, minSalary, maxSalary = readFile()
    # run the analyses
    salary_locality(data)
    jobTitle(jobName)
    localityWordCloud(locality)


if __name__ == '__main__':
    main()
The resulting charts and word clouds appear as images in the original post (omitted here).
Conclusions
1) Top three cities by number of positions: Shanghai, Guangzhou, Shenzhen.
2) Top three places by average salary: Fujian, Chuzhou, Sanya.
3) Cities with high demand for big-data positions do not necessarily pay well. On the contrary, in this data set the cities with the most big-data openings tend to have lower average salaries, while cities with few openings tend to have higher ones (a quick way to test this is sketched after this list).
4) The most in-demand big-data job title: 开发工程师 (development engineer).
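Conclusion 3 can be checked quantitatively with a rank correlation between each city's position count and its average salary. A minimal sketch, assuming scipy is installed and cleaned_51jobs.csv is present; a clearly negative coefficient would support the claim:

# Rank-correlate position count with average salary per city (check of conclusion 3).
import csv
from collections import defaultdict
from scipy.stats import spearmanr

counts, sums = defaultdict(int), defaultdict(float)
with open('cleaned_51jobs.csv', encoding='gbk') as f:
    reader = csv.reader(f)
    next(reader)  # skip the header
    for _title, city, min_sa, max_sa, _company, _date in reader:
        counts[city] += 1
        sums[city] += (float(min_sa) + float(max_sa)) / 2  # midpoint of the range

cities = list(counts)
rho, p = spearmanr([counts[c] for c in cities],
                   [sums[c] / counts[c] for c in cities])
print('Spearman rho = %.2f (p = %.3f)' % (rho, p))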
Source: https://www.cnblogs.com/meixianhe/p/11990096.html