此代码包含了Python爬虫、Python生成Excel和Python发送邮件3部分主要功能。
利用Python,可以爬取拉勾网的职位信息,首先,通过浏览器的开发者工具,打开Network选项卡,筛选XHR类型的请求,我们可以找到拉勾网Ajax异步请求的url地址,也就是图中红框标记的位置
然后观察post参数的值,可以发现传递了3个参数,first表示是否为第一页,kd为搜索的关键字,pn为页码,见图中红框
再看返回的是json格式的数据,包含了列表页的职位信息:
打开详情页,观察页面的URL地址,发现前面部分是固定的,后面是返回的json数据中职位的positionId再加上.html后缀
于是可以开始爬取了,根据功能需求,分如下4个部分:第一部分是根据Ajax调用地址获取当前页返回数据的方法,第二部分是根据返回的职位信息生成Excel,其中调用了爬取详情页的方法用于获取工作地址,传入的参数为该职位的positionId,第三部分是将得到的Excel发送邮件,第四部分是main方法调用
#1、根据url发送post请求,得到返回的json格式的数据,代码如下:import requestsimport openpyxlfrom openpyxl.styles import Fontimport timeimport osimport smtplibfrom email.mime.text import MIMETextfrom email.mime.multipart import MIMEMultipartfrom email.header import Headerimport math""" kw: 职位搜索关键字 city: 城市 pageNo: 第几页"""def get_one_page(kw, city, pageNo): url = "https://www.lagou.com/jobs/positionAjax.json?px=default&city=" + city + "&needAddtionalResult=false" headers = { "Cookie":"WEBTJ-ID=2018-9-30085456-16627f89d092b2-0aaa3c837f5b6a-4d045769-2073600-16627f89d0a35; user_trace_token=20180930085238-207c3c8d-c44b-11e8-bb68-5254005c3644; LGUID=20180930085238-207c40e7-c44b-11e8-bb68-5254005c3644; index_location_city=%E5%85%A8%E5%9B%BD; JSESSIONID=ABAAABAAADEAAFI227067DA2A0D7AD4FDE28079864FCB1E; _gat=1; PRE_UTM=; PRE_HOST=www.hao123.com; PRE_SITE=https%3A%2F%2Fwww.hao123.com%2F; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2F; TG-TRACK-CODE=index_search; _gid=GA1.2.234281082.1538268897; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1538269092,1538273622,1538273640,1538290427; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1538290438; _ga=GA1.2.631515347.1538268897; LGSID=20180930145128-414a5532-c47d-11e8-a84f-525400f775ce; LGRID=20180930145138-47a12048-c47d-11e8-a84f-525400f775ce; SEARCH_ID=14da97091c164a25bcac51d60de7c782", "Referer":"https://www.lagou.com/jobs/list_python?px=default&city=%E7%83%9F%E5%8F%B0", "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0" } data = { "first": "true", "pn": pageNo, "kd": kw } try: rsp = requests.post(url, data=data, headers=headers) if rsp.status_code == 200: return rsp.json() except Exception as ex: print(ex)
# Fetch the raw HTML of a position's detail page.
def get_detail_page(positionId):
    """Download the detail page for a position and return its HTML text.

    positionId: numeric id taken from the list-page JSON; the detail URL is
                the fixed prefix plus "<positionId>.html".

    Returns the decoded HTML string on HTTP 200, otherwise None.
    """
    url = "https://www.lagou.com/jobs/{0}.html".format(positionId)
    # Same session headers as the list request; required to avoid rejection.
    headers = {
        "Cookie": "WEBTJ-ID=2018-9-30085456-16627f89d092b2-0aaa3c837f5b6a-4d045769-2073600-16627f89d0a35; user_trace_token=20180930085238-207c3c8d-c44b-11e8-bb68-5254005c3644; LGUID=20180930085238-207c40e7-c44b-11e8-bb68-5254005c3644; index_location_city=%E5%85%A8%E5%9B%BD; JSESSIONID=ABAAABAAADEAAFI227067DA2A0D7AD4FDE28079864FCB1E; _gat=1; PRE_UTM=; PRE_HOST=www.hao123.com; PRE_SITE=https%3A%2F%2Fwww.hao123.com%2F; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2F; TG-TRACK-CODE=index_search; _gid=GA1.2.234281082.1538268897; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1538269092,1538273622,1538273640,1538290427; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1538290438; _ga=GA1.2.631515347.1538268897; LGSID=20180930145128-414a5532-c47d-11e8-a84f-525400f775ce; LGRID=20180930145138-47a12048-c47d-11e8-a84f-525400f775ce; SEARCH_ID=14da97091c164a25bcac51d60de7c782",
        "Referer": "https://www.lagou.com/jobs/list_python?px=default&city=%E7%83%9F%E5%8F%B0",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0"
    }
    try:
        # FIX: added a timeout so a stalled detail request cannot hang the run.
        rsp = requests.get(url=url, headers=headers, timeout=10)
        if rsp.status_code == 200:
            return rsp.content.decode()
    except Exception as ex:
        print(ex)
    # Explicit None on failure (the original fell off the end implicitly).
    return None
#2、将获取到json数据得到的职位信息生成Excel:
def to_excel(json_data, filename):
    """Write the collected position records into <filename>.xlsx.

    json_data: list of position dicts from the Ajax list-page response
    filename:  output file name WITHOUT the .xlsx extension

    Column 12 (work address) is scraped from each position's detail page
    via get_detail_page(), so this makes one HTTP request per row.
    """
    # NOTE(review): BeautifulSoup is used below but was never imported
    # anywhere in the original file (a latent NameError); importing it
    # locally keeps this function self-contained.
    from bs4 import BeautifulSoup

    # Create workbook and sheet objects (Workbook starts with a capital W).
    workbook = openpyxl.Workbook()
    sheet = workbook.active
    ft = Font(name="宋体", size=12)

    # Header row.
    headers = ["公司名称", "公司标签", "公司简称", "创建时间", "地区", "学历",
               "职位标签", "职位诱惑", "职位名称", "薪水", "工作年限", "工作地址"]
    for col, title in enumerate(headers, start=1):
        cell = sheet.cell(row=1, column=col)
        cell.font = ft
        cell.value = title

    # Value extractors for columns 1-11, in column order; column 12 needs a
    # detail-page request and is handled separately below.
    extractors = [
        lambda item: item["companyFullName"],
        lambda item: "".join(item["companyLabelList"]),
        lambda item: item["companyShortName"],
        lambda item: item["createTime"],
        lambda item: item["district"],
        lambda item: item["education"],
        lambda item: "".join(item["industryLables"]),
        lambda item: item["positionAdvantage"],
        lambda item: item["positionName"],
        lambda item: item["salary"],
        lambda item: item["workYear"],
    ]

    def _work_address(positionId):
        # Scrape the detail page and assemble the job's street address.
        html = get_detail_page(positionId)
        soup = BeautifulSoup(html.replace("\n", ""), 'lxml')
        addr_node = soup.find(class_="work_addr")
        anchors = addr_node.find_all(rel="nofollow")
        # Every anchor except the last contributes a text fragment
        # (the last one is the "view map" link).
        parts = [a.string for a in anchors[:-1] if a.string is not None]
        # The street text sits third from the end of the node's children
        # (contents[-3] == contents[len-3] in the original).
        return "".join(parts) + addr_node.contents[-3].string.strip()

    # Data rows start at row 2, one row per position.
    for row, item in enumerate(json_data, start=2):
        for col, extract in enumerate(extractors, start=1):
            cell = sheet.cell(row=row, column=col)
            cell.font = ft
            cell.value = extract(item)
        cell = sheet.cell(row=row, column=12)
        cell.font = ft
        cell.value = _work_address(item["positionId"])

    workbook.save(filename=filename + '.xlsx')
#3、将职位信息的Excel作为附件发送邮件:# 寄信人邮箱账号和密码,替换成你自己的
# 3. Email the generated Excel workbook as an attachment.
# Sender account and password -- replace with your own.
sender = 'xxx@qq.com'
sender_pwd = "xxx"
# Recipients: set to your QQ mailbox or any other address.
receivers = ['xxx@qq.com']
# SMTP server address.
smtp_srv = "smtp.qq.com"


# Send an email carrying the given files as attachments.
def sendMIMEMultipartEmail(fileList):
    """Send one email with every path in fileList attached.

    fileList: iterable of local file paths to attach.
    Errors from the SMTP layer are caught and printed, not raised.
    """
    # Multipart container: text body + one part per attachment.
    message = MIMEMultipart()
    message['From'] = Header(sender)
    # BUG FIX: the original put the subject text into the To header;
    # To must list the recipient addresses.
    message['To'] = Header(", ".join(receivers))
    subject = '拉勾网职位信息'
    message['Subject'] = Header(subject, 'utf-8')
    # Plain-text body.
    message.attach(MIMEText('拉勾网招聘职位信息', 'plain', 'utf-8'))
    for item in fileList:
        # FIX: context manager closes the file handle (the original leaked it).
        with open(item, 'rb') as fp:
            att = MIMEText(fp.read(), 'base64', 'utf-8')
        att["Content-Type"] = 'application/octet-stream'
        # FIX: use the real file name so multiple attachments stay
        # distinguishable (the original hard-coded "result.xlsx" for all).
        att["Content-Disposition"] = ('attachment; filename='
                                      + os.path.basename(item))
        message.attach(att)
    smtp = smtplib.SMTP()
    try:
        smtp.connect(smtp_srv, 25)
        # Log in and send.
        smtp.login(sender, sender_pwd)
        smtp.sendmail(sender, receivers, message.as_string())
        print("邮件发送成功")
    except smtplib.SMTPException as ex:
        print("Error: 无法发送邮件")
        print(ex)
    finally:
        # FIX: always close the SMTP connection (the original never did).
        try:
            smtp.quit()
        except Exception:
            pass


# 4. Invocation from main:
if __name__ == "__main__":
    """
    Crawl the "Python" positions for the city 济南.
    The first request (page 1) also yields the total page count:
      ceil(totalCount / resultSize)
    then pages 2..count are fetched in a loop.
    """
    json_data = get_one_page("Python", "济南", 1)
    # FIX: get_one_page returns None on failure; the original crashed with a
    # TypeError here instead of reporting the problem.
    if json_data is None:
        raise SystemExit("failed to fetch page 1")
    positionResult = json_data["content"]["positionResult"]["result"]
    positionList = positionResult
    count = math.ceil(json_data["content"]["positionResult"]["totalCount"]
                      / json_data["content"]["positionResult"]["resultSize"])
    for i in range(2, count + 1):
        json_data = get_one_page("Python", "济南", i)
        # FIX: skip pages whose request failed instead of crashing.
        if json_data is None:
            continue
        positionResult = json_data["content"]["positionResult"]["result"]
        positionList += positionResult
        # print(json_data)
    # print(positionList)
    to_excel(positionList, "result")
    time.sleep(3)
    # FIX: os.path.join instead of the Windows-only r"\result.xlsx"
    # concatenation, so the script also runs on Linux/macOS.
    fileList = [os.path.join(os.getcwd(), "result.xlsx")]
    sendMIMEMultipartEmail(fileList)
完整代码:https://gitee.com/ZiSeWuDao/Python/blob/master/Spider/lagouStandAlone.py
来源:https://www.cnblogs.com/lusen1987/p/9733529.html