Python爬虫爬取拉勾网职位信息,生成Excel并通过邮件发送

北慕城南 提交于 2020-01-03 02:59:54

此代码包含了Python爬虫、Python生成Excel和Python发送邮件3部分主要功能。

利用Python,可以爬取拉勾网的职位信息,首先,通过浏览器的开发者工具,打开Network选项卡,筛选XHR类型的请求,我们可以找到拉勾网Ajax异步请求的url地址,也就是图中红框标记的位置

然后观察post参数的值,可以发现传递了3个参数,kd为搜索的关键字,pn为页码,见图中红框

 

再看返回的是json格式的数据,包含了列表页的职位信息:

打开详情页,观察页面的URL地址,发现前面部分是固定的,后面是返回的json数据中职位的positionId.html

于是可以开始爬取了,根据功能需求,分如下4个部分:第一部分是根据Ajax调用地址获取当前页返回数据的方法,第二部分是根据返回的职位信息生成Excel,其中调用了爬取详情页的方法用于获取工作地址,传入的参数为该职位的positionId,第三部分是将得到的Excel发送邮件,第四部分是main方法调用

# 1. Send a POST request to the Ajax endpoint and return the parsed JSON payload.
import requests
import openpyxl
from openpyxl.styles import Font
from bs4 import BeautifulSoup  # FIX: used by to_excel() below but was never imported
import time
import os
import smtplib
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
from email.header import Header
import math


def get_one_page(kw, city, pageNo):
    """Fetch one page of job listings from Lagou's Ajax search API.

    Args:
        kw: job-search keyword (e.g. "Python").
        city: city name used to filter the listings.
        pageNo: 1-based page number.

    Returns:
        The decoded JSON response as a dict, or None when the request
        fails or the server does not answer with HTTP 200.
    """
    url = "https://www.lagou.com/jobs/positionAjax.json?px=default&city=" + city + "&needAddtionalResult=false"
    # NOTE(review): the Cookie below is a captured browser session; Lagou
    # rejects anonymous Ajax requests, so it must be refreshed when expired.
    headers = {
        "Cookie": "WEBTJ-ID=2018-9-30085456-16627f89d092b2-0aaa3c837f5b6a-4d045769-2073600-16627f89d0a35; user_trace_token=20180930085238-207c3c8d-c44b-11e8-bb68-5254005c3644; LGUID=20180930085238-207c40e7-c44b-11e8-bb68-5254005c3644; index_location_city=%E5%85%A8%E5%9B%BD; JSESSIONID=ABAAABAAADEAAFI227067DA2A0D7AD4FDE28079864FCB1E; _gat=1; PRE_UTM=; PRE_HOST=www.hao123.com; PRE_SITE=https%3A%2F%2Fwww.hao123.com%2F; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2F; TG-TRACK-CODE=index_search; _gid=GA1.2.234281082.1538268897; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1538269092,1538273622,1538273640,1538290427; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1538290438; _ga=GA1.2.631515347.1538268897; LGSID=20180930145128-414a5532-c47d-11e8-a84f-525400f775ce; LGRID=20180930145138-47a12048-c47d-11e8-a84f-525400f775ce; SEARCH_ID=14da97091c164a25bcac51d60de7c782",
        "Referer": "https://www.lagou.com/jobs/list_python?px=default&city=%E7%83%9F%E5%8F%B0",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0"
    }
    data = {
        "first": "true",
        "pn": pageNo,
        "kd": kw
    }
    try:
        # FIX: a timeout is mandatory for network calls; without one a stalled
        # connection would hang the crawler forever.
        rsp = requests.post(url, data=data, headers=headers, timeout=10)
        if rsp.status_code == 200:
            return rsp.json()
    except Exception as ex:
        print(ex)
    return None  # explicit: request failed or non-200 status
# Fetch the raw HTML of a single job's detail page.
def get_detail_page(positionId):
    """Download the detail page for one position.

    Args:
        positionId: the numeric position id taken from the list-page JSON;
            the detail URL is https://www.lagou.com/jobs/<positionId>.html.

    Returns:
        The decoded HTML text, or None when the request fails or the
        server does not answer with HTTP 200.
    """
    url = "https://www.lagou.com/jobs/{0}.html".format(positionId)
    # Same captured session cookie as get_one_page -- must stay valid.
    headers = {
        "Cookie": "WEBTJ-ID=2018-9-30085456-16627f89d092b2-0aaa3c837f5b6a-4d045769-2073600-16627f89d0a35; user_trace_token=20180930085238-207c3c8d-c44b-11e8-bb68-5254005c3644; LGUID=20180930085238-207c40e7-c44b-11e8-bb68-5254005c3644; index_location_city=%E5%85%A8%E5%9B%BD; JSESSIONID=ABAAABAAADEAAFI227067DA2A0D7AD4FDE28079864FCB1E; _gat=1; PRE_UTM=; PRE_HOST=www.hao123.com; PRE_SITE=https%3A%2F%2Fwww.hao123.com%2F; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2F; TG-TRACK-CODE=index_search; _gid=GA1.2.234281082.1538268897; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1538269092,1538273622,1538273640,1538290427; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1538290438; _ga=GA1.2.631515347.1538268897; LGSID=20180930145128-414a5532-c47d-11e8-a84f-525400f775ce; LGRID=20180930145138-47a12048-c47d-11e8-a84f-525400f775ce; SEARCH_ID=14da97091c164a25bcac51d60de7c782",
        "Referer": "https://www.lagou.com/jobs/list_python?px=default&city=%E7%83%9F%E5%8F%B0",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0"
    }
    try:
        # FIX: added timeout so a stalled connection cannot hang the crawler.
        rsp = requests.get(url=url, headers=headers, timeout=10)
        if rsp.status_code == 200:
            return rsp.content.decode()
    except Exception as ex:
        print(ex)
    return None  # explicit: request failed or non-200 status
# 2. Generate an Excel workbook from the position info extracted from the JSON data.
def to_excel(json_data, filename):
    """Write the scraped position list to <filename>.xlsx.

    Args:
        json_data: list of position dicts taken from the Ajax response
            (the content.positionResult.result items).
        filename: output file name WITHOUT the .xlsx extension.
    """
    # Create workbook and sheet objects (Workbook: capital W, it is a class).
    workbook = openpyxl.Workbook()
    sheet = workbook.active
    ft = Font(name="宋体", size=12)
    positionResult = json_data

    # Header row. (Refactored: the original repeated ~24 sheet.cell statements.)
    titles = ["公司名称", "公司标签", "公司简称", "创建时间", "地区", "学历",
              "职位标签", "职位诱惑", "职位名称", "薪水", "工作年限", "工作地址"]
    for col, title in enumerate(titles, start=1):
        cell = sheet.cell(row=1, column=col)
        cell.font = ft
        cell.value = title

    # Value extractors for columns 1-11; column 12 (work address) needs an
    # extra detail-page request and is handled separately in the loop below.
    extractors = [
        lambda item: item["companyFullName"],
        lambda item: "".join(item["companyLabelList"]),
        lambda item: item["companyShortName"],
        lambda item: item["createTime"],
        lambda item: item["district"],
        lambda item: item["education"],
        lambda item: "".join(item["industryLables"]),  # (sic) key is misspelled in the API itself
        lambda item: item["positionAdvantage"],
        lambda item: item["positionName"],
        lambda item: item["salary"],
        lambda item: item["workYear"],
    ]

    for index, item in enumerate(positionResult):
        row = index + 2  # row 1 is the header
        for col, extract in enumerate(extractors, start=1):
            cell = sheet.cell(row=row, column=col)
            cell.font = ft
            cell.value = extract(item)

        # Column 12: scrape the detail page for the full work address.
        sheet.cell(row=row, column=12).font = ft
        html = get_detail_page(item["positionId"])
        if html is None:
            # FIX: the detail request can fail; leave the address cell empty
            # instead of crashing on None.replace(...).
            continue
        soup = BeautifulSoup(html.replace("\n", ""), 'lxml')
        addr_node = soup.find(class_="work_addr")
        # All but the last nofollow link are the region parts (province/city/district).
        links = addr_node.find_all(rel="nofollow")
        addressStr = "".join(
            a.string for a in links[:-1] if a.string is not None
        )
        # The street part is the third-from-last child of the address node
        # -- TODO confirm against the current Lagou page layout.
        length = len(addr_node.contents)
        sheet.cell(row=row, column=12).value = (
            addressStr + addr_node.contents[length - 3].string.strip()
        )

    workbook.save(filename=filename + '.xlsx')
# 3. Email the Excel file with the position info as an attachment.
# Sender account and password -- replace with your own.
sender = 'xxx@qq.com'
sender_pwd = "xxx"
# Recipients; may be your QQ mailbox or any other address.
receivers = ['xxx@qq.com']
# SMTP server address.
smtp_srv = "smtp.qq.com"


# Send an email with attachments.
def sendMIMEMultipartEmail(fileList):
    """Send one email with every file in fileList attached.

    Args:
        fileList: list of file paths to attach (read in binary mode).

    Errors from the SMTP layer are caught and printed, not re-raised.
    """
    # Build a multipart message: plain-text body + one attachment per file.
    message = MIMEMultipart()
    message['From'] = Header(sender)
    # FIX: 'To' previously carried the subject text; it must name the recipients.
    message['To'] = Header(",".join(receivers), 'utf-8')
    subject = '拉勾网职位信息'
    message['Subject'] = Header(subject, 'utf-8')
    # Mail body text.
    message.attach(MIMEText('拉勾网招聘职位信息', 'plain', 'utf-8'))
    for item in fileList:
        # FIX: use a context manager so the file handle is always closed.
        with open(item, 'rb') as fp:
            att = MIMEText(fp.read(), 'base64', 'utf-8')
        att["Content-Type"] = 'application/octet-stream'
        # filename here is what the mail client displays, not the real path.
        att["Content-Disposition"] = 'attachment; filename=result.xlsx'
        message.attach(att)
    smtp = None
    try:
        smtp = smtplib.SMTP()
        # NOTE(review): port 25 / plain SMTP; QQ mail usually requires
        # SMTP_SSL on port 465 -- confirm against your account settings.
        smtp.connect(smtp_srv, 25)
        smtp.login(sender, sender_pwd)
        smtp.sendmail(sender, receivers, message.as_string())
        print("邮件发送成功")
    except smtplib.SMTPException as ex:
        print("Error: 无法发送邮件")
        print(ex)
    finally:
        # FIX: always release the connection (close() is safe even when
        # connect() never succeeded, unlike quit()).
        if smtp is not None:
            smtp.close()
# 4. Invocation from the main entry point:
if __name__ == "__main__":
    # Example run: crawl Python positions for the city of Jinan.
    # The first call (page 1) also yields the total page count:
    #   ceil(totalCount / resultSize)
    # after which pages 2..count are fetched in a loop.
    json_data = get_one_page("Python", "济南", 1)
    if json_data is None:
        # FIX: fail fast with a message instead of a TypeError on None.
        raise SystemExit("Failed to fetch page 1 -- check cookies/network.")
    positionResult = json_data["content"]["positionResult"]["result"]
    positionList = positionResult
    count = math.ceil(
        json_data["content"]["positionResult"]["totalCount"]
        / json_data["content"]["positionResult"]["resultSize"]
    )
    for page in range(2, count + 1):
        json_data = get_one_page("Python", "济南", page)
        if json_data is None:
            # Skip pages that failed to download rather than crashing.
            continue
        positionList += json_data["content"]["positionResult"]["result"]
    to_excel(positionList, "result")
    time.sleep(3)  # brief pause so the workbook is fully flushed before attaching
    # FIX: os.path.join is portable; the original hard-coded a Windows "\" path.
    fileList = [os.path.join(os.getcwd(), "result.xlsx")]
    sendMIMEMultipartEmail(fileList)

 完整代码:https://gitee.com/ZiSeWuDao/Python/blob/master/Spider/lagouStandAlone.py

标签
易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!