Scraping Lagou Job Listings with Python, Generating an Excel File, and Sending It by Email

lusen1987 2018-09-30 22:29

This code covers three main pieces of functionality: web scraping with Python, generating an Excel file with Python, and sending email with Python.

With Python we can scrape Lagou's job listings. First, open the browser's developer tools, switch to the Network tab, and filter for XHR requests; there you will find the URL of Lagou's Ajax request (the positionAjax.json request used in the code below).

Next, look at the POST form data. Three parameters are sent: kd is the search keyword, pn is the page number, and the third is first.

The response is JSON and contains the job listings for the current page:
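For reference, here is a rough sketch of the shape of that response. The field names are the ones the code below relies on; the values are omitted, and the real payload carries many more fields:

{
    "content": {
        "positionResult": {
            "totalCount": ...,   # total number of matching positions
            "resultSize": ...,   # number of positions per page
            "result": [          # the positions on this page
                {
                    "positionId": ...,   # used to build the detail-page URL
                    "positionName": ...,
                    "companyFullName": ...,
                    "salary": ...,
                    ...                  # plus the other fields written to Excel in part 2
                }
            ]
        }
    }
}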

Open a detail page and look at its URL: the front part is fixed, and the rest is the position's positionId from the returned JSON followed by .html, i.e. https://www.lagou.com/jobs/{positionId}.html.

Now the scraping can begin. Based on the requirements, the code is split into four parts: part 1 is a function that calls the Ajax URL and returns the data for the current page; part 2 turns the returned job data into an Excel file, calling a detail-page scraper (which takes the position's positionId) to get the work address; part 3 emails the resulting Excel file as an attachment; part 4 is the main block that calls everything. A sketch of the detail-page markup that the address parser in part 2 relies on follows below.
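That address parser is driven entirely by the structure of the detail page: it looks for a container with class work_addr, joins the text of its rel="nofollow" links (skipping the last one), and appends a text node taken from near the end of the container as the street address. A rough, assumed sketch of what that markup looks like (placeholders only, not copied from the live page):

<div class="work_addr">
    <a rel="nofollow" href="...">Province</a>
    <a rel="nofollow" href="...">City</a>
    <a rel="nofollow" href="...">District</a>
    street address text
    <a rel="nofollow" href="...">view map</a>
</div>

If Lagou changes this markup, column 12 of the Excel file is the first thing that will break.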

# 1. Send a POST request to the Ajax URL and return the JSON data for one page:
import requests
import openpyxl
from openpyxl.styles import Font
from bs4 import BeautifulSoup  # needed for parsing the detail page in part 2
import time
import os
import smtplib
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
from email.header import Header
import math


def get_one_page(kw, city, pageNo):
    """
    kw: job search keyword
    city: city name
    pageNo: page number to fetch
    """
    url = "https://www.lagou.com/jobs/positionAjax.json?px=default&city=" + city + "&needAddtionalResult=false"
    # Headers copied from a real browser session; replace the Cookie with your own if requests get rejected
    headers = {
        "Cookie": "WEBTJ-ID=2018-9-30085456-16627f89d092b2-0aaa3c837f5b6a-4d045769-2073600-16627f89d0a35; user_trace_token=20180930085238-207c3c8d-c44b-11e8-bb68-5254005c3644; LGUID=20180930085238-207c40e7-c44b-11e8-bb68-5254005c3644; index_location_city=%E5%85%A8%E5%9B%BD; JSESSIONID=ABAAABAAADEAAFI227067DA2A0D7AD4FDE28079864FCB1E; _gat=1; PRE_UTM=; PRE_HOST=www.hao123.com; PRE_SITE=https%3A%2F%2Fwww.hao123.com%2F; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2F; TG-TRACK-CODE=index_search; _gid=GA1.2.234281082.1538268897; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1538269092,1538273622,1538273640,1538290427; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1538290438; _ga=GA1.2.631515347.1538268897; LGSID=20180930145128-414a5532-c47d-11e8-a84f-525400f775ce; LGRID=20180930145138-47a12048-c47d-11e8-a84f-525400f775ce; SEARCH_ID=14da97091c164a25bcac51d60de7c782",
        "Referer": "https://www.lagou.com/jobs/list_python?px=default&city=%E7%83%9F%E5%8F%B0",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0"
    }
    # Form data as observed in the browser's Network tab
    data = {
        "first": "true",
        "pn": pageNo,
        "kd": kw
    }
    try:
        rsp = requests.post(url, data=data, headers=headers)
        if rsp.status_code == 200:
            return rsp.json()
    except Exception as ex:
        print(ex)

# Fetch the raw HTML of a job's detail page, given its positionId
def get_detail_page(positionId):
    url = "https://www.lagou.com/jobs/{0}.html".format(positionId)
    headers = {
        "Cookie": "WEBTJ-ID=2018-9-30085456-16627f89d092b2-0aaa3c837f5b6a-4d045769-2073600-16627f89d0a35; user_trace_token=20180930085238-207c3c8d-c44b-11e8-bb68-5254005c3644; LGUID=20180930085238-207c40e7-c44b-11e8-bb68-5254005c3644; index_location_city=%E5%85%A8%E5%9B%BD; JSESSIONID=ABAAABAAADEAAFI227067DA2A0D7AD4FDE28079864FCB1E; _gat=1; PRE_UTM=; PRE_HOST=www.hao123.com; PRE_SITE=https%3A%2F%2Fwww.hao123.com%2F; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2F; TG-TRACK-CODE=index_search; _gid=GA1.2.234281082.1538268897; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1538269092,1538273622,1538273640,1538290427; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1538290438; _ga=GA1.2.631515347.1538268897; LGSID=20180930145128-414a5532-c47d-11e8-a84f-525400f775ce; LGRID=20180930145138-47a12048-c47d-11e8-a84f-525400f775ce; SEARCH_ID=14da97091c164a25bcac51d60de7c782",
        "Referer": "https://www.lagou.com/jobs/list_python?px=default&city=%E7%83%9F%E5%8F%B0",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0"
    }
    try:
        rsp = requests.get(url=url, headers=headers)
        if rsp.status_code == 200:
            return rsp.content.decode()
    except Exception as ex:
        print(ex)


# 2. Write the job data from the JSON response into an Excel file:
def to_excel(json_data, filename):
    # Create the workbook and sheet objects (note the capital W in Workbook)
    workbook = openpyxl.Workbook()
    sheet = workbook.active
    ft = Font(name="宋体", size=12)

    # Header row
    titles = ["公司名称", "公司标签", "公司简称", "创建时间", "地区", "学历",
              "职位标签", "职位诱惑", "职位名称", "薪水", "工作年限", "工作地址"]
    for col, title in enumerate(titles, start=1):
        sheet.cell(row=1, column=col).font = ft
        sheet.cell(row=1, column=col).value = title

    # Data rows
    for index, item in enumerate(json_data):
        row = index + 2

        # Fetch the detail page and parse the work address out of its HTML
        html = get_detail_page(item["positionId"])
        soup = BeautifulSoup(html.replace("\n", ""), 'lxml')
        address = soup.find(class_="work_addr").find_all(rel="nofollow")
        addressStr = ""
        for i in range(len(address) - 1):
            if address[i].string is not None:
                addressStr += address[i].string
        contents = soup.find(class_="work_addr").contents
        # The third-to-last child of the work_addr node holds the street address text
        work_addr = addressStr + contents[len(contents) - 3].string.strip()

        values = [
            item["companyFullName"],
            "".join(item["companyLabelList"]),
            item["companyShortName"],
            item["createTime"],
            item["district"],
            item["education"],
            "".join(item["industryLables"]),
            item["positionAdvantage"],
            item["positionName"],
            item["salary"],
            item["workYear"],
            work_addr,
        ]
        for col, value in enumerate(values, start=1):
            sheet.cell(row=row, column=col).font = ft
            sheet.cell(row=row, column=col).value = value

    workbook.save(filename=filename + '.xlsx')
# 3. Email the job-listings Excel file as an attachment:

# Sender's account and password (for QQ mail, the SMTP authorization code); replace with your own
sender = 'xxx@qq.com'
sender_pwd = "xxx"
# Recipients; set to your QQ mailbox or any other mailbox
receivers = ['xxx@qq.com']
# SMTP server address
smtp_srv = "smtp.qq.com"

# Send an email with attachments
def sendMIMEMultipartEmail(fileList):
    # Create a multipart message so attachments can be added
    message = MIMEMultipart()
    message['From'] = Header(sender)
    message['To'] = Header(",".join(receivers))
    subject = '拉勾网职位信息'
    message['Subject'] = Header(subject, 'utf-8')
    # Plain-text body
    message.attach(MIMEText('拉勾网招聘职位信息', 'plain', 'utf-8'))

    for item in fileList:
        # Build an attachment for each file in fileList
        att = MIMEText(open(item, 'rb').read(), 'base64', 'utf-8')
        att["Content-Type"] = 'application/octet-stream'
        # The filename set here is the name shown in the recipient's email client
        att["Content-Disposition"] = 'attachment; filename=result.xlsx'
        message.attach(att)

    try:
        smtp = smtplib.SMTP()
        smtp.connect(smtp_srv, 25)
        # Log in to the mailbox
        smtp.login(sender, sender_pwd)
        # Send the email
        smtp.sendmail(sender, receivers, message.as_string())
        print("邮件发送成功")
    except smtplib.SMTPException as ex:
        print("Error: 无法发送邮件")
        print(ex)

# 4. The main block that calls everything:
if __name__ == "__main__":
    """
    This example fetches the listings for search keyword "Python" in the city 济南 (Jinan).
    The first call requests page 1 and is used to work out the total number of pages, i.e.
    json_data["content"]["positionResult"]["totalCount"] / json_data["content"]["positionResult"]["resultSize"],
    then the remaining pages are scraped in a loop starting from page 2.
    """
    json_data = get_one_page("Python", "济南", 1)
    positionResult = json_data["content"]["positionResult"]["result"]
    positionList = positionResult
    count = math.ceil(json_data["content"]["positionResult"]["totalCount"] / json_data["content"]["positionResult"]["resultSize"])
    for i in range(2, count + 1):
        json_data = get_one_page("Python", "济南", i)
        positionResult = json_data["content"]["positionResult"]["result"]
        positionList += positionResult
    # Write everything to result.xlsx, then mail it as an attachment
    to_excel(positionList, "result")
    time.sleep(3)
    fileList = [os.path.join(os.getcwd(), "result.xlsx")]
    sendMIMEMultipartEmail(fileList)

Full code: https://gitee.com/ZiSeWuDao/Python/blob/master/Spider/lagouStandAlone.py
