算法运行订单的spark

ぐ巨炮叔叔 提交于 2019-12-14 21:35:43

 

su hdfs

cd /var/lib/hadoop-hdfs/bowen_git/algo-offline-job/person_classification

PYTHONPATH=./ /usr/bin/spark-submit --queue A ./opay_order_info.py prod

 

# -*- coding: utf-8 -*-
import sys
import time
import json
from init_spark import init_spark
import csv
from util import *
from datetime import datetime
import math


# Hive SQL template (format with dt=<partition date>): per-phone-number
# lifetime order aggregates for opay users, left-joining the opay user base
# (t3) against oride order history rolled up by passenger phone number (t4).
passenger_order_query = '''
select
t3.phone_number,
last_order_city_id,
first_create_time,
last_create_time,
first_finish_time,
last_finish_time,
if(total_create_num is not null,total_create_num,0) as total_create_num,
if(total_finish_num is not null,total_finish_num,0) as total_finish_num,
if(total_price is not null,total_price,0) as total_price,
total_price/total_finish_num as avg_will_pay,
ceil((unix_timestamp()-last_finish_time)/86400) as last_finish_day,
ceil((last_finish_time-first_finish_time)/86400)/total_finish_num as finish_frequency,
ceil((last_create_time-first_create_time)/86400)/total_create_num as create_frequency,
oride_finish_num,
otrike_carpool_finish_num,
otrike_charter_finish_num,
ocar_finish_num
from
(
select substr(mobile,-10,10) as phone_number
from opay_dw_ods.ods_sqoop_base_user_di
where dt='{dt}'
) as t3
left join
(
select
phone_number,
min(first_create_time) as first_create_time,
max(last_create_time) as last_create_time,
min(first_finish_time) as first_finish_time,
max(last_finish_time) as last_finish_time,
sum(total_create_num) as total_create_num,
sum(total_finish_num) as total_finish_num,
sum(total_price) as total_price,
max(last_order_city_id) as last_order_city_id,
sum(oride_finish_num) as oride_finish_num,
sum(otrike_carpool_finish_num) as otrike_carpool_finish_num,
sum(otrike_charter_finish_num) as otrike_charter_finish_num,
sum(ocar_finish_num) as ocar_finish_num
from
(
select passenger_id,substr(phone_number,-10,10) as phone_number
from oride_dw.dim_oride_passenger_base
where dt = '{dt}'
) as t1
left join
(
select passenger_id,
min(create_time) as first_create_time,
max(create_time) as last_create_time,
min(case when status in (4,5) then if(status=4,arrive_time,finish_time) end) as first_finish_time,
max(case when status in (4,5) then if(status=4,arrive_time,finish_time) end) as last_finish_time,
count(*) as total_create_num,
-- FIX: the enclosing query aggregates the next four columns, but the
-- original subquery never produced them, so the statement could not run.
sum(if(status in (4,5),1,0)) as total_finish_num,
-- NOTE(review): assumes the fare column is named price -- confirm schema
sum(if(status in (4,5),price,0)) as total_price,
-- NOTE(review): assumes oride rides are product_id = 1 (otrike = 3 and
-- ocar = 4 per the existing lines below) -- confirm product_id mapping
sum(if(status in (4,5) and product_id = 1,1,0)) as oride_finish_num,
sum(if(status in (4,5) and product_id = 3 and is_carpool=1,1,0)) as otrike_carpool_finish_num,
max(city_id) as last_order_city_id,
sum(if(status in (4,5) and product_id = 3 and is_carpool=0,1,0)) as otrike_charter_finish_num,
sum(if(status in (4,5) and product_id = 4,1,0)) as ocar_finish_num
-- NOTE(review): no dt filter here, so the full order history is scanned;
-- presumably intentional for lifetime totals -- confirm
from oride_dw.dwd_oride_order_base_include_test_di
group by passenger_id
) as t2
on t1.passenger_id=t2.passenger_id
group by phone_number
) as t4
on t3.phone_number=t4.phone_number
'''

def insert_passenger_data(hql, map_fuc, dt):
    """Run *hql* for partition *dt*, serialize each Row to a JSON line, and
    write the result as gzipped text to the opay passenger-finish-info path.

    Args:
        hql: Hive SQL template containing a ``{dt}`` placeholder.
        map_fuc: callable mapping a Row to a JSON string, or to None to
            drop the row.
        dt: partition date string, e.g. ``2019-12-13``.

    Uses the module-level ``spark`` session created in ``__main__``.
    """
    # Rows mapped to None (e.g. non-numeric phone numbers) are filtered out.
    res = spark.sql(hql.format(dt=dt)) \
        .rdd.map(map_fuc).filter(lambda x: x is not None) \
        .cache()
    res.repartition(10).saveAsTextFile(
        path='ufile://opay-datalake/oride-research/total_passenger_finish_info/opay/dt=%s/' % dt,
        compressionCodecClass="org.apache.hadoop.io.compress.GzipCodec")
    # Register the freshly written dt partition with the Hive metastore;
    # .rdd.count() forces the otherwise-lazy statement to execute.
    spark.sql('msck repair table algo.opay_passenger_finish_info').rdd.count()

def history_finish_map(x):
    """Serialize one aggregated passenger-order Row to a JSON line.

    Returns the JSON string, or None when the phone number contains any
    non-digit character (such rows are dropped downstream by a filter).
    """
    if not x.phone_number.isdigit():
        return None
    # "city_id" duplicates "last_order_city_id"; key order is preserved
    # so the emitted JSON text matches the previous output exactly.
    record = {
        "phone_number": x.phone_number,
        "city_id": x.last_order_city_id,
    }
    for name in ("last_order_city_id", "first_create_time",
                 "last_finish_time", "total_create_num", "total_finish_num",
                 "total_price", "avg_will_pay", "last_finish_day",
                 "finish_frequency", "create_frequency", "oride_finish_num",
                 "otrike_carpool_finish_num", "otrike_charter_finish_num",
                 "ocar_finish_num"):
        record[name] = getattr(x, name)
    return json.dumps(record)

if __name__ == "__main__":
program_start_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
spark, sc = init_spark('opay_passenger_order_info')
sc.setLogLevel("WARN")
ts = int(time.mktime(time.strptime(time.strftime('%Y-%m-%d', time.localtime(time.time())),"%Y-%m-%d")))
dt = str(sys.argv[2]) if len(sys.argv) > 2 else time.strftime('%Y-%m-%d', time.localtime(time.time() - 86400))
timeArray = time.strptime(dt, "%Y-%m-%d")
timestamp = int(time.mktime(timeArray))
dt1 = time.strftime('%Y-%m-%d', time.localtime(timestamp - 86400))
insert_passenger_data(passenger_order_query,history_finish_map,dt)

标签
易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!