0504 - Building the Data Warehouse
Chapter 1: The ODS Layer
The raw data layer: it stores the original logs and data exactly as loaded, with no processing applied.
1.1 Creating the Tables
- Create the startup log table ods_start_log
drop table if exists ods_start_log;
CREATE EXTERNAL TABLE `ods_start_log`(`line` string)
PARTITIONED BY (`dt` string)
STORED AS
INPUTFORMAT 'com.hadoop.mapred.DeprecatedLzoTextInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION '/warehouse/gmall/ods/ods_start_log';
- Create the event log table ods_event_log
drop table if exists ods_event_log;
CREATE EXTERNAL TABLE `ods_event_log`(`line` string)
PARTITIONED BY (`dt` string)
STORED AS
INPUTFORMAT 'com.hadoop.mapred.DeprecatedLzoTextInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION '/warehouse/gmall/ods/ods_event_log';
1.2 Loading Data
- Startup log data
load data inpath '/origin_data/gmall/log/topic_start/2019-12-19' into table gmall.ods_start_log partition(dt='2019-12-19');
- Event log data
load data inpath '/origin_data/gmall/log/topic_event/2019-12-19' into table gmall.ods_event_log partition(dt='2019-12-19');
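- A quick sanity check after loading (a minimal sketch; it assumes the gmall database is current):
show partitions ods_start_log;
select line from ods_start_log where dt='2019-12-19' limit 2;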
1.3 ODS Data Loading Script
#!/bin/bash
# Define variables for easy modification
APP=gmall
hive=/opt/module/hive/bin/hive
# Use the date passed as an argument if given; otherwise default to yesterday
if [ -n "$1" ] ;then
log_date=$1
else
log_date=`date -d "-1 day" +%F`
fi
echo "=== log date: $log_date ==="
$hive -e "load data inpath '/origin_data/gmall/log/topic_start/$log_date' into table "$APP".ods_start_log partition(dt='$log_date')"
$hive -e "load data inpath '/origin_data/gmall/log/topic_event/$log_date' into table "$APP".ods_event_log partition(dt='$log_date')"
Chapter 2: The DWD Layer
Clean the ODS-layer data: drop null values, dirty records, and out-of-range values; switch from row storage to columnar storage; and change the compression format.
2.1 Data Parsing
2.1.0 Field Analysis
- Common fields
// Step 1: extract all common fields as one tab-separated string
base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la')
// sample result
m953 u979 11 1.0.8 es C 8.1.1 MX sumsung-1 Sumsung V2.0.5 TJHNUOJN@gmail.com 640*960 4G -102.0 10.4 1576741506064 [{"ett":"1576719842455","en":"display","kv":{"newsid":"n057","action":"1","extend1":"1","place":"4","category":"6"}}] 1576765728956
// Step 2: split that string on '\t' and alias each field
split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[0]
as mid_id
split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[1]
as user_id
...
- Event fields
// From step 2, field 17:
split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[17]
as ops,
// yields a result like:
[{"ett":"1576719842455","en":"display","kv":{"newsid":"n057","action":"1","extend1":"1","place":"4","category":"6"}},
{"ett":"1576676303669","en":"newsdetail","kv":{"entry":"3","newsid":"n993","news_staytime":"12","loading_time":"6","action":"2","showtype":"4","category":"40","type1":"325"}}]
// This is a JSON array; explode it with a column-to-row transform (one output row per array element):
lateral view flat_analizer(ops) tmp_k as event_name, event_json;
2.1.1 UDF for Parsing the Common Fields
package com.lz.udf;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.json.JSONException;
import org.json.JSONObject;
/**
* @ClassName BaseFieldUDF
* @Description:
* 1576765728919 | {
* "cm": {
* "ln": "-42.1",
* "sv": "V2.9.6",
* "os": "8.0.2",
* "g": "RG995Y50@gmail.com",
* "mid": "m706",
* "nw": "3G",
* "l": "es",
* "vc": "8",
* "hw": "750*1134",
* "ar": "MX",
* "uid": "u793",
* "t": "1576719015961",
* "la": "-21.7",
* "md": "HTC-16",
* "vn": "1.3.9",
* "ba": "HTC",
* "sr": "Z"
* },
* "ap": "gmall",
* "et": []
* }
* @Author MAlone
* @Date 2019/12/20
* @Version V1.0
**/
public class BaseFieldUDF extends UDF {
public String evaluate(String line,String jsonKeysStr){
StringBuilder sb = new StringBuilder();
// 1. Split the list of requested keys: mid, uid, vc, vn, l, sr, os, ar, md, ...
String[] jsonKeys = jsonKeysStr.split(",");
// 2. Split the line on '|' into the server timestamp and the JSON body
String[] split = line.split("\\|");
// 3. Validity check
if (split.length != 2 || StringUtils.isBlank(split[0])) {
return "";
}
// 4. Parse the JSON body
try {
JSONObject jsonObject = new JSONObject(split[1]);
// 5. Get the 'cm' (common fields) object
JSONObject base = jsonObject.getJSONObject("cm");
// 6. Iterate over the requested keys and append each value, tab-separated
for (int i = 0; i < jsonKeys.length; i++) {
String fieldName = jsonKeys[i].trim();
if (base.has(fieldName)) {
sb.append(base.getString(fieldName)).append("\t");
} else {
sb.append("").append("\t");
}
}
// append the raw 'et' event array, then the server timestamp
sb.append(jsonObject.getJSONArray("et").toString()).append("\t");
sb.append(split[0]).append("\t");
} catch (JSONException e) {
e.printStackTrace();
}
return sb.toString();
}
}
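- A quick interactive test of the UDF (a sketch; the jar path matches the one registered in the 2.2.3 script and assumes the jar has been built and the partition loaded):
add jar /opt/module/hive/dwd-etl-1.0-SNAPSHOT.jar;
create temporary function base_analizer as 'com.lz.udf.BaseFieldUDF';
-- should print one tab-separated line: common fields + event array + server time
select base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la')
from ods_start_log where dt='2019-12-19' limit 1;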
2.1.2 UDTF for Parsing the Event Fields
package com.lz.udtf;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.json.JSONArray;
import org.json.JSONException;
import java.util.ArrayList;
/**
* @ClassName EventJsonUDTF
* @Description:
* [{
* "ett": "1576719842455",
* "en": "display",
* "kv": {
* "newsid": "n057",
* "action": "1",
* "extend1": "1",
* "place": "4",
* "category": "6"
* }
* },
* {
* "ett": "1576676303669",
* "en": "newsdetail",
* "kv": {
* "entry": "3",
* "newsid": "n993",
* "news_staytime": "12",
* "loading_time": "6",
* "action": "2",
* "showtype": "4",
* "category": "40",
* "type1": "325"
* }
* }]
* @Author MAlone
* @Date 2019/12/20
* @Version V1.0
**/
public class EventJsonUDTF extends GenericUDTF {
@Override
public StructObjectInspector initialize(ObjectInspector[] argOIs) throws UDFArgumentException {
// Declare the names and types of the output columns
ArrayList<String> fieldNames = new ArrayList<>();
ArrayList<ObjectInspector> fieldOIS = new ArrayList<>();
fieldNames.add("event_name");
fieldOIS.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
fieldNames.add("event_json");
fieldOIS.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIS);
}
@Override
public void process(Object[] objects) throws HiveException {
// Get the incoming 'et' JSON array string
String input = objects[0].toString();
if (StringUtils.isBlank(input)) {
return;
}
try {
JSONArray jsonArray = new JSONArray(input);
for (int i = 0; i < jsonArray.length(); i++) {
String[] result = new String[2];
// Extract each event's name
result[0] = jsonArray.getJSONObject(i).getString("en");
// Extract the whole event JSON (toString on the object element is safe in all org.json versions)
result[1] = jsonArray.getJSONObject(i).toString();
// Emit one row per event
forward(result);
}
} catch (Exception e) {
e.printStackTrace();
}
}
@Override
public void close() throws HiveException {
}
}
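- A quick interactive test of the UDTF (a sketch; assumes a Hive version that supports SELECT over an inline subquery, and the jar from 2.2.3 already added):
create temporary function flat_analizer as 'com.lz.udtf.EventJsonUDTF';
-- should return one row per event: ('display', '{...}')
select event_name, event_json
from (select '[{"ett":"1576719842455","en":"display","kv":{"newsid":"n057"}}]' as ops) t
lateral view flat_analizer(ops) tmp_k as event_name, event_json;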
2.2 Base Detail Tables
2.2.1 Startup Log Base Detail Table
- Create the table
drop table if exists dwd_base_start_log;
CREATE EXTERNAL TABLE `dwd_base_start_log`
(
`mid_id` string,
`user_id` string,
`version_code` string,
`version_name` string,
`lang` string,
`source` string,
`os` string,
`area` string,
`model` string,
`brand` string,
`sdk_version` string,
`gmail` string,
`height_width` string,
`app_time` string,
`network` string,
`lng` string,
`lat` string,
`event_name` string,
`event_json` string,
`server_time` string
)
PARTITIONED BY (`dt` string)
stored as parquet
location '/warehouse/gmall/dwd/dwd_base_start_log/';
- Load the data (note: base_analizer/flat_analizer must be registered and hive.exec.dynamic.partition.mode set to nonstrict first; the script in 2.2.3 does both)
insert overwrite table dwd_base_start_log
PARTITION (dt)
select
mid_id,
user_id,
version_code,
version_name,
lang,
source ,
os ,
area ,
model ,
brand ,
sdk_version ,
gmail ,
height_width ,
app_time ,
network ,
lng ,
lat ,
event_name ,
event_json ,
server_time ,
dt
from
(
select
split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[0] as mid_id,
split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[1] as user_id,
split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[2] as version_code,
split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[3] as version_name,
split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[4] as lang,
split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[5] as source,
split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[6] as os,
split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[7] as area,
split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[8] as model,
split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[9] as brand,
split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[10] as sdk_version,
split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[11] as gmail,
split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[12] as height_width,
split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[13] as app_time,
split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[14] as network,
split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[15] as lng,
split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[16] as lat,
split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[17] as ops,
split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[18] as server_time,
dt
from ods_start_log where dt='2019-12-19' and base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la')<>''
) sdk_log lateral view flat_analizer(ops) tmp_k as event_name, event_json;
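- A quick check of the result (not part of the original flow): count rows per event name in the new partition.
select event_name, count(*) ct
from dwd_base_start_log
where dt='2019-12-19'
group by event_name;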
2.2.2 Event Log Base Detail Table
- Create the table
drop table if exists dwd_base_event_log;
CREATE EXTERNAL TABLE `dwd_base_event_log`
(
`mid_id` string,
`user_id` string,
`version_code` string,
`version_name` string,
`lang` string,
`source` string,
`os` string,
`area` string,
`model` string,
`brand` string,
`sdk_version` string,
`gmail` string,
`height_width` string,
`app_time` string,
`network` string,
`lng` string,
`lat` string,
`event_name` string,
`event_json` string,
`server_time` string
)
PARTITIONED BY (`dt` string)
stored as parquet
location '/warehouse/gmall/dwd/dwd_base_event_log/';
- Load the data (same prerequisites as in 2.2.1)
insert overwrite table dwd_base_event_log
PARTITION (dt)
select
mid_id,
user_id,
version_code,
version_name,
lang,
source ,
os ,
area ,
model ,
brand ,
sdk_version ,
gmail ,
height_width ,
app_time ,
network ,
lng ,
lat ,
event_name ,
event_json ,
server_time ,
dt
from
(
select
split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[0] as mid_id,
split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[1] as user_id,
split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[2] as version_code,
split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[3] as version_name,
split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[4] as lang,
split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[5] as source,
split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[6] as os,
split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[7] as area,
split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[8] as model,
split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[9] as brand,
split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[10] as sdk_version,
split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[11] as gmail,
split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[12] as height_width,
split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[13] as app_time,
split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[14] as network,
split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[15] as lng,
split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[16] as lat,
split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[17] as ops,
split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[18] as server_time,
dt
from ods_event_log where dt='2019-12-19' and base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la')<>''
) sdk_log lateral view flat_analizer(ops) tmp_k as event_name, event_json;
2.2.3 DWD Base Detail Parsing Script
- dwd_base.sh
#!/bin/bash
# Define variables for easy modification
APP=gmall
hive=/opt/module/hive/bin/hive
# Use the date passed as an argument if given; otherwise default to yesterday
if [ -n "$1" ] ;then
log_date=$1
else
log_date=`date -d "-1 day" +%F`
fi
sql="
add jar /opt/module/hive/dwd-etl-1.0-SNAPSHOT.jar;
create temporary function base_analizer as 'com.lz.udf.BaseFieldUDF';
create temporary function flat_analizer as 'com.lz.udtf.EventJsonUDTF';
set hive.exec.dynamic.partition.mode=nonstrict;
insert overwrite table "$APP".dwd_base_start_log
PARTITION (dt)
select
mid_id,
user_id,
version_code,
version_name,
lang,
source ,
os ,
area ,
model ,
brand ,
sdk_version ,
gmail ,
height_width ,
app_time ,
network ,
lng ,
lat ,
event_name ,
event_json ,
server_time ,
dt
from
(
select
split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[0] as mid_id,
split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[1] as user_id,
split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[2] as version_code,
split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[3] as version_name,
split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[4] as lang,
split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[5] as source,
split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[6] as os,
split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[7] as area,
split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[8] as model,
split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[9] as brand,
split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[10] as sdk_version,
split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[11] as gmail,
split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[12] as height_width,
split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[13] as app_time,
split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[14] as network,
split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[15] as lng,
split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[16] as lat,
split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[17] as ops,
split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[18] as server_time,
dt
from "$APP".ods_start_log where dt='$log_date' and base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la')<>''
) sdk_log lateral view flat_analizer(ops) tmp_k as event_name, event_json;
insert overwrite table "$APP".dwd_base_event_log
PARTITION (dt)
select
mid_id,
user_id,
version_code,
version_name,
lang,
source ,
os ,
area ,
model ,
brand ,
sdk_version ,
gmail ,
height_width ,
app_time ,
network ,
lng ,
lat ,
event_name ,
event_json ,
server_time ,
dt
from
(
select
split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[0] as mid_id,
split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[1] as user_id,
split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[2] as version_code,
split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[3] as version_name,
split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[4] as lang,
split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[5] as source,
split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[6] as os,
split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[7] as area,
split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[8] as model,
split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[9] as brand,
split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[10] as sdk_version,
split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[11] as gmail,
split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[12] as height_width,
split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[13] as app_time,
split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[14] as network,
split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[15] as lng,
split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[16] as lat,
split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[17] as ops,
split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[18] as server_time,
dt
from "$APP".ods_event_log where dt='$log_date' and base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la')<>''
) sdk_log lateral view flat_analizer(ops) tmp_k as event_name, event_json;
"
$hive -e "$sql"
- Script usage
[yanlzh@node11 module]$ dwd_base.sh 2019-02-11
2.3 Event-Specific Tables
Each event type gets its own table. For brevity, the DDLs and inserts in 2.3.1 and 2.3.2 show only the event-specific fields; the full tables populated by the loading script in 2.3.13 also carry the 17 common fields (mid_id through lat) from the base detail tables.
2.3.1 Product Click Table
drop table if exists dwd_display_log;
CREATE EXTERNAL TABLE `dwd_display_log`
(
action string,
newsid string,
place string,
extend1 string,
category string,
`server_time` string
)
PARTITIONED BY (dt string)
location '/warehouse/gmall/dwd/dwd_display_log/';
insert overwrite table dwd_display_log
PARTITION (dt)
select
get_json_object(event_json, '$.kv.action') action,
get_json_object(event_json, '$.kv.newsid') newsid,
get_json_object(event_json, '$.kv.place') place,
get_json_object(event_json, '$.kv.extend1') extend1,
get_json_object(event_json, '$.kv.category') category,
server_time,
dt
from dwd_base_event_log
where dt = '2019-12-19'
and event_name = 'display';
2.3.2 Product Detail Table
drop table if exists dwd_newsdetail_log;
CREATE EXTERNAL TABLE `dwd_newsdetail_log`
(
entry string,
action string,
newsid string,
showtype string,
news_staytime string,
loading_time string,
type1 string,
category string,
`server_time` string
)
PARTITIONED BY (dt string)
location '/warehouse/gmall/dwd/dwd_newsdetail_log/';
insert overwrite table dwd_newsdetail_log
PARTITION (dt)
select
get_json_object(event_json, '$.kv.entry') entry,
get_json_object(event_json, '$.kv.action') action,
get_json_object(event_json, '$.kv.newsid') newsid,
get_json_object(event_json, '$.kv.showtype') showtype,
get_json_object(event_json, '$.kv.news_staytime') news_staytime,
get_json_object(event_json, '$.kv.loading_time') loading_time,
get_json_object(event_json, '$.kv.type1') type1,
get_json_object(event_json, '$.kv.category') category,
server_time,
dt
from dwd_base_event_log
where dt = '2019-12-19'
and event_name = 'newsdetail';
2.3.3 Product List Table
2.3.4 Ad Table
2.3.5 Notification Table
2.3.6 User Foreground-Active Table
2.3.7 User Background-Active Table
2.3.8 Comment Table
2.3.9 Favorites Table
2.3.10 Like Table
2.3.11 Startup Log Table
2.3.12 Error Log Table
These tables all follow the pattern of 2.3.1 and 2.3.2; the event-specific fields of each can be read off the corresponding insert in the loading script in 2.3.13. A DDL sketch for the error log table is shown below.
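A DDL sketch for dwd_error_log (fields inferred from the matching insert in the 2.3.13 script; all types assumed string, like the other event tables):
drop table if exists dwd_error_log;
CREATE EXTERNAL TABLE `dwd_error_log`
(
`mid_id` string,
`user_id` string,
`version_code` string,
`version_name` string,
`lang` string,
`source` string,
`os` string,
`area` string,
`model` string,
`brand` string,
`sdk_version` string,
`gmail` string,
`height_width` string,
`app_time` string,
`network` string,
`lng` string,
`lat` string,
`errorBrief` string,
`errorDetail` string,
`server_time` string
)
PARTITIONED BY (`dt` string)
location '/warehouse/gmall/dwd/dwd_error_log/';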
2.3.13 DWD Data Loading Script
- dwd_event.sh
#!/bin/bash
# Define variables for easy modification
APP=gmall
hive=/opt/module/hive/bin/hive
# Use the date passed as an argument if given; otherwise default to yesterday
if [ -n "$1" ] ;then
log_date=$1
else
log_date=`date -d "-1 day" +%F`
fi
sql="
set hive.exec.dynamic.partition.mode=nonstrict;
insert overwrite table "$APP".dwd_display_log
PARTITION (dt)
select
mid_id,
user_id,
version_code,
version_name,
lang,
source,
os,
area,
model,
brand,
sdk_version,
gmail,
height_width,
app_time,
network,
lng,
lat,
get_json_object(event_json,'$.kv.action') action,
get_json_object(event_json,'$.kv.newsid') newsid,
get_json_object(event_json,'$.kv.place') place,
get_json_object(event_json,'$.kv.extend1') extend1,
get_json_object(event_json,'$.kv.category') category,
server_time,
dt
from "$APP".dwd_base_event_log
where dt='$log_date' and event_name='display';
insert overwrite table "$APP".dwd_newsdetail_log
PARTITION (dt)
select
mid_id,
user_id,
version_code,
version_name,
lang,
source,
os,
area,
model,
brand,
sdk_version,
gmail,
height_width,
app_time,
network,
lng,
lat,
get_json_object(event_json,'$.kv.entry') entry,
get_json_object(event_json,'$.kv.action') action,
get_json_object(event_json,'$.kv.newsid') newsid,
get_json_object(event_json,'$.kv.showtype') showtype,
get_json_object(event_json,'$.kv.news_staytime') news_staytime,
get_json_object(event_json,'$.kv.loading_time') loading_time,
get_json_object(event_json,'$.kv.type1') type1,
get_json_object(event_json,'$.kv.category') category,
server_time,
dt
from "$APP".dwd_base_event_log
where dt='$log_date' and event_name='newsdetail';
insert overwrite table "$APP".dwd_loading_log
PARTITION (dt)
select
mid_id,
user_id,
version_code,
version_name,
lang,
source,
os,
area,
model,
brand,
sdk_version,
gmail,
height_width,
app_time,
network,
lng,
lat,
get_json_object(event_json,'$.kv.action') action,
get_json_object(event_json,'$.kv.loading_time') loading_time,
get_json_object(event_json,'$.kv.loading_way') loading_way,
get_json_object(event_json,'$.kv.extend1') extend1,
get_json_object(event_json,'$.kv.extend2') extend2,
get_json_object(event_json,'$.kv.type') type,
get_json_object(event_json,'$.kv.type1') type1,
server_time,
dt
from "$APP".dwd_base_event_log
where dt='$log_date' and event_name='loading';
insert overwrite table "$APP".dwd_ad_log
PARTITION (dt)
select
mid_id,
user_id,
version_code,
version_name,
lang,
source,
os,
area,
model,
brand,
sdk_version,
gmail,
height_width,
app_time,
network,
lng,
lat,
get_json_object(event_json,'$.kv.entry') entry,
get_json_object(event_json,'$.kv.action') action,
get_json_object(event_json,'$.kv.content') content,
get_json_object(event_json,'$.kv.detail') detail,
get_json_object(event_json,'$.kv.source') ad_source,
get_json_object(event_json,'$.kv.behavior') behavior,
get_json_object(event_json,'$.kv.newstype') newstype,
get_json_object(event_json,'$.kv.show_style') show_style,
server_time,
dt
from "$APP".dwd_base_event_log
where dt='$log_date' and event_name='ad';
insert overwrite table "$APP".dwd_notification_log
PARTITION (dt)
select
mid_id,
user_id,
version_code,
version_name,
lang,
source,
os,
area,
model,
brand,
sdk_version,
gmail,
height_width,
app_time,
network,
lng,
lat,
get_json_object(event_json,'$.kv.action') action,
get_json_object(event_json,'$.kv.noti_type') noti_type,
get_json_object(event_json,'$.kv.ap_time') ap_time,
get_json_object(event_json,'$.kv.content') content,
server_time,
dt
from "$APP".dwd_base_event_log
where dt='$log_date' and event_name='notification';
insert overwrite table "$APP".dwd_active_foreground_log
PARTITION (dt)
select
mid_id,
user_id,
version_code,
version_name,
lang,
source,
os,
area,
model,
brand,
sdk_version,
gmail,
height_width,
app_time,
network,
lng,
lat,
get_json_object(event_json,'$.kv.active_source') active_source,
server_time,
dt
from "$APP".dwd_base_event_log
where dt='$log_date' and event_name='active_foreground';
insert overwrite table "$APP".dwd_active_background_log
PARTITION (dt)
select
mid_id,
user_id,
version_code,
version_name,
lang,
source,
os,
area,
model,
brand,
sdk_version,
gmail,
height_width,
app_time,
network,
lng,
lat,
get_json_object(event_json,'$.kv.active_source') active_source,
server_time,
dt
from "$APP".dwd_base_event_log
where dt='$log_date' and event_name='active_background';
insert overwrite table "$APP".dwd_comment_log
PARTITION (dt)
select
mid_id,
user_id,
version_code,
version_name,
lang,
source,
os,
area,
model,
brand,
sdk_version,
gmail,
height_width,
app_time,
network,
lng,
lat,
get_json_object(event_json,'$.kv.comment_id') comment_id,
get_json_object(event_json,'$.kv.userid') userid,
get_json_object(event_json,'$.kv.p_comment_id') p_comment_id,
get_json_object(event_json,'$.kv.content') content,
get_json_object(event_json,'$.kv.addtime') addtime,
get_json_object(event_json,'$.kv.other_id') other_id,
get_json_object(event_json,'$.kv.praise_count') praise_count,
get_json_object(event_json,'$.kv.reply_count') reply_count,
server_time,
dt
from "$APP".dwd_base_event_log
where dt='$log_date' and event_name='comment';
insert overwrite table "$APP".dwd_favorites_log
PARTITION (dt)
select
mid_id,
user_id,
version_code,
version_name,
lang,
source,
os,
area,
model,
brand,
sdk_version,
gmail,
height_width,
app_time,
network,
lng,
lat,
get_json_object(event_json,'$.kv.id') id,
get_json_object(event_json,'$.kv.course_id') course_id,
get_json_object(event_json,'$.kv.userid') userid,
get_json_object(event_json,'$.kv.add_time') add_time,
server_time,
dt
from "$APP".dwd_base_event_log
where dt='$log_date' and event_name='favorites';
insert overwrite table "$APP".dwd_praise_log
PARTITION (dt)
select
mid_id,
user_id,
version_code,
version_name,
lang,
source,
os,
area,
model,
brand,
sdk_version,
gmail,
height_width,
app_time,
network,
lng,
lat,
get_json_object(event_json,'$.kv.id') id,
get_json_object(event_json,'$.kv.userid') userid,
get_json_object(event_json,'$.kv.target_id') target_id,
get_json_object(event_json,'$.kv.type') type,
get_json_object(event_json,'$.kv.add_time') add_time,
server_time,
dt
from "$APP".dwd_base_event_log
where dt='$log_date' and event_name='praise';
insert overwrite table "$APP".dwd_start_log
PARTITION (dt)
select
mid_id,
user_id,
version_code,
version_name,
lang,
source,
os,
area,
model,
brand,
sdk_version,
gmail,
height_width,
app_time,
network,
lng,
lat,
get_json_object(event_json,'$.kv.entry') entry,
get_json_object(event_json,'$.kv.open_ad_type') open_ad_type,
get_json_object(event_json,'$.kv.action') action,
get_json_object(event_json,'$.kv.loading_time') loading_time,
get_json_object(event_json,'$.kv.detail') detail,
get_json_object(event_json,'$.kv.extend1') extend1,
server_time,
dt
from "$APP".dwd_base_start_log
where dt='$log_date' and event_name='start';
insert overwrite table "$APP".dwd_error_log
PARTITION (dt)
select
mid_id,
user_id,
version_code,
version_name,
lang,
source,
os,
area,
model,
brand,
sdk_version,
gmail,
height_width,
app_time,
network,
lng,
lat,
get_json_object(event_json,'$.kv.errorBrief') errorBrief,
get_json_object(event_json,'$.kv.errorDetail') errorDetail,
server_time,
dt
from "$APP".dwd_base_event_log
where dt='$log_date' and event_name='error';
"
$hive -e "$sql"
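- Script usage (assuming the script is saved as dwd_event.sh):
[yanlzh@node11 module]$ dwd_event.sh 2019-12-19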
2.4 Summary
2.4.1 UDF (one row in, one value out; used here to flatten the common fields)
2.4.2 UDTF (one row in, many rows out; used here to explode the event array)
2.4.3 Utility classes
- StringUtils
- JSONObject
- JSONArray
2.4.4 Column-to-row conversion: lateral view
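A minimal illustration of lateral view with the built-in explode UDTF (sample inline data; flat_analizer from 2.1.2 plays the same role for the event array):
select id, item
from (select 1 as id, array('a','b','c') as items) t
lateral view explode(items) tmp as item;
-- yields three rows: (1,'a'), (1,'b'), (1,'c')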
2.4.5 Hive built-in function get_json_object
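get_json_object pulls a single field out of a JSON string with a JsonPath-style expression; this is how the 2.3 tables extract values from event_json. A minimal example against sample data:
select get_json_object('{"ett":"1576719842455","en":"display","kv":{"newsid":"n057"}}', '$.kv.newsid');
-- returns n057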
Chapter 3: The DWS Layer
The DWS layer is built on top of the business requirements.
3.1 User Activity
- Active users
A user who opens the app counts as active, regardless of usage; a device that opens the app several times in one day is counted as one active user.
- Key function: collect_set()
3.1.1 Daily Active Device Detail
Aggregate with the device and day as the key. If a device used two operating systems, two system versions, or several areas in one day, or logged in with different accounts, keep only one of each (hence collect_set(...)[0] in the insert below).
drop table if exists dws_uv_detail_day;
create table dws_uv_detail_day
(
`mid_id` string COMMENT 'unique device ID',
`user_id` string COMMENT 'user ID',
`version_code` string COMMENT 'app version code',
`version_name` string COMMENT 'app version name',
`lang` string COMMENT 'system language',
`source` string COMMENT 'channel ID',
`os` string COMMENT 'Android version',
`area` string COMMENT 'area',
`model` string COMMENT 'phone model',
`brand` string COMMENT 'phone brand',
`sdk_version` string COMMENT 'sdkVersion',
`gmail` string COMMENT 'gmail',
`height_width` string COMMENT 'screen width and height',
`app_time` string COMMENT 'client-side log creation time',
`network` string COMMENT 'network mode',
`lng` string COMMENT 'longitude',
`lat` string COMMENT 'latitude'
) COMMENT 'daily active device detail'
PARTITIONED BY ( `dt` string)
stored as parquet
location '/warehouse/gmall/dws/dws_uv_detail_day/'
;
insert overwrite table dws_uv_detail_day partition (dt)
select mid_id,
collect_set(user_id)[0] user_id,
collect_set(version_code)[0] version_code,
collect_set(version_name)[0] version_name,
collect_set(lang)[0] lang,
collect_set(source)[0] source,
collect_set(os)[0] os,
collect_set(area)[0] area,
collect_set(model)[0] model,
collect_set(brand)[0] brand,
collect_set(sdk_version)[0] sdk_version,
collect_set(gmail)[0] gmail,
collect_set(height_width)[0] height_width,
collect_set(app_time)[0] app_time,
collect_set(network)[0] network,
collect_set(lng)[0] lng,
collect_set(lat)[0] lat,
'2019-12-19'
from dwd_start_log
where dt = '2019-12-19'
group by mid_id;
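The daily active device count is then simply the row count of the partition (the same number the ADS layer derives in 4.1.1):
select count(*) from dws_uv_detail_day where dt='2019-12-19';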
3.1.2 Weekly Active Device Detail
drop table if exists dws_uv_detail_wk;
create table dws_uv_detail_wk
(
`mid_id` string COMMENT 'unique device ID',
`user_id` string COMMENT 'user ID',
`version_code` string COMMENT 'app version code',
`version_name` string COMMENT 'app version name',
`lang` string COMMENT 'system language',
`source` string COMMENT 'channel ID',
`os` string COMMENT 'Android version',
`area` string COMMENT 'area',
`model` string COMMENT 'phone model',
`brand` string COMMENT 'phone brand',
`sdk_version` string COMMENT 'sdkVersion',
`gmail` string COMMENT 'gmail',
`height_width` string COMMENT 'screen width and height',
`app_time` string COMMENT 'client-side log creation time',
`network` string COMMENT 'network mode',
`lng` string COMMENT 'longitude',
`lat` string COMMENT 'latitude',
`monday_date` string COMMENT 'Monday of the week',
`sunday_date` string COMMENT 'Sunday of the week'
) COMMENT 'weekly active device detail'
PARTITIONED BY (`wk_dt` string)
stored as parquet
location '/warehouse/gmall/dws/dws_uv_detail_wk/'
;
insert overwrite table dws_uv_detail_wk partition (wk_dt)
select mid_id,
collect_set(user_id)[0] user_id,
collect_set(version_code)[0] version_code,
collect_set(version_name)[0] version_name,
collect_set(lang)[0] lang,
collect_set(source)[0] source,
collect_set(os)[0] os,
collect_set(area)[0] area,
collect_set(model)[0] model,
collect_set(brand)[0] brand,
collect_set(sdk_version)[0] sdk_version,
collect_set(gmail)[0] gmail,
collect_set(height_width)[0] height_width,
collect_set(app_time)[0] app_time,
collect_set(network)[0] network,
collect_set(lng)[0] lng,
collect_set(lat)[0] lat,
date_add(next_day('2019-12-19', 'MO'), -7),
date_add(next_day('2019-12-19', 'MO'), -1),
concat(date_add(next_day('2019-12-19', 'MO'), -7), '_', date_add(next_day('2019-12-19', 'MO'), -1))
from dws_uv_detail_day
where dt >= date_add(next_day('2019-12-19', 'MO'), -7)
and dt <= date_add(next_day('2019-12-19', 'MO'), -1)
group by mid_id;
3.1.3 Monthly Active Device Detail
drop table if exists dws_uv_detail_mn;
create external table dws_uv_detail_mn
(
`mid_id` string COMMENT 'unique device ID',
`user_id` string COMMENT 'user ID',
`version_code` string COMMENT 'app version code',
`version_name` string COMMENT 'app version name',
`lang` string COMMENT 'system language',
`source` string COMMENT 'channel ID',
`os` string COMMENT 'Android version',
`area` string COMMENT 'area',
`model` string COMMENT 'phone model',
`brand` string COMMENT 'phone brand',
`sdk_version` string COMMENT 'sdkVersion',
`gmail` string COMMENT 'gmail',
`height_width` string COMMENT 'screen width and height',
`app_time` string COMMENT 'client-side log creation time',
`network` string COMMENT 'network mode',
`lng` string COMMENT 'longitude',
`lat` string COMMENT 'latitude'
) COMMENT 'monthly active device detail'
PARTITIONED BY (`mn` string)
stored as parquet
location '/warehouse/gmall/dws/dws_uv_detail_mn/'
;
insert overwrite table dws_uv_detail_mn partition (mn)
select mid_id,
collect_set(user_id)[0] user_id,
collect_set(version_code)[0] version_code,
collect_set(version_name)[0] version_name,
collect_set(lang)[0] lang,
collect_set(source)[0] source,
collect_set(os)[0] os,
collect_set(area)[0] area,
collect_set(model)[0] model,
collect_set(brand)[0] brand,
collect_set(sdk_version)[0] sdk_version,
collect_set(gmail)[0] gmail,
collect_set(height_width)[0] height_width,
collect_set(app_time)[0] app_time,
collect_set(network)[0] network,
collect_set(lng)[0] lng,
collect_set(lat)[0] lat,
date_format('2019-12-19', 'yyyy-MM')
from dws_uv_detail_day
where date_format(dt, 'yyyy-MM') = date_format('2019-12-19', 'yyyy-MM')
group by mid_id;
3.1.4 Script
#!/bin/bash
# Define variables for easy modification
APP=gmall
hive=/opt/module/hive/bin/hive
# Use the date passed as an argument if given; otherwise default to yesterday
if [ -n "$1" ] ;then
log_date=$1
else
log_date=`date -d "-1 day" +%F`
fi
sql="
set hive.exec.dynamic.partition.mode=nonstrict;
insert overwrite table "$APP".dws_uv_detail_day partition(dt='$log_date')
select
mid_id,
collect_set(user_id)[0] user_id,
collect_set(version_code)[0] version_code,
collect_set(version_name)[0] version_name,
collect_set(lang)[0] lang,
collect_set(source)[0] source,
collect_set(os)[0] os,
collect_set(area)[0] area,
collect_set(model)[0] model,
collect_set(brand)[0] brand,
collect_set(sdk_version)[0] sdk_version,
collect_set(gmail)[0] gmail,
collect_set(height_width)[0] height_width,
collect_set(app_time)[0] app_time,
collect_set(network)[0] network,
collect_set(lng)[0] lng,
collect_set(lat)[0] lat
from "$APP".dwd_start_log
where dt='$log_date'
group by mid_id;
insert overwrite table "$APP".dws_uv_detail_wk partition(wk_dt)
select
mid_id,
collect_set(user_id)[0] user_id,
collect_set(version_code)[0] version_code,
collect_set(version_name)[0] version_name,
collect_set(lang)[0] lang,
collect_set(source)[0] source,
collect_set(os)[0] os,
collect_set(area)[0] area,
collect_set(model)[0] model,
collect_set(brand)[0] brand,
collect_set(sdk_version)[0] sdk_version,
collect_set(gmail)[0] gmail,
collect_set(height_width)[0] height_width,
collect_set(app_time)[0] app_time,
collect_set(network)[0] network,
collect_set(lng)[0] lng,
collect_set(lat)[0] lat,
date_add(next_day('$log_date','MO'),-7),
date_add(next_day('$log_date','MO'),-1),
concat(date_add( next_day('$log_date','MO'),-7), '_' , date_add(next_day('$log_date','MO'),-1)
)
from "$APP".dws_uv_detail_day
where dt>=date_add(next_day('$log_date','MO'),-7) and dt<=date_add(next_day('$log_date','MO'),-1)
group by mid_id;
insert overwrite table "$APP".dws_uv_detail_mn partition(mn)
select
mid_id,
collect_set(user_id)[0] user_id,
collect_set(version_code)[0] version_code,
collect_set(version_name)[0] version_name,
collect_set(lang)[0] lang,
collect_set(source)[0] source,
collect_set(os)[0] os,
collect_set(area)[0] area,
collect_set(model)[0] model,
collect_set(brand)[0] brand,
collect_set(sdk_version)[0] sdk_version,
collect_set(gmail)[0] gmail,
collect_set(height_width)[0] height_width,
collect_set(app_time)[0] app_time,
collect_set(network)[0] network,
collect_set(lng)[0] lng,
collect_set(lat)[0] lat,
date_format('$log_date','yyyy-MM')
from "$APP".dws_uv_detail_day
where date_format(dt,'yyyy-MM') = date_format('$log_date','yyyy-MM')
group by mid_id;
"
$hive -e "$sql"
3.2 New Users
3.2.1 Daily New Device Detail
A device counts as new on the day it is first seen. The insert below left-joins the day's active devices against the accumulated new-device table and keeps only those with no match (nm.mid_id is null), i.e. devices never seen before.
drop table if exists `dws_new_mid_day`;
create table `dws_new_mid_day`
(
`mid_id` string COMMENT 'unique device ID',
`user_id` string COMMENT 'user ID',
`version_code` string COMMENT 'app version code',
`version_name` string COMMENT 'app version name',
`lang` string COMMENT 'system language',
`source` string COMMENT 'channel ID',
`os` string COMMENT 'Android version',
`area` string COMMENT 'area',
`model` string COMMENT 'phone model',
`brand` string COMMENT 'phone brand',
`sdk_version` string COMMENT 'sdkVersion',
`gmail` string COMMENT 'gmail',
`height_width` string COMMENT 'screen width and height',
`app_time` string COMMENT 'client-side log creation time',
`network` string COMMENT 'network mode',
`lng` string COMMENT 'longitude',
`lat` string COMMENT 'latitude',
`create_date` string comment 'creation date'
) COMMENT 'daily new device detail'
stored as parquet
location '/warehouse/gmall/dws/dws_new_mid_day/';
insert into table dws_new_mid_day
select ud.mid_id,
ud.user_id,
ud.version_code,
ud.version_name,
ud.lang,
ud.source,
ud.os,
ud.area,
ud.model,
ud.brand,
ud.sdk_version,
ud.gmail,
ud.height_width,
ud.app_time,
ud.network,
ud.lng,
ud.lat,
'2019-12-19'
from dws_uv_detail_day ud
left join dws_new_mid_day nm on ud.mid_id = nm.mid_id
where ud.dt = '2019-12-19'
and nm.mid_id is null;
3.2.2 Script
#!/bin/bash
# Define variables for easy modification
APP=gmall
hive=/opt/module/hive/bin/hive
# Use the date passed as an argument if given; otherwise default to yesterday
if [ -n "$1" ] ;then
log_date=$1
else
log_date=`date -d "-1 day" +%F`
fi
sql="
set hive.exec.dynamic.partition.mode=nonstrict;
insert into table "$APP".dws_new_mid_day
select ud.mid_id,
ud.user_id,
ud.version_code,
ud.version_name,
ud.lang,
ud.source,
ud.os,
ud.area,
ud.model,
ud.brand,
ud.sdk_version,
ud.gmail,
ud.height_width,
ud.app_time,
ud.network,
ud.lng,
ud.lat,
'$log_date'
from "$APP".dws_uv_detail_day ud
left join "$APP".dws_new_mid_day nm on ud.mid_id = nm.mid_id
where ud.dt = '$log_date'
and nm.mid_id is null;
"
$hive -e "$sql"
3.3 User Retention
3.3.0 Retention Concepts
- Retained users:
users who were new (or active) in some period and are still using the app after a given interval.
- Retention rate:
the share of retained users among the new (or active) users of that period.
3.3.1 Daily Retained User Detail
drop table if exists `dws_user_retention_day`;
create table `dws_user_retention_day`
(
`mid_id` string COMMENT 'unique device ID',
`user_id` string COMMENT 'user ID',
`version_code` string COMMENT 'app version code',
`version_name` string COMMENT 'app version name',
`lang` string COMMENT 'system language',
`source` string COMMENT 'channel ID',
`os` string COMMENT 'Android version',
`area` string COMMENT 'area',
`model` string COMMENT 'phone model',
`brand` string COMMENT 'phone brand',
`sdk_version` string COMMENT 'sdkVersion',
`gmail` string COMMENT 'gmail',
`height_width` string COMMENT 'screen width and height',
`app_time` string COMMENT 'client-side log creation time',
`network` string COMMENT 'network mode',
`lng` string COMMENT 'longitude',
`lat` string COMMENT 'latitude',
`create_date` string comment 'device creation date',
`retention_day` int comment 'days retained as of the current date'
) COMMENT 'daily user retention detail'
PARTITIONED BY ( `dt` string)
stored as parquet
location '/warehouse/gmall/dws/dws_user_retention_day/'
;
- Load data (each day computes the retention detail of the previous day's new users: devices added yesterday that are also active today)
insert overwrite table dws_user_retention_day partition (dt = "2019-12-19")
select nm.mid_id,
nm.user_id,
nm.version_code,
nm.version_name,
nm.lang,
nm.source,
nm.os,
nm.area,
nm.model,
nm.brand,
nm.sdk_version,
nm.gmail,
nm.height_width,
nm.app_time,
nm.network,
nm.lng,
nm.lat,
nm.create_date,
1 retention_day
from dws_uv_detail_day ud
join dws_new_mid_day nm on ud.mid_id = nm.mid_id
where ud.dt = '2019-12-19'
and nm.create_date = date_add('2019-12-19', -1);
3.3.2 1-, 2-, 3-, n-Day Retained User Detail
Devices added one day ago and active today, devices added two days ago and active today, and so on.
- Tables:
the new-device table (dws_new_mid_day) and the daily active table (dws_uv_detail_day)
- Conditions:
(1) new-device table: creation date = current date minus n days: nm.create_date = date_add('2019-12-19', -n)
(2) active table: active date = current date (active today): ud.dt = '2019-12-19'
(3) join the two tables on: ud.mid_id = nm.mid_id
insert overwrite table dws_user_retention_day partition (dt = "2019-12-19")
select nm.mid_id,
nm.user_id,
nm.version_code,
nm.version_name,
nm.lang,
nm.source,
nm.os,
nm.area,
nm.model,
nm.brand,
nm.sdk_version,
nm.gmail,
nm.height_width,
nm.app_time,
nm.network,
nm.lng,
nm.lat,
nm.create_date,
1 retention_day
from dws_uv_detail_day ud
join dws_new_mid_day nm on ud.mid_id = nm.mid_id
where ud.dt = '2019-12-19'
and nm.create_date = date_add('2019-12-19', -1)
union all
select nm.mid_id,
nm.user_id,
nm.version_code,
nm.version_name,
nm.lang,
nm.source,
nm.os,
nm.area,
nm.model,
nm.brand,
nm.sdk_version,
nm.gmail,
nm.height_width,
nm.app_time,
nm.network,
nm.lng,
nm.lat,
nm.create_date,
2 retention_day
from dws_uv_detail_day ud
join dws_new_mid_day nm on ud.mid_id = nm.mid_id
where ud.dt = '2019-12-19'
and nm.create_date = date_add('2019-12-19', -2)
union all
select nm.mid_id,
nm.user_id,
nm.version_code,
nm.version_name,
nm.lang,
nm.source,
nm.os,
nm.area,
nm.model,
nm.brand,
nm.sdk_version,
nm.gmail,
nm.height_width,
nm.app_time,
nm.network,
nm.lng,
nm.lat,
nm.create_date,
3 retention_day
from dws_uv_detail_day ud
join dws_new_mid_day nm on ud.mid_id = nm.mid_id
where ud.dt = '2019-12-19'
and nm.create_date = date_add('2019-12-19', -3);
3.3.3 Script
#!/bin/bash
# Define variables for easy modification
APP=gmall
hive=/opt/module/hive/bin/hive
# Use the date passed as an argument if given; otherwise default to yesterday
if [ -n "$1" ] ;then
log_date=$1
else
log_date=`date -d "-1 day" +%F`
fi
sql="
set hive.exec.dynamic.partition.mode=nonstrict;
insert overwrite table "$APP".dws_user_retention_day partition (dt = '$log_date')
select nm.mid_id,
nm.user_id,
nm.version_code,
nm.version_name,
nm.lang,
nm.source,
nm.os,
nm.area,
nm.model,
nm.brand,
nm.sdk_version,
nm.gmail,
nm.height_width,
nm.app_time,
nm.network,
nm.lng,
nm.lat,
nm.create_date,
1 retention_day
from "$APP".dws_uv_detail_day ud
join "$APP".dws_new_mid_day nm on ud.mid_id = nm.mid_id
where ud.dt = '$log_date'
and nm.create_date = date_add('$log_date', -1)
union all
select nm.mid_id,
nm.user_id,
nm.version_code,
nm.version_name,
nm.lang,
nm.source,
nm.os,
nm.area,
nm.model,
nm.brand,
nm.sdk_version,
nm.gmail,
nm.height_width,
nm.app_time,
nm.network,
nm.lng,
nm.lat,
nm.create_date,
2 retention_day
from "$APP".dws_uv_detail_day ud
join "$APP".dws_new_mid_day nm on ud.mid_id = nm.mid_id
where ud.dt = '$log_date'
and nm.create_date = date_add('$log_date', -2)
union all
select nm.mid_id,
nm.user_id,
nm.version_code,
nm.version_name,
nm.lang,
nm.source,
nm.os,
nm.area,
nm.model,
nm.brand,
nm.sdk_version,
nm.gmail,
nm.height_width,
nm.app_time,
nm.network,
nm.lng,
nm.lat,
nm.create_date,
3 retention_day
from "$APP".dws_uv_detail_day ud
join "$APP".dws_new_mid_day nm on ud.mid_id = nm.mid_id
where ud.dt = '$log_date'
and nm.create_date = date_add('$log_date', -3);
"
$hive -e "$sql"
3.4 Summary
3.4.1 Date Functions
1) date_format (format a date by pattern)
hive (gmall)> select date_format('2019-02-10','yyyy-MM');
2019-02
2) date_add (add or subtract days)
hive (gmall)> select date_add('2019-02-10',-1);
2019-02-09
hive (gmall)> select date_add('2019-02-10',1);
2019-02-11
3) next_day
(1) Get the next Monday after the given date
hive (gmall)> select next_day('2019-02-12','MO');
2019-02-18
(2) Get the Monday of the current week
hive (gmall)> select date_add(next_day('2019-02-12','MO'),-7);
2019-02-11
4) last_day (last day of the month)
hive (gmall)> select last_day('2019-02-10');
2019-02-28
Chapter 4: The ADS Layer
4.1 User Activity
4.1.1 Active Device Counts
drop table if exists ads_uv_count;
create external table ads_uv_count
(
`dt` string COMMENT 'statistics date',
`day_count` bigint COMMENT 'active device count for the day',
`wk_count` bigint COMMENT 'active device count for the week',
`mn_count` bigint COMMENT 'active device count for the month',
`is_weekend` string COMMENT 'Y/N end of week, used to obtain the final weekly result',
`is_monthend` string COMMENT 'Y/N end of month, used to obtain the final monthly result'
) COMMENT 'daily active device counts'
stored as parquet
location '/warehouse/gmall/ads/ads_uv_count_day/'
;
insert overwrite table ads_uv_count
select '2019-12-19' dt,
daycount.ct,
wkcount.ct,
mncount.ct,
if(date_add(next_day('2019-12-19', 'MO'), -1) = '2019-12-19', 'Y', 'N'),
if(last_day('2019-12-19') = '2019-12-19', 'Y', 'N')
from (
select '2019-12-19' dt,
count(*) ct
from dws_uv_detail_day
where dt = '2019-12-19'
) daycount
join
(
select '2019-12-19' dt,
count(*) ct
from dws_uv_detail_wk
where wk_dt =
concat(date_add(next_day('2019-12-19', 'MO'), -7), '_', date_add(next_day('2019-12-19', 'MO'), -1))
) wkcount on daycount.dt = wkcount.dt
join
(
select '2019-12-19' dt,
count(*) ct
from dws_uv_detail_mn
where mn = date_format('2019-12-19', 'yyyy-MM')
) mncount on daycount.dt = mncount.dt
;
4.1.2 Script
#!/bin/bash
# Define variables for easy modification
APP=gmall
hive=/opt/module/hive/bin/hive
# Use the date passed as an argument if given; otherwise default to yesterday
if [ -n "$1" ] ;then
log_date=$1
else
log_date=`date -d "-1 day" +%F`
fi
sql="
set hive.exec.dynamic.partition.mode=nonstrict;
insert overwrite table "$APP".ads_uv_count
select '$log_date' dt,
daycount.ct,
wkcount.ct,
mncount.ct,
if(date_add(next_day('$log_date', 'MO'), -1) = '$log_date', 'Y', 'N'),
if(last_day('$log_date') = '$log_date', 'Y', 'N')
from (
select '$log_date' dt,
count(*) ct
from "$APP".dws_uv_detail_day
where dt = '$log_date'
) daycount
join
(
select '$log_date' dt,
count(*) ct
from "$APP".dws_uv_detail_wk
where wk_dt =
concat(date_add(next_day('$log_date', 'MO'), -7), '_', date_add(next_day('$log_date', 'MO'), -1))
) wkcount on daycount.dt = wkcount.dt
join
(
select '$log_date' dt,
count(*) ct
from "$APP".dws_uv_detail_mn
where mn = date_format('$log_date', 'yyyy-MM')
) mncount on daycount.dt = mncount.dt
;
"
$hive -e "$sql"
4.2 New Users
4.2.1 Daily New Device Count
drop table if exists `ads_new_mid_count`;
create table `ads_new_mid_count`
(
`create_date` string comment 'creation date',
`new_mid_count` BIGINT comment 'new device count'
) COMMENT 'daily new device count'
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ads/ads_new_mid_count/';
insert into table ads_new_mid_count
select create_date, count(*)
from dws_new_mid_day
where create_date = '2019-12-19'
group by create_date;
4.2.2 Script
#!/bin/bash
# Define variables for easy modification
APP=gmall
hive=/opt/module/hive/bin/hive
# Use the date passed as an argument if given; otherwise default to yesterday
if [ -n "$1" ] ;then
log_date=$1
else
log_date=`date -d "-1 day" +%F`
fi
sql="
set hive.exec.dynamic.partition.mode=nonstrict;
insert into table "$APP".ads_new_mid_count
select create_date, count(*)
from "$APP".dws_new_mid_day
where create_date = '$log_date'
group by create_date;
"
$hive -e "$sql"
4.3 User Retention
4.3.1 Retained User Counts
drop table if exists `ads_user_retention_day_count`;
create table `ads_user_retention_day_count`
(
`create_date` string comment 'device creation date',
`retention_day` int comment 'days retained as of the current date',
`retention_count` bigint comment 'retained device count'
) COMMENT 'daily retained user counts'
stored as parquet
location '/warehouse/gmall/ads/ads_user_retention_day_count/';
insert into table ads_user_retention_day_count
select create_date,
retention_day,
count(*) retention_count
from dws_user_retention_day
where dt = '2019-12-19'
group by create_date, retention_day;
4.3.2 Retention Rate
drop table if exists `ads_user_retention_day_rate`;
create table `ads_user_retention_day_rate`
(
`stat_date` string comment 'statistics date',
`create_date` string comment 'device creation date',
`retention_day` int comment 'days retained as of the current date',
`retention_count` bigint comment 'retained device count',
`new_mid_count` string comment 'new device count on that day',
`retention_ratio` decimal(10, 2) comment 'retention rate (%)'
) COMMENT 'daily user retention rate'
stored as parquet
location '/warehouse/gmall/ads/ads_user_retention_day_rate/';
insert into table ads_user_retention_day_rate
select '2019-12-19',
ur.create_date,
ur.retention_day,
ur.retention_count,
nc.new_mid_count,
ur.retention_count / nc.new_mid_count * 100
from (
select create_date,
retention_day,
count(*) retention_count
from `dws_user_retention_day`
where dt = '2019-12-19'
group by create_date, retention_day
) ur
join ads_new_mid_count nc on nc.create_date = ur.create_date;
4.3.3 Script
#!/bin/bash
# Define variables for easy modification
APP=gmall
hive=/opt/module/hive/bin/hive
# Use the date passed as an argument if given; otherwise default to yesterday
if [ -n "$1" ] ;then
log_date=$1
else
log_date=`date -d "-1 day" +%F`
fi
sql="
set hive.exec.dynamic.partition.mode=nonstrict;
insert into table "$APP".ads_user_retention_day_count
select create_date,
retention_day,
count(*) retention_count
from "$APP".dws_user_retention_day
where dt = '$log_date'
group by create_date, retention_day;
insert into table "$APP".ads_user_retention_day_rate
select '$log_date',
ur.create_date,
ur.retention_day,
ur.retention_count,
nc.new_mid_count,
ur.retention_count / nc.new_mid_count * 100
from (
select create_date,
retention_day,
count(*) retention_count
from `"$APP".dws_user_retention_day`
where dt = '$log_date'
group by create_date, retention_day
) ur
join "$APP".ads_new_mid_count nc on nc.create_date = ur.create_date;
"
$hive -e "$sql"
Source: CSDN
Author: YanLzh_MAlone
Link: https://blog.csdn.net/qq_35199832/article/details/103630130