0504 - Building the Data Warehouse

Posted by 核能气质少年 on 2019-12-21 11:17:14

Chapter 1: The ODS Layer

The raw data layer: original logs and data are loaded directly and kept in their original form, with no processing applied.

1.1 Creating the Tables

  1. Create the start-log table ods_start_log
drop table if exists ods_start_log;
CREATE EXTERNAL TABLE  `ods_start_log`(`line` string)
PARTITIONED BY (`dt` string)
STORED AS
  INPUTFORMAT 'com.hadoop.mapred.DeprecatedLzoTextInputFormat'
  OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION '/warehouse/gmall/ods/ods_start_log';
  2. Create the event-log table ods_event_log
drop table if exists ods_event_log;
CREATE EXTERNAL TABLE  `ods_event_log`(`line` string)
PARTITIONED BY (`dt` string)
STORED AS
  INPUTFORMAT 'com.hadoop.mapred.DeprecatedLzoTextInputFormat'
  OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION '/warehouse/gmall/ods/ods_event_log';

1.2 Loading Data

  1. Start-log data
load data inpath '/origin_data/gmall/log/topic_start/2019-12-19' into table gmall.ods_start_log partition(dt='2019-12-19');
  2. Event-log data
load data inpath '/origin_data/gmall/log/topic_event/2019-12-19' into table gmall.ods_event_log partition(dt='2019-12-19');

1.3 ODS Layer Data Load Script

#!/bin/bash

# Variables, defined here for easy modification
APP=gmall
hive=/opt/module/hive/bin/hive

# If a date argument is given, use it; otherwise default to yesterday.
# "$1" must be quoted: [ -n $1 ] is always true when no argument is passed.
if [ -n "$1" ]; then
	log_date=$1
else
	log_date=`date -d "-1 day" +%F`
fi

echo "===日志日期为 $log_date==="
$hive -e "load data inpath '/origin_data/gmall/log/topic_start/$log_date' into table "$APP".ods_start_log partition(dt='$log_date')"
$hive -e "load data inpath '/origin_data/gmall/log/topic_event/$log_date' into table "$APP".ods_event_log partition(dt='$log_date')"

Chapter 2: The DWD Layer

This layer cleans the ODS data: remove nulls, dirty records, and values outside valid ranges; switch from row-oriented to columnar storage; and change the compression format.

2.1 Data Parsing

2.1.0 Field Analysis


  1. Common fields
// step 1: call the UDF; it returns one tab-separated string
base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la')

// result
m953	u979	11	1.0.8	es	C	8.1.1	MX	sumsung-1	Sumsung	V2.0.5	TJHNUOJN@gmail.com	640*960	4G	-102.0	10.4	1576741506064	[{"ett":"1576719842455","en":"display","kv":{"newsid":"n057","action":"1","extend1":"1","place":"4","category":"6"}}]	1576765728956
// step 2: split the result and index into it, one field per column
split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[0]
as mid_id
split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[1]
as user_id
...
  2. Event fields
// taking step 2's field [17]
split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[17]  
as ops,
// yields this result
[{"ett":"1576719842455","en":"display","kv":{"newsid":"n057","action":"1","extend1":"1","place":"4","category":"6"}},
{"ett":"1576676303669","en":"newsdetail","kv":{"entry":"3","newsid":"n993","news_staytime":"12","loading_time":"6","action":"2","showtype":"4","category":"40","type1":"325"}}]
// this is a JSON array; explode it into multiple rows (column-to-row, one row per event) with the UDTF
lateral view flat_analizer(ops) tmp_k as event_name, event_json;

2.1.1 UDF for Parsing Common Fields

package com.lz.udf;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.json.JSONException;
import org.json.JSONObject;

/**
 * @ClassName BaseFieldUDF
 * @Description:
 * 1576765728919 | {
 * 	"cm": {
 * 		"ln": "-42.1",
 * 		"sv": "V2.9.6",
 * 		"os": "8.0.2",
 * 		"g": "RG995Y50@gmail.com",
 * 		"mid": "m706",
 * 		"nw": "3G",
 * 		"l": "es",
 * 		"vc": "8",
 * 		"hw": "750*1134",
 * 		"ar": "MX",
 * 		"uid": "u793",
 * 		"t": "1576719015961",
 * 		"la": "-21.7",
 * 		"md": "HTC-16",
 * 		"vn": "1.3.9",
 * 		"ba": "HTC",
 * 		"sr": "Z"
 *   },
 * 	"ap": "gmall",
 * 	"et": []
 * }
 * @Author MAlone
 * @Date 2019/12/20
 * @Version V1.0
 **/
public class BaseFieldUDF extends UDF {
    public String evaluate(String line,String jsonKeysStr){

        StringBuilder sb = new StringBuilder();
        // 1. Split the key list: mid uid vc vn l sr os ar md ...
        String[] jsonKeys = jsonKeysStr.split(",");

        // 2. Split the raw line on '|' (server_time | json)
        String[] split = line.split("\\|");

        // 3. Validity check
        if (split.length != 2 || StringUtils.isBlank(split[0])) {
            return "";
        }
        // 4. Parse the JSON payload
        try {
            JSONObject jsonObject = new JSONObject(split[1]);

            // 5. Get the cm (common fields) object
            JSONObject base = jsonObject.getJSONObject("cm");
            // 6. Iterate over the keys, appending each value
            for (int i = 0; i < jsonKeys.length; i++) {
                String fieldName = jsonKeys[i].trim();

                if (base.has(fieldName)) {
                    sb.append(base.getString(fieldName)).append("\t");
                } else {
                    sb.append("").append("\t");
                }
            }
            sb.append(jsonObject.getString("et")).append("\t");
            sb.append(split[0]).append("\t");

        } catch (JSONException e) {
            e.printStackTrace();
        }

        return sb.toString();
    }
}

2.1.2 UDTF for Parsing Event Fields

package com.lz.udtf;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.json.JSONArray;
import org.json.JSONException;

import java.util.ArrayList;

/**
 * @ClassName EventJsonUDTF
 * @Description:
 * [{
 * 		"ett": "1576719842455",
 * 		"en": "display",
 * 		"kv": {
 * 			"newsid": "n057",
 * 			"action": "1",
 * 			"extend1": "1",
 * 			"place": "4",
 * 			"category": "6"
 *          }
 *  },
 *  {
 * 		"ett": "1576676303669",
 * 		"en": "newsdetail",
 * 		"kv": {
 * 			"entry": "3",
 * 			"newsid": "n993",
 * 			"news_staytime": "12",
 * 			"loading_time": "6",
 * 			"action": "2",
 * 			"showtype": "4",
 * 			"category": "40",
 * 			"type1": "325"
 *        }
 *  }]
 * @Author MAlone
 * @Date 2019/12/20
 * @Version V1.0
 **/
public class EventJsonUDTF extends GenericUDTF {

    @Override
    public StructObjectInspector initialize(ObjectInspector[] argOIs) throws UDFArgumentException {
        // Declare the names and types of the output columns
        ArrayList<String> fieldNames = new ArrayList<>();
        ArrayList<ObjectInspector> fieldOIS = new ArrayList<>();

        fieldNames.add("event_name");
        fieldOIS.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        fieldNames.add("event_json");
        fieldOIS.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);

        return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIS);
    }

    @Override
    public void process(Object[] objects) throws HiveException {
        // Get the incoming et (event array) string
        String input = objects[0].toString();
        if (StringUtils.isBlank(input)) {
            return;
        } else {
            try {
                JSONArray jsonArray = new JSONArray(input);
                if (jsonArray == null) {
                    return;
                }
                for (int i = 0; i < jsonArray.length(); i++) {
                    String[] result = new String[2];
                    // Extract the event name
                    result[0] = jsonArray.getJSONObject(i).getString("en");
                    // Extract the full event JSON
                    result[1] = jsonArray.getString(i);
                    // Emit one row per event
                    forward(result);
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }
    @Override
    public void close() throws HiveException {

    }
}
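
Both functions must be registered in the Hive session before the DWD loads below can call them. A minimal smoke test, using the jar path and function names from the script in 2.2.3 (the test query itself is just an illustrative spot check):

add jar /opt/module/hive/dwd-etl-1.0-SNAPSHOT.jar;

create temporary function base_analizer as 'com.lz.udf.BaseFieldUDF';
create temporary function flat_analizer as 'com.lz.udtf.EventJsonUDTF';

-- spot-check: the first tab-separated field should be the device id
select split(base_analizer(line, 'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'), '\t')[0] as mid_id
from ods_start_log
where dt = '2019-12-19'
limit 10;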

2.2 Base Detail Tables

2.2.1 Start-Log Base Detail Table

  1. Create the table
drop table if exists dwd_base_start_log;
CREATE EXTERNAL TABLE `dwd_base_start_log`
(
    `mid_id`       string,
    `user_id`      string,
    `version_code` string,
    `version_name` string,
    `lang`         string,
    `source`       string,
    `os`           string,
    `area`         string,
    `model`        string,
    `brand`        string,
    `sdk_version`  string,
    `gmail`        string,
    `height_width` string,
    `app_time`     string,
    `network`      string,
    `lng`          string,
    `lat`          string,
    `event_name`   string,
    `event_json`   string,
    `server_time`  string
)
    PARTITIONED BY (`dt` string)
    stored as parquet
    location '/warehouse/gmall/dwd/dwd_base_start_log/';
  2. Load the data
insert overwrite table dwd_base_start_log
    PARTITION (dt)
select
    mid_id,
    user_id,
    version_code,
    version_name,
    lang,
    source ,
    os ,
    area ,
    model ,
    brand ,
    sdk_version ,
    gmail ,
    height_width ,
    app_time ,
    network ,
    lng ,
    lat ,
    event_name ,
    event_json ,
    server_time ,
    dt
from
    (
        select
            split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[0]   as mid_id,
            split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[1]   as user_id,
            split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[2]   as version_code,
            split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[3]   as version_name,
            split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[4]   as lang,
            split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[5]   as source,
            split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[6]   as os,
            split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[7]   as area,
            split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[8]   as model,
            split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[9]   as brand,
            split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[10]   as sdk_version,
            split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[11]  as gmail,
            split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[12]  as height_width,
            split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[13]  as app_time,
            split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[14]  as network,
            split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[15]  as lng,
            split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[16]  as lat,
            split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[17]  as ops,
            split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[18]  as server_time,
            dt
        from ods_start_log where dt='2019-12-19'  and base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la')<>''
    ) sdk_log lateral view flat_analizer(ops) tmp_k as event_name, event_json;

2.2.2 Event-Log Base Detail Table

  1. Create the table
drop table if exists dwd_base_event_log;
CREATE EXTERNAL TABLE `dwd_base_event_log`
(
    `mid_id`       string,
    `user_id`      string,
    `version_code` string,
    `version_name` string,
    `lang`         string,
    `source`       string,
    `os`           string,
    `area`         string,
    `model`        string,
    `brand`        string,
    `sdk_version`  string,
    `gmail`        string,
    `height_width` string,
    `app_time`     string,
    `network`      string,
    `lng`          string,
    `lat`          string,
    `event_name`   string,
    `event_json`   string,
    `server_time`  string
)
    PARTITIONED BY (`dt` string)
    stored as parquet
    location '/warehouse/gmall/dwd/dwd_base_event_log/';
  2. Load the data
insert overwrite table dwd_base_event_log
    PARTITION (dt)
select
    mid_id,
    user_id,
    version_code,
    version_name,
    lang,
    source ,
    os ,
    area ,
    model ,
    brand ,
    sdk_version ,
    gmail ,
    height_width ,
    app_time ,
    network ,
    lng ,
    lat ,
    event_name ,
    event_json ,
    server_time ,
    dt
from
    (
        select
            split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[0]   as mid_id,
            split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[1]   as user_id,
            split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[2]   as version_code,
            split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[3]   as version_name,
            split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[4]   as lang,
            split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[5]   as source,
            split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[6]   as os,
            split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[7]   as area,
            split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[8]   as model,
            split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[9]   as brand,
            split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[10]   as sdk_version,
            split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[11]  as gmail,
            split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[12]  as height_width,
            split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[13]  as app_time,
            split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[14]  as network,
            split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[15]  as lng,
            split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[16]  as lat,
            split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[17]  as ops,
            split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[18]  as server_time,
            dt
        from ods_event_log where dt='2019-12-19'  and base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la')<>''
    ) sdk_log lateral view flat_analizer(ops) tmp_k as event_name, event_json;

2.2.3 DWD Base Detail Parsing Script

  • dwd_base.sh
#!/bin/bash

# Variables, defined here for easy modification
APP=gmall
hive=/opt/module/hive/bin/hive

# If a date argument is given, use it; otherwise default to yesterday.
# "$1" must be quoted: [ -n $1 ] is always true when no argument is passed.
if [ -n "$1" ]; then
	log_date=$1
else
	log_date=`date -d "-1 day" +%F`
fi

sql="
	add jar /opt/module/hive/dwd-etl-1.0-SNAPSHOT.jar;

	create temporary function base_analizer as 'com.lz.udf.BaseFieldUDF';
	create temporary function flat_analizer as 'com.lz.udtf.EventJsonUDTF';

 	set hive.exec.dynamic.partition.mode=nonstrict;

	insert overwrite table "$APP".dwd_base_start_log 
	PARTITION (dt)
	select
	mid_id,
	user_id,
	version_code,
	version_name,
	lang,
	source ,
	os ,
	area ,
	model ,
	brand ,
	sdk_version ,
	gmail ,
	height_width ,
	app_time ,
	network ,
	lng ,
	lat ,
	event_name , 
	event_json , 
	server_time , 
	dt  
	 from
	(
	select
	split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[0]   as mid_id,
	split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[1]   as user_id,
	split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[2]   as version_code,
	split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[3]   as version_name,
	split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[4]   as lang,
	split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[5]   as source,
	split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[6]   as os,
	split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[7]   as area,
	split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[8]   as model,
	split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[9]   as brand,
	split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[10]   as sdk_version,
	split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[11]  as gmail,
	split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[12]  as height_width,
	split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[13]  as app_time,
	split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[14]  as network,
	split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[15]  as lng,
	split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[16]  as lat,
	split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[17]  as ops,
	split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[18]  as server_time,
	dt 
	from "$APP".ods_start_log where dt='$log_date'  and base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la')<>'' 
	) sdk_log lateral view flat_analizer(ops) tmp_k as event_name, event_json;

	insert overwrite table "$APP".dwd_base_event_log 
	PARTITION (dt)
	select
	mid_id,
	user_id,
	version_code,
	version_name,
	lang,
	source ,
	os ,
	area ,
	model ,
	brand ,
	sdk_version ,
	gmail ,
	height_width ,
	app_time ,
	network ,
	lng ,
	lat ,
	event_name , 
	event_json , 
	server_time , 
	dt  
	 from
	(
	select
	split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[0]   as mid_id,
	split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[1]   as user_id,
	split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[2]   as version_code,
	split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[3]   as version_name,
	split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[4]   as lang,
	split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[5]   as source,
	split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[6]   as os,
	split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[7]   as area,
	split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[8]   as model,
	split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[9]   as brand,
	split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[10]   as sdk_version,
	split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[11]  as gmail,
	split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[12]  as height_width,
	split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[13]  as app_time,
	split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[14]  as network,
	split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[15]  as lng,
	split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[16]  as lat,
	split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[17]  as ops,
	split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[18]  as server_time,
	dt 
	from "$APP".ods_event_log where dt='$log_date'  and base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la')<>'' 
	) sdk_log lateral view flat_analizer(ops) tmp_k as event_name, event_json;
"

$hive -e "$sql"
  • Script usage
[yanlzh@node11 module]$ dwd_base.sh 2019-02-11

2.3 Event-Specific Tables

2.3.1 Item Display (Click) Table

(The DDLs in this section list only the event-specific fields plus server_time; note that the load script in 2.3.13 also writes the seventeen common fields from mid_id through lat, so the production DDLs need those columns as well.)

drop table if exists dwd_display_log;
CREATE EXTERNAL TABLE `dwd_display_log`
(
    action         string,
    newsid         string,
    place          string,
    extend1        string,
    category       string,
    `server_time`  string
)
    PARTITIONED BY (dt string)
    location '/warehouse/gmall/dwd/dwd_display_log/';
insert overwrite table dwd_display_log
    PARTITION (dt)
select
       get_json_object(event_json, '$.kv.action')   action,
       get_json_object(event_json, '$.kv.newsid')   newsid,
       get_json_object(event_json, '$.kv.place')    place,
       get_json_object(event_json, '$.kv.extend1')  extend1,
       get_json_object(event_json, '$.kv.category') category,
       server_time,
       dt
from dwd_base_event_log
where dt = '2019-12-19'
  and event_name = 'display';

2.3.2 Item Detail Table

drop table if exists dwd_newsdetail_log;
CREATE EXTERNAL TABLE `dwd_newsdetail_log`
(
    entry          string,
    action         string,
    newsid         string,
    showtype       string,
    news_staytime  string,
    loading_time   string,
    type1          string,
    category       string,
    `server_time`  string
)
    PARTITIONED BY (dt string)
    location '/warehouse/gmall/dwd/dwd_newsdetail_log/';
insert overwrite table dwd_newsdetail_log
    PARTITION (dt)
select
       get_json_object(event_json, '$.kv.entry')         entry,
       get_json_object(event_json, '$.kv.action')        action,
       get_json_object(event_json, '$.kv.newsid')        newsid,
       get_json_object(event_json, '$.kv.showtype')      showtype,
       get_json_object(event_json, '$.kv.news_staytime') news_staytime,
       get_json_object(event_json, '$.kv.loading_time')  loading_time,
       get_json_object(event_json, '$.kv.type1')         type1,
       get_json_object(event_json, '$.kv.category')      category,
       server_time,
       dt
from dwd_base_event_log
where dt = '2019-12-19'
  and event_name = 'newsdetail';

2.3.3 Item List Table (loading event)



2.3.4 Ad Table



2.3.5 Notification Table



2.3.6 User Foreground-Active Table



2.3.7 User Background-Active Table



2.3.8 Comment Table



2.3.9 Favorites Table



2.3.10 Praise (Like) Table



2.3.11 Start Log Table



2.3.12 Error Log Table

(The DDLs for 2.3.3-2.3.12 follow the same pattern as the two tables above; their corresponding inserts appear in the load script in 2.3.13.)



2.3.13 DWD Layer Data Load Script

  • dwd_event.sh
#!/bin/bash

# Variables, defined here for easy modification
APP=gmall
hive=/opt/module/hive/bin/hive

# If a date argument is given, use it; otherwise default to yesterday.
# "$1" must be quoted: [ -n $1 ] is always true when no argument is passed.
if [ -n "$1" ]; then
	log_date=$1
else
	log_date=`date -d "-1 day" +%F`
fi
sql="

set hive.exec.dynamic.partition.mode=nonstrict;

insert overwrite table "$APP".dwd_display_log
PARTITION (dt)
select 
	mid_id,
	user_id,
	version_code,
	version_name,
	lang,
	source,
	os,
	area,
	model,
	brand,
	sdk_version,
	gmail,
	height_width,
	app_time,
	network,
	lng,
	lat,
	get_json_object(event_json,'$.kv.action') action,
	get_json_object(event_json,'$.kv.newsid') newsid,
	get_json_object(event_json,'$.kv.place') place,
	get_json_object(event_json,'$.kv.extend1') extend1,
	get_json_object(event_json,'$.kv.category') category,
	server_time,
	dt
from "$APP".dwd_base_event_log 
where dt='$log_date' and event_name='display';


insert overwrite table "$APP".dwd_newsdetail_log
PARTITION (dt)
select 
	mid_id,
	user_id,
	version_code,
	version_name,
	lang,
	source,
	os,
	area,
	model,
	brand,
	sdk_version,
	gmail,
	height_width,
	app_time,
	network,
	lng,
	lat,
	get_json_object(event_json,'$.kv.entry') entry,
	get_json_object(event_json,'$.kv.action') action,
	get_json_object(event_json,'$.kv.newsid') newsid,
	get_json_object(event_json,'$.kv.showtype') showtype,
	get_json_object(event_json,'$.kv.news_staytime') news_staytime,
	get_json_object(event_json,'$.kv.loading_time') loading_time,
	get_json_object(event_json,'$.kv.type1') type1,
	get_json_object(event_json,'$.kv.category') category,
	server_time,
	dt
from "$APP".dwd_base_event_log 
where dt='$log_date' and event_name='newsdetail';


insert overwrite table "$APP".dwd_loading_log
PARTITION (dt)
select 
	mid_id,
	user_id,
	version_code,
	version_name,
	lang,
	source,
	os,
	area,
	model,
	brand,
	sdk_version,
	gmail,
	height_width,
	app_time,
	network,
	lng,
	lat,
	get_json_object(event_json,'$.kv.action') action,
	get_json_object(event_json,'$.kv.loading_time') loading_time,
	get_json_object(event_json,'$.kv.loading_way') loading_way,
	get_json_object(event_json,'$.kv.extend1') extend1,
	get_json_object(event_json,'$.kv.extend2') extend2,
	get_json_object(event_json,'$.kv.type') type,
	get_json_object(event_json,'$.kv.type1') type1,
	server_time,
	dt
from "$APP".dwd_base_event_log 
where dt='$log_date' and event_name='loading';


insert overwrite table "$APP".dwd_ad_log
PARTITION (dt)
select 
	mid_id,
	user_id,
	version_code,
	version_name,
	lang,
	source,
	os,
	area,
	model,
	brand,
	sdk_version,
	gmail,
	height_width,
	app_time,
	network,
	lng,
	lat,
	get_json_object(event_json,'$.kv.entry') entry,
	get_json_object(event_json,'$.kv.action') action,
	get_json_object(event_json,'$.kv.content') content,
	get_json_object(event_json,'$.kv.detail') detail,
	get_json_object(event_json,'$.kv.source') ad_source,
	get_json_object(event_json,'$.kv.behavior') behavior,
	get_json_object(event_json,'$.kv.newstype') newstype,
	get_json_object(event_json,'$.kv.show_style') show_style,
	server_time,
	dt
from "$APP".dwd_base_event_log 
where dt='$log_date' and event_name='ad';


insert overwrite table "$APP".dwd_notification_log
PARTITION (dt)
select 
	mid_id,
	user_id,
	version_code,
	version_name,
	lang,
	source,
	os,
	area,
	model,
	brand,
	sdk_version,
	gmail,
	height_width,
	app_time,
	network,
	lng,
	lat,
	get_json_object(event_json,'$.kv.action') action,
	get_json_object(event_json,'$.kv.noti_type') noti_type,
	get_json_object(event_json,'$.kv.ap_time') ap_time,
	get_json_object(event_json,'$.kv.content') content,
	server_time,
	dt
from "$APP".dwd_base_event_log 
where dt='$log_date' and event_name='notification';


insert overwrite table "$APP".dwd_active_foreground_log
PARTITION (dt)
select 
	mid_id,
	user_id,
	version_code,
	version_name,
	lang,
	source,
	os,
	area,
	model,
	brand,
	sdk_version,
	gmail,
	height_width,
	app_time,
	network,
	lng,
	lat,
	get_json_object(event_json,'$.kv.active_source') active_source,
	server_time,
	dt
from "$APP".dwd_base_event_log 
where dt='$log_date' and event_name='active_foreground';


insert overwrite table "$APP".dwd_active_background_log
PARTITION (dt)
select 
	mid_id,
	user_id,
	version_code,
	version_name,
	lang,
	source,
	os,
	area,
	model,
	brand,
	sdk_version,
	gmail,
	height_width,
	app_time,
	network,
	lng,
	lat,
	get_json_object(event_json,'$.kv.active_source') active_source,
	server_time,
	dt
from "$APP".dwd_base_event_log 
where dt='$log_date' and event_name='active_background';


insert overwrite table "$APP".dwd_comment_log
PARTITION (dt)
select 
	mid_id,
	user_id,
	version_code,
	version_name,
	lang,
	source,
	os,
	area,
	model,
	brand,
	sdk_version,
	gmail,
	height_width,
	app_time,
	network,
	lng,
	lat,
	get_json_object(event_json,'$.kv.comment_id') comment_id,
	get_json_object(event_json,'$.kv.userid') userid,
	get_json_object(event_json,'$.kv.p_comment_id') p_comment_id,
	get_json_object(event_json,'$.kv.content') content,
	get_json_object(event_json,'$.kv.addtime') addtime,
	get_json_object(event_json,'$.kv.other_id') other_id,
	get_json_object(event_json,'$.kv.praise_count') praise_count,
	get_json_object(event_json,'$.kv.reply_count') reply_count,
	server_time,
	dt
from "$APP".dwd_base_event_log 
where dt='$log_date' and event_name='comment';


insert overwrite table "$APP".dwd_favorites_log
PARTITION (dt)
select 
	mid_id,
	user_id,
	version_code,
	version_name,
	lang,
	source,
	os,
	area,
	model,
	brand,
	sdk_version,
	gmail,
	height_width,
	app_time,
	network,
	lng,
	lat,
	get_json_object(event_json,'$.kv.id') id,
	get_json_object(event_json,'$.kv.course_id') course_id,
	get_json_object(event_json,'$.kv.userid') userid,
	get_json_object(event_json,'$.kv.add_time') add_time,
	server_time,
	dt
from "$APP".dwd_base_event_log 
where dt='$log_date' and event_name='favorites';


insert overwrite table "$APP".dwd_praise_log
PARTITION (dt)
select 
	mid_id,
	user_id,
	version_code,
	version_name,
	lang,
	source,
	os,
	area,
	model,
	brand,
	sdk_version,
	gmail,
	height_width,
	app_time,
	network,
	lng,
	lat,
	get_json_object(event_json,'$.kv.id') id,
	get_json_object(event_json,'$.kv.userid') userid,
	get_json_object(event_json,'$.kv.target_id') target_id,
	get_json_object(event_json,'$.kv.type') type,
	get_json_object(event_json,'$.kv.add_time') add_time,
	server_time,
	dt
from "$APP".dwd_base_event_log 
where dt='$log_date' and event_name='praise';


insert overwrite table "$APP".dwd_start_log
PARTITION (dt)
select 
	mid_id,
	user_id,
	version_code,
	version_name,
	lang,
	source,
	os,
	area,
	model,
	brand,
	sdk_version,
	gmail,
	height_width,
	app_time,
	network,
	lng,
	lat,
	get_json_object(event_json,'$.kv.entry') entry,
	get_json_object(event_json,'$.kv.open_ad_type') open_ad_type,
	get_json_object(event_json,'$.kv.action') action,
	get_json_object(event_json,'$.kv.loading_time') loading_time,
	get_json_object(event_json,'$.kv.detail') detail,
	get_json_object(event_json,'$.kv.extend1') extend1,
	server_time,
	dt
from "$APP".dwd_base_start_log 
where dt='$log_date' and event_name='start';


insert overwrite table "$APP".dwd_error_log
PARTITION (dt)
select 
	mid_id,
	user_id,
	version_code,
	version_name,
	lang,
	source,
	os,
	area,
	model,
	brand,
	sdk_version,
	gmail,
	height_width,
	app_time,
	network,
	lng,
	lat,
	get_json_object(event_json,'$.kv.errorBrief') errorBrief,
	get_json_object(event_json,'$.kv.errorDetail') errorDetail,
	server_time,
	dt
from "$APP".dwd_base_event_log 
where dt='$log_date' and event_name='error';

"

$hive -e "$sql"

2.4 Summary

2.4.1 UDF

One row in, one value out: BaseFieldUDF in 2.1.1 extends org.apache.hadoop.hive.ql.exec.UDF and implements evaluate().

2.4.2 UDTF

One row in, multiple rows out: EventJsonUDTF in 2.1.2 extends GenericUDTF and implements initialize(), process(), and close().

2.4.3 Utility Classes

  1. StringUtils - blank/empty checks (StringUtils.isBlank)
  2. JSONObject - parse a JSON object and read its fields (getJSONObject, getString, has)
  3. JSONArray - parse a JSON array and iterate it (length, getJSONObject)

2.4.4 Column-to-Row: lateral view
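
lateral view pairs each source row with the rows a UDTF generates from it, so the exploded rows keep the original columns; this is exactly how flat_analizer fans out the event array in 2.2. A generic sketch using the built-in explode (toy literals, not data from the post):

select id, tag
from (select 1 as id, 'a,b,c' as tags) t
lateral view explode(split(tags, ',')) tmp as tag;
-- returns: (1, a), (1, b), (1, c)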

2.4.5 Hive Built-in Function get_json_object
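
get_json_object extracts a value from a JSON string by JSONPath expression; the event tables in 2.3 use it to unpack the fields inside event_json. For example:

select get_json_object('{"en":"display","kv":{"newsid":"n057","place":"4"}}', '$.kv.newsid');
-- returns n057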

Chapter 3: The DWS Layer

Data in the DWS layer is built around concrete business requirements.

3.1 User Activity

  • Active user
    Any user who opens the app counts as active, regardless of how the app is used. A device that opens the app several times in one day is still counted as one active user.
  • The collect_set() function

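collect_set aggregates a column's values within each group into a deduplicated array, so indexing with [0] keeps exactly one representative value per device. A minimal sketch with toy literals:

select collect_set(uid)    as all_uids,  -- ["u1","u2"]: duplicates removed
       collect_set(uid)[0] as one_uid    -- "u1": one value per group
from (
    select 'u1' as uid union all
    select 'u2' as uid union all
    select 'u1' as uid
) t;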

3.1.1 Daily Active Device Detail

Aggregate with the device's single-day visit as the key. If one device shows up in a day with two operating systems, two OS versions, multiple regions, or different logged-in accounts, keep only one value for each field.

drop table if exists dws_uv_detail_day;
create table dws_uv_detail_day
(
    `mid_id`       string COMMENT 'unique device id',
    `user_id`      string COMMENT 'user id',
    `version_code` string COMMENT 'app version code',
    `version_name` string COMMENT 'app version name',
    `lang`         string COMMENT 'system language',
    `source`       string COMMENT 'channel id',
    `os`           string COMMENT 'Android OS version',
    `area`         string COMMENT 'region',
    `model`        string COMMENT 'phone model',
    `brand`        string COMMENT 'phone brand',
    `sdk_version`  string COMMENT 'sdkVersion',
    `gmail`        string COMMENT 'gmail',
    `height_width` string COMMENT 'screen height and width',
    `app_time`     string COMMENT 'client-side log timestamp',
    `network`      string COMMENT 'network type',
    `lng`          string COMMENT 'longitude',
    `lat`          string COMMENT 'latitude'
) COMMENT 'daily active device detail'
    PARTITIONED BY ( `dt` string)
    stored as parquet
    location '/warehouse/gmall/dws/dws_uv_detail_day/'
;
insert overwrite table dws_uv_detail_day partition (dt)
select mid_id,
       collect_set(user_id)[0]      user_id,
       collect_set(version_code)[0] version_code,
       collect_set(version_name)[0] version_name,
       collect_set(lang)[0]         lang,
       collect_set(source)[0]       source,
       collect_set(os)[0]           os,
       collect_set(area)[0]         area,
       collect_set(model)[0]        model,
       collect_set(brand)[0]        brand,
       collect_set(sdk_version)[0]  sdk_version,
       collect_set(gmail)[0]        gmail,
       collect_set(height_width)[0] height_width,
       collect_set(app_time)[0]     app_time,
       collect_set(network)[0]      network,
       collect_set(lng)[0]          lng,
       collect_set(lat)[0]          lat,
       '2019-12-19'
from dwd_start_log
where dt = '2019-12-19'
group by mid_id;

3.1.2 Weekly Active Device Detail

drop table if exists dws_uv_detail_wk;

create table dws_uv_detail_wk
(
    `mid_id`       string COMMENT 'unique device id',
    `user_id`      string COMMENT 'user id',
    `version_code` string COMMENT 'app version code',
    `version_name` string COMMENT 'app version name',
    `lang`         string COMMENT 'system language',
    `source`       string COMMENT 'channel id',
    `os`           string COMMENT 'Android OS version',
    `area`         string COMMENT 'region',
    `model`        string COMMENT 'phone model',
    `brand`        string COMMENT 'phone brand',
    `sdk_version`  string COMMENT 'sdkVersion',
    `gmail`        string COMMENT 'gmail',
    `height_width` string COMMENT 'screen height and width',
    `app_time`     string COMMENT 'client-side log timestamp',
    `network`      string COMMENT 'network type',
    `lng`          string COMMENT 'longitude',
    `lat`          string COMMENT 'latitude',
    `monday_date`  string COMMENT 'Monday of the week',
    `sunday_date`  string COMMENT 'Sunday of the week'
) COMMENT 'weekly active device detail'
    PARTITIONED BY (`wk_dt` string)
    stored as parquet
    location '/warehouse/gmall/dws/dws_uv_detail_wk/'
;
insert overwrite table dws_uv_detail_wk partition (wk_dt)
select mid_id,
       collect_set(user_id)[0]      user_id,
       collect_set(version_code)[0] version_code,
       collect_set(version_name)[0] version_name,
       collect_set(lang)[0]         lang,
       collect_set(source)[0]       source,
       collect_set(os)[0]           os,
       collect_set(area)[0]         area,
       collect_set(model)[0]        model,
       collect_set(brand)[0]        brand,
       collect_set(sdk_version)[0]  sdk_version,
       collect_set(gmail)[0]        gmail,
       collect_set(height_width)[0] height_width,
       collect_set(app_time)[0]     app_time,
       collect_set(network)[0]      network,
       collect_set(lng)[0]          lng,
       collect_set(lat)[0]          lat,
       date_add(next_day('2019-12-19', 'MO'), -7),
       date_add(next_day('2019-12-19', 'MO'), -1),
       concat(date_add(next_day('2019-12-19', 'MO'), -7), '_', date_add(next_day('2019-12-19', 'MO'), -1))
from dws_uv_detail_day
where dt >= date_add(next_day('2019-12-19', 'MO'), -7)
  and dt <= date_add(next_day('2019-12-19', 'MO'), -1)
group by mid_id;

3.1.3 Monthly Active Device Detail

drop table if exists dws_uv_detail_mn;

create external table dws_uv_detail_mn
(
    `mid_id`       string COMMENT 'unique device id',
    `user_id`      string COMMENT 'user id',
    `version_code` string COMMENT 'app version code',
    `version_name` string COMMENT 'app version name',
    `lang`         string COMMENT 'system language',
    `source`       string COMMENT 'channel id',
    `os`           string COMMENT 'Android OS version',
    `area`         string COMMENT 'region',
    `model`        string COMMENT 'phone model',
    `brand`        string COMMENT 'phone brand',
    `sdk_version`  string COMMENT 'sdkVersion',
    `gmail`        string COMMENT 'gmail',
    `height_width` string COMMENT 'screen height and width',
    `app_time`     string COMMENT 'client-side log timestamp',
    `network`      string COMMENT 'network type',
    `lng`          string COMMENT 'longitude',
    `lat`          string COMMENT 'latitude'
) COMMENT 'monthly active device detail'
    PARTITIONED BY (`mn` string)
    stored as parquet
    location '/warehouse/gmall/dws/dws_uv_detail_mn/'
;
insert overwrite table dws_uv_detail_mn partition (mn)
select mid_id,
       collect_set(user_id)[0]      user_id,
       collect_set(version_code)[0] version_code,
       collect_set(version_name)[0] version_name,
       collect_set(lang)[0]         lang,
       collect_set(source)[0]       source,
       collect_set(os)[0]           os,
       collect_set(area)[0]         area,
       collect_set(model)[0]        model,
       collect_set(brand)[0]        brand,
       collect_set(sdk_version)[0]  sdk_version,
       collect_set(gmail)[0]        gmail,
       collect_set(height_width)[0] height_width,
       collect_set(app_time)[0]     app_time,
       collect_set(network)[0]      network,
       collect_set(lng)[0]          lng,
       collect_set(lat)[0]          lat,
       date_format('2019-12-19', 'yyyy-MM')
from dws_uv_detail_day
where date_format(dt, 'yyyy-MM') = date_format('2019-12-19', 'yyyy-MM')
group by mid_id;

3.1.4 Script

#!/bin/bash

# Variables, defined here for easy modification
APP=gmall
hive=/opt/module/hive/bin/hive

# If a date argument is given, use it; otherwise default to yesterday.
# "$1" must be quoted: [ -n $1 ] is always true when no argument is passed.
if [ -n "$1" ]; then
	log_date=$1
else
	log_date=`date -d "-1 day" +%F`
fi


sql="
  set hive.exec.dynamic.partition.mode=nonstrict;

  insert overwrite table "$APP".dws_uv_detail_day partition(dt='$log_date')
  select  
    mid_id,
    collect_set(user_id)[0] user_id,
    collect_set(version_code)[0] version_code,
    collect_set(version_name)[0] version_name,
    collect_set(lang)[0] lang,
    collect_set(source)[0] source,
    collect_set(os)[0] os,
    collect_set(area)[0] area, 
    collect_set(model)[0] model,
    collect_set(brand)[0] brand,
    collect_set(sdk_version)[0] sdk_version,
    collect_set(gmail)[0] gmail,
    collect_set(height_width)[0] height_width,
    collect_set(app_time)[0] app_time,
    collect_set(network)[0] network,
    collect_set(lng)[0] lng,
    collect_set(lat)[0] lat
  from "$APP".dwd_start_log
  where dt='$log_date'  
  group by mid_id;


  insert  overwrite table "$APP".dws_uv_detail_wk partition(wk_dt)
  select  
    mid_id,
    collect_set(user_id)[0] user_id,
    collect_set(version_code)[0] version_code,
    collect_set(version_name)[0] version_name,
    collect_set(lang)[0] lang,
    collect_set(source)[0] source,
    collect_set(os)[0] os,
    collect_set(area)[0] area, 
    collect_set(model)[0] model,
    collect_set(brand)[0] brand,
    collect_set(sdk_version)[0] sdk_version,
    collect_set(gmail)[0] gmail,
    collect_set(height_width)[0] height_width,
    collect_set(app_time)[0] app_time,
    collect_set(network)[0] network,
    collect_set(lng)[0] lng,
    collect_set(lat)[0] lat,
   date_add(next_day('$log_date','MO'),-7),
   date_add(next_day('$log_date','MO'),-1),
   concat(date_add(next_day('$log_date','MO'),-7), '_', date_add(next_day('$log_date','MO'),-1))
  from "$APP".dws_uv_detail_day 
  where dt>=date_add(next_day('$log_date','MO'),-7) and dt<=date_add(next_day('$log_date','MO'),-1) 
  group by mid_id;


  insert overwrite table "$APP".dws_uv_detail_mn partition(mn)
  select  
    mid_id,
    collect_set(user_id)[0] user_id,
    collect_set(version_code)[0] version_code,
    collect_set(version_name)[0] version_name,
    collect_set(lang)[0] lang,
    collect_set(source)[0] source,
    collect_set(os)[0] os,
    collect_set(area)[0] area, 
    collect_set(model)[0] model,
    collect_set(brand)[0] brand,
    collect_set(sdk_version)[0] sdk_version,
    collect_set(gmail)[0] gmail,
    collect_set(height_width)[0] height_width,
    collect_set(app_time)[0] app_time,
    collect_set(network)[0] network,
    collect_set(lng)[0] lng,
    collect_set(lat)[0] lat,
    date_format('$log_date','yyyy-MM')
  from "$APP".dws_uv_detail_day
  where date_format(dt,'yyyy-MM') = date_format('$log_date','yyyy-MM')   
  group by mid_id;
"

$hive -e "$sql"

3.2 New Users

3.2.1 Daily New Device Detail

drop table if exists `dws_new_mid_day`;
create table `dws_new_mid_day`
(
    `mid_id`       string COMMENT 'unique device id',
    `user_id`      string COMMENT 'user id',
    `version_code` string COMMENT 'app version code',
    `version_name` string COMMENT 'app version name',
    `lang`         string COMMENT 'system language',
    `source`       string COMMENT 'channel id',
    `os`           string COMMENT 'Android OS version',
    `area`         string COMMENT 'region',
    `model`        string COMMENT 'phone model',
    `brand`        string COMMENT 'phone brand',
    `sdk_version`  string COMMENT 'sdkVersion',
    `gmail`        string COMMENT 'gmail',
    `height_width` string COMMENT 'screen height and width',
    `app_time`     string COMMENT 'client-side log timestamp',
    `network`      string COMMENT 'network type',
    `lng`          string COMMENT 'longitude',
    `lat`          string COMMENT 'latitude',
    `create_date`  string comment 'creation date'
) COMMENT 'daily new device detail'
    stored as parquet
    location '/warehouse/gmall/dws/dws_new_mid_day/';

A device counts as new only if it appears in today's active detail (dws_uv_detail_day) but not yet in the accumulated new-device table: left join the two and keep the rows where the right side is null.

insert into table dws_new_mid_day
select ud.mid_id,
       ud.user_id,
       ud.version_code,
       ud.version_name,
       ud.lang,
       ud.source,
       ud.os,
       ud.area,
       ud.model,
       ud.brand,
       ud.sdk_version,
       ud.gmail,
       ud.height_width,
       ud.app_time,
       ud.network,
       ud.lng,
       ud.lat,
       '2019-12-19'
from dws_uv_detail_day ud
         left join dws_new_mid_day nm on ud.mid_id = nm.mid_id
where ud.dt = '2019-12-19'
  and nm.mid_id is null;

3.2.2 Script

#!/bin/bash

# Variables, defined here for easy modification
APP=gmall
hive=/opt/module/hive/bin/hive

# If a date argument is given, use it; otherwise default to yesterday.
# "$1" must be quoted: [ -n $1 ] is always true when no argument is passed.
if [ -n "$1" ]; then
	log_date=$1
else
	log_date=`date -d "-1 day" +%F`
fi


sql="
set hive.exec.dynamic.partition.mode=nonstrict;
  insert into table "$APP".dws_new_mid_day
select ud.mid_id,
       ud.user_id,
       ud.version_code,
       ud.version_name,
       ud.lang,
       ud.source,
       ud.os,
       ud.area,
       ud.model,
       ud.brand,
       ud.sdk_version,
       ud.gmail,
       ud.height_width,
       ud.app_time,
       ud.network,
       ud.lng,
       ud.lat,
       '$log_date'
from "$APP".dws_uv_detail_day ud
         left join "$APP".dws_new_mid_day nm on ud.mid_id = nm.mid_id
where ud.dt = '$log_date'
  and nm.mid_id is null;
"

$hive -e "$sql"

3.3 User Retention

3.3.0 Retention Concepts

  1. Retained users:
    users who were new (or active) in some period and are still using the app after a given interval
  2. Retention rate:
    retained users as a share of the users who were new (or active) at that time. For example, if 100 devices are new on day N and 30 of them are active on day N+1, the 1-day retention rate is 30%.

3.3.1 Daily Retained User Detail

drop table if exists `dws_user_retention_day`;
create table `dws_user_retention_day`
(
    `mid_id`        string COMMENT 'unique device id',
    `user_id`       string COMMENT 'user id',
    `version_code`  string COMMENT 'app version code',
    `version_name`  string COMMENT 'app version name',
    `lang`          string COMMENT 'system language',
    `source`        string COMMENT 'channel id',
    `os`            string COMMENT 'Android OS version',
    `area`          string COMMENT 'region',
    `model`         string COMMENT 'phone model',
    `brand`         string COMMENT 'phone brand',
    `sdk_version`   string COMMENT 'sdkVersion',
    `gmail`         string COMMENT 'gmail',
    `height_width`  string COMMENT 'screen height and width',
    `app_time`      string COMMENT 'client-side log timestamp',
    `network`       string COMMENT 'network type',
    `lng`           string COMMENT 'longitude',
    `lat`           string COMMENT 'latitude',
    `create_date`   string comment 'device creation date',
    `retention_day` int comment 'days retained as of the current date'
) COMMENT 'daily user retention detail'
    PARTITIONED BY ( `dt` string)
    stored as parquet
    location '/warehouse/gmall/dws/dws_user_retention_day/'
;
  • Load the data (each day, compute the retention detail for the previous day's new users: devices added yesterday that are active today)
insert overwrite table dws_user_retention_day partition (dt = "2019-12-19")
select nm.mid_id,
       nm.user_id,
       nm.version_code,
       nm.version_name,
       nm.lang,
       nm.source,
       nm.os,
       nm.area,
       nm.model,
       nm.brand,
       nm.sdk_version,
       nm.gmail,
       nm.height_width,
       nm.app_time,
       nm.network,
       nm.lng,
       nm.lat,
       nm.create_date,
       1 retention_day
from dws_uv_detail_day ud
         join dws_new_mid_day nm on ud.mid_id = nm.mid_id
where ud.dt = '2019-12-19'
  and nm.create_date = date_add('2019-12-19', -1);

3.3.2 1-, 2-, 3-, n-Day Retained User Detail

  • Added one day ago and active today, added two days ago and active today, and so on.
  1. Tables:
    the new-device table
    the active-device table
  2. Conditions:
    (1) New-device table: creation date = current date - n days
    nm.create_date = date_add('2019-12-19', -n)
    (2) Active-device table: active date = current date (active today)
    ud.dt = '2019-12-19'
    (3) Join condition between the two tables
    ud.mid_id = nm.mid_id
insert overwrite table dws_user_retention_day partition (dt = "2019-12-19")
select nm.mid_id,
       nm.user_id,
       nm.version_code,
       nm.version_name,
       nm.lang,
       nm.source,
       nm.os,
       nm.area,
       nm.model,
       nm.brand,
       nm.sdk_version,
       nm.gmail,
       nm.height_width,
       nm.app_time,
       nm.network,
       nm.lng,
       nm.lat,
       nm.create_date,
       1 retention_day
from dws_uv_detail_day ud
         join dws_new_mid_day nm on ud.mid_id = nm.mid_id
where ud.dt = '2019-12-19'
  and nm.create_date = date_add('2019-12-19', -1)

union all
select nm.mid_id,
       nm.user_id,
       nm.version_code,
       nm.version_name,
       nm.lang,
       nm.source,
       nm.os,
       nm.area,
       nm.model,
       nm.brand,
       nm.sdk_version,
       nm.gmail,
       nm.height_width,
       nm.app_time,
       nm.network,
       nm.lng,
       nm.lat,
       nm.create_date,
       2 retention_day
from dws_uv_detail_day ud
         join dws_new_mid_day nm on ud.mid_id = nm.mid_id
where ud.dt = '2019-12-19'
  and nm.create_date = date_add('2019-12-19', -2)

union all
select nm.mid_id,
       nm.user_id,
       nm.version_code,
       nm.version_name,
       nm.lang,
       nm.source,
       nm.os,
       nm.area,
       nm.model,
       nm.brand,
       nm.sdk_version,
       nm.gmail,
       nm.height_width,
       nm.app_time,
       nm.network,
       nm.lng,
       nm.lat,
       nm.create_date,
       3 retention_day
from dws_uv_detail_day ud
         join dws_new_mid_day nm on ud.mid_id = nm.mid_id
where ud.dt = '2019-12-19'
  and nm.create_date = date_add('2019-12-19', -3);

3.3.3 Script

#!/bin/bash

# Variables, defined here for easy modification
APP=gmall
hive=/opt/module/hive/bin/hive

# If a date argument is given, use it; otherwise default to yesterday.
# "$1" must be quoted: [ -n $1 ] is always true when no argument is passed.
if [ -n "$1" ]; then
	log_date=$1
else
	log_date=`date -d "-1 day" +%F`
fi


sql="
set hive.exec.dynamic.partition.mode=nonstrict;
  insert overwrite table "$APP".dws_user_retention_day partition (dt = '$log_date')
select nm.mid_id,
       nm.user_id,
       nm.version_code,
       nm.version_name,
       nm.lang,
       nm.source,
       nm.os,
       nm.area,
       nm.model,
       nm.brand,
       nm.sdk_version,
       nm.gmail,
       nm.height_width,
       nm.app_time,
       nm.network,
       nm.lng,
       nm.lat,
       nm.create_date,
       1 retention_day
from "$APP".dws_uv_detail_day ud
         join "$APP".dws_new_mid_day nm on ud.mid_id = nm.mid_id
where ud.dt = '$log_date'
  and nm.create_date = date_add('$log_date', -1)

union all
select nm.mid_id,
       nm.user_id,
       nm.version_code,
       nm.version_name,
       nm.lang,
       nm.source,
       nm.os,
       nm.area,
       nm.model,
       nm.brand,
       nm.sdk_version,
       nm.gmail,
       nm.height_width,
       nm.app_time,
       nm.network,
       nm.lng,
       nm.lat,
       nm.create_date,
       2 retention_day
from "$APP".dws_uv_detail_day ud
         join "$APP".dws_new_mid_day nm on ud.mid_id = nm.mid_id
where ud.dt = '$log_date'
  and nm.create_date = date_add('$log_date', -2)

union all
select nm.mid_id,
       nm.user_id,
       nm.version_code,
       nm.version_name,
       nm.lang,
       nm.source,
       nm.os,
       nm.area,
       nm.model,
       nm.brand,
       nm.sdk_version,
       nm.gmail,
       nm.height_width,
       nm.app_time,
       nm.network,
       nm.lng,
       nm.lat,
       nm.create_date,
       3 retention_day
from "$APP".dws_uv_detail_day ud
         join "$APP".dws_new_mid_day nm on ud.mid_id = nm.mid_id
where ud.dt = '$log_date'
  and nm.create_date = date_add('$log_date', -3);
"

$hive -e "$sql"

3.4 Summary

3.4.1 Date Functions

1) date_format (format a date)

hive (gmall)> select date_format('2019-02-10','yyyy-MM');
2019-02

2) date_add (add or subtract days)

hive (gmall)> select date_add('2019-02-10',-1);
2019-02-09
hive (gmall)> select date_add('2019-02-10',1);
2019-02-11

3) next_day
(1) Monday of the next week:

hive (gmall)> select next_day('2019-02-12','MO');
2019-02-18

(2) Monday of the current week:

hive (gmall)> select date_add(next_day('2019-02-12','MO'),-7);
2019-02-11

4) last_day (last day of the month)

hive (gmall)> select last_day('2019-02-10');
2019-02-28

Chapter 4: The ADS Layer

4.1 User Activity

4.1.1 Active Device Counts

drop table if exists ads_uv_count;
create external table ads_uv_count
(
    `dt`          string COMMENT 'statistics date',
    `day_count`   bigint COMMENT 'active devices that day',
    `wk_count`    bigint COMMENT 'active devices that week',
    `mn_count`    bigint COMMENT 'active devices that month',
    `is_weekend`  string COMMENT 'Y/N end of week, used to pick the final weekly figure',
    `is_monthend` string COMMENT 'Y/N end of month, used to pick the final monthly figure'
) COMMENT 'daily active device counts'
    stored as parquet
    location '/warehouse/gmall/ads/ads_uv_count_day/'
;
insert overwrite table ads_uv_count
select '2019-12-19' dt,
       daycount.ct,
       wkcount.ct,
       mncount.ct,
       if(date_add(next_day('2019-12-19', 'MO'), -1) = '2019-12-19', 'Y', 'N'),
       if(last_day('2019-12-19') = '2019-12-19', 'Y', 'N')
from (
         select '2019-12-19' dt,
                count(*)     ct
         from dws_uv_detail_day
         where dt = '2019-12-19'
     ) daycount
         join
     (
         select '2019-12-19' dt,
                count(*)     ct
         from dws_uv_detail_wk
         where wk_dt =
               concat(date_add(next_day('2019-12-19', 'MO'), -7), '_', date_add(next_day('2019-12-19', 'MO'), -1))
     ) wkcount on daycount.dt = wkcount.dt
         join
     (
         select '2019-12-19' dt,
                count(*)     ct
         from dws_uv_detail_mn
         where mn = date_format('2019-12-19', 'yyyy-MM')
     ) mncount on daycount.dt = mncount.dt
;

4.1.2 Script

#!/bin/bash

# Variables, defined here for easy modification
APP=gmall
hive=/opt/module/hive/bin/hive

# If a date argument is given, use it; otherwise default to yesterday.
# "$1" must be quoted: [ -n $1 ] is always true when no argument is passed.
if [ -n "$1" ]; then
	log_date=$1
else
	log_date=`date -d "-1 day" +%F`
fi


sql="
 set hive.exec.dynamic.partition.mode=nonstrict;

insert overwrite table "$APP".ads_uv_count
select '$log_date' dt,
       daycount.ct,
       wkcount.ct,
       mncount.ct,
       if(date_add(next_day('$log_date', 'MO'), -1) = '$log_date', 'Y', 'N'),
       if(last_day('$log_date') = '$log_date', 'Y', 'N')
from (
         select '$log_date' dt,
                count(*)     ct
         from "$APP".dws_uv_detail_day
         where dt = '$log_date'
     ) daycount
         join
     (
         select '$log_date' dt,
                count(*)     ct
         from "$APP".dws_uv_detail_wk
         where wk_dt =
               concat(date_add(next_day('$log_date', 'MO'), -7), '_', date_add(next_day('$log_date', 'MO'), -1))
     ) wkcount on daycount.dt = wkcount.dt
         join
     (
         select '$log_date' dt,
                count(*)     ct
         from "$APP".dws_uv_detail_mn
         where mn = date_format('$log_date', 'yyyy-MM')
     ) mncount on daycount.dt = mncount.dt
;
"

$hive -e "$sql"

4.2 New Users

4.2.1 Daily New Device Count

drop table if exists `ads_new_mid_count`;
create table `ads_new_mid_count`
(
    `create_date`   string comment 'creation date',
    `new_mid_count` BIGINT comment 'number of new devices'
) COMMENT 'daily new device counts'
    row format delimited fields terminated by '\t'
    location '/warehouse/gmall/ads/ads_new_mid_count/';
insert into table ads_new_mid_count
select create_date, count(*)
from dws_new_mid_day
where create_date = '2019-12-19'
group by create_date;

4.2.2 Script

#!/bin/bash

# Variables, defined here for easy modification
APP=gmall
hive=/opt/module/hive/bin/hive

# If a date argument is given, use it; otherwise default to yesterday.
# "$1" must be quoted: [ -n $1 ] is always true when no argument is passed.
if [ -n "$1" ]; then
	log_date=$1
else
	log_date=`date -d "-1 day" +%F`
fi


sql="
 set hive.exec.dynamic.partition.mode=nonstrict;

insert into table "$APP".ads_new_mid_count
select create_date, count(*)
from "$APP".dws_new_mid_day
where create_date = '$log_date'
group by create_date;
"

$hive -e "$sql"

4.3 User Retention

4.3.1 Retained User Counts

drop table if exists `ads_user_retention_day_count`;
create table `ads_user_retention_day_count`
(
    `create_date`   string comment 'device creation date',
    `retention_day` int comment 'days retained as of the current date',
    retention_count bigint comment 'number of retained devices'
) COMMENT 'daily user retention counts'
    stored as parquet
    location '/warehouse/gmall/ads/ads_user_retention_day_count/';
insert into table ads_user_retention_day_count
select create_date,
       retention_day,
       count(*) retention_count
from dws_user_retention_day
where dt = '2019-12-19'
group by create_date, retention_day;

4.3.2 Retention Rate

drop table if exists `ads_user_retention_day_rate`;
create table `ads_user_retention_day_rate`
(
    `stat_date`       string comment 'statistics date',
    `create_date`     string comment 'device creation date',
    `retention_day`   int comment 'days retained as of the current date',
    `retention_count` bigint comment 'number of retained devices',
    `new_mid_count`   string comment 'new devices on that creation date',
    `retention_ratio` decimal(10, 2) comment 'retention rate (%)'
) COMMENT 'daily user retention rates'
    stored as parquet
    location '/warehouse/gmall/ads/ads_user_retention_day_rate/';
insert into table ads_user_retention_day_rate
select '2019-12-19',
       ur.create_date,
       ur.retention_day,
       ur.retention_count,
       nc.new_mid_count,
       ur.retention_count / nc.new_mid_count * 100
from (
         select create_date,
                retention_day,
                count(*) retention_count
         from `dws_user_retention_day`
         where dt = '2019-12-19'
         group by create_date, retention_day
     ) ur
         join ads_new_mid_count nc on nc.create_date = ur.create_date;

4.3.3 Script

#!/bin/bash

# Variables, defined here for easy modification
APP=gmall
hive=/opt/module/hive/bin/hive

# If a date argument is given, use it; otherwise default to yesterday.
# "$1" must be quoted: [ -n $1 ] is always true when no argument is passed.
if [ -n "$1" ]; then
	log_date=$1
else
	log_date=`date -d "-1 day" +%F`
fi


sql="
 set hive.exec.dynamic.partition.mode=nonstrict;

insert into table "$APP".ads_user_retention_day_count
select create_date,
       retention_day,
       count(*) retention_count
from "$APP".dws_user_retention_day
where dt = '$log_date'
group by create_date, retention_day;

insert into table "$APP".ads_user_retention_day_rate
select '$log_date',
       ur.create_date,
       ur.retention_day,
       ur.retention_count,
       nc.new_mid_count,
       ur.retention_count / nc.new_mid_count * 100
from (
         select create_date,
                retention_day,
                count(*) retention_count
         from "$APP".dws_user_retention_day
         where dt = '$log_date'
         group by create_date, retention_day
     ) ur
         join "$APP".ads_new_mid_count nc on nc.create_date = ur.create_date;
"

$hive -e "$sql"
