絮絮叨叨
笔者常见的数据开发中,发现如果脚本需要产生中间表,或者说想要提升脚本性能,把这段中间表逻辑变为子查询,在人肉堆SQL生涯中,不外乎两种办法:
-
CREATE TABLE tmp.tmpxxxxx AS
优点:可以落物理表,验数时可追溯源头;
缺点:多一次落盘操作,说白了就是多一次IO,会造成大量磁盘和网络开销
-
CACHE TABLE tmpxxxxx AS
优点:中间数据广播到每个节点,加快下次调用中间表读取速度
缺点:中间数据不可查,如果下游计算只调用一次,cache操作多一个stage浪费计算资源
讲重点-WITH是什么?
WITH AS短语,也叫做子查询部分(subquery factoring),可以定义一个SQL片断,该SQL片断会被整个SQL语句用到。可以使SQL语句的可读性更高,也可以在UNION ALL的不同部分,作为提供数据的部分。
对于UNION ALL,使用WITH AS定义了一个UNION ALL语句,当该片断被调用2次以上,优化器会自动将该WITH AS短语所获取的数据放入一个Temp表中。而提示materialize则是强制将WITH AS短语的数据放入一个全局临时表中。很多查询通过该方式都可以提高速度
WITH有什么用?
提供一个子查询,供整个SQL调用,同时便于整个脚本维护
WITH 使用场景及用法
WITH用法
WITH a AS(), b AS(), c AS () SELECT * FROM a,b,c;
1. WITH后面必须直接跟使用WITH的SQL语句(如select、insert、update等),否则,WITH将失效
即使用WITH 时,SQL不能出现分号,即如果使用WITH,在调用WITH生成子查询之前,均不能出现分号对SQL进行逻辑隔断
举个例子:
WITH a AS () DROP TABLE xxx; SELECT * FROM a;
以上这段代码在drop table xxx部分出现分号,即对整段SQL进行隔断操作,对WITH来说,如遇到隔断,WITH临时存储子查询将会失效
2. 如果WITH的表达式名称与某个数据表或视图重名,则紧跟在该WITH后面的那一条SQL语句使用的仍然是WITH定义的公共表表达式;而再往后的SQL语句使用的就是同名的数据表或视图了
`-- table1 is a physical table that already exists.
-- The CTE below deliberately reuses its name to show the shadowing rule.
WITH table1 AS (
    SELECT *
    FROM persons
    WHERE age < 30
)
SELECT * FROM table1;  -- resolves to the CTE named table1
SELECT * FROM table1;  -- the CTE is out of scope here: resolves to the physical table table1`
自引用 WITH 可以引用自身,也可以引用在同一 WITH 子句中预先定义的 WITH。不允许前向引用。
WITH 注意事项
不可以在WITH中使用以下语句
- COMPUTE 或 COMPUTE BY
- ORDER BY(除非指定了 TOP 子句)
- INTO
- 带有查询提示的 OPTION 子句
- FOR XML
- FOR BROWSE
如果将 WITH用在属于批处理的一部分的语句中,那么在它之前的语句必须以分号结尾,如下面的SQL所示:
DROP TABLE xxx; WITH a AS () SELECT * FROM a;
WITH优化案例
代码来源于灵犀项目-老代码子查询均用CACHE处理,优化后改为WITH
`-- Session setting carried over from the original job.
-- NOTE(review): presumably asks the engine to merge small output files; confirm for your Spark distribution.
SET spark.sql.hive.mergeFiles=true;

-- Drop the target partition for the run date so the load can be re-run.
-- {dyncobj.*} are template placeholders; presumably substituted by the scheduler before execution.
ALTER TABLE {dyncobj.targetdb}.{dyncobj.targetable}
    DROP IF EXISTS PARTITION (dt = '{dyncobj.dimdt}');
-- Click PV / UV per message dimension, rolled up at five grouping levels.
-- lvl is the raw GROUPING_ID() bitmask (15, 7, 3, 1 or 2, see GROUPING SETS below).
WITH tmp_table_click_message_box AS (
    SELECT
        event_id
        ,messagebox_id
        ,message_type_id
        ,message_id
        ,task_id
        ,GROUPING_ID() AS lvl
        ,COUNT(DISTINCT user_log_acct) AS click_uv
        ,COUNT(1) AS click_pv
    FROM
        (
            -- Derive the message dimensions once so GROUP BY / GROUPING SETS can
            -- reference plain column names instead of repeating each CASE expression.
            SELECT
                event_id
                ,user_log_acct
                -- For THA_MyMessage_MessageBox the whole event_param is the box id;
                -- for the other events event_param is split on '_' into four parts.
                -- NOTE(review): the delimiter '_' is restored from the one place the
                -- source text still shows it; confirm against the production script.
                ,CASE WHEN event_id = 'THA_MyMessage_MessageBox' THEN event_param ELSE SPLIT(event_param, '_')[0] END AS messagebox_id
                ,CASE WHEN event_id = 'THA_MyMessage_MessageBox' THEN NULL ELSE SPLIT(event_param, '_')[1] END AS message_type_id
                ,CASE WHEN event_id = 'THA_MyMessage_MessageBox' THEN NULL ELSE SPLIT(event_param, '_')[2] END AS message_id
                ,CASE WHEN event_id = 'THA_MyMessage_MessageBox' THEN NULL ELSE SPLIT(event_param, '_')[3] END AS task_id
            FROM
                adm.adm_s14_glb_mtu_click_di
            WHERE
                dt = '{dyncobj.dimdt}'
                AND bs = 'APP'
                -- Version strings have their dots stripped and are compared with 2140.
                -- NOTE(review): relies on implicit string-to-number coercion.
                AND regexp_replace(app_version, '\\\.', '') >= 2140
                AND regexp_replace(build_version, '\\\.', '') >= 2140
                AND event_id IN ('THA_MyMessage_MessageBox', 'THA_MessageCenter_Message', 'THA_PushMessage_OpenMessage')
        ) a
    GROUP BY
        event_id            -- bit 16
        ,messagebox_id      -- bit 8
        ,message_type_id    -- bit 4
        ,message_id         -- bit 2
        ,task_id            -- bit 1
    GROUPING SETS (
        (event_id),                                               -- 01111 = 15
        (event_id, messagebox_id),                                -- 00111 = 7
        (event_id, messagebox_id, message_type_id),               -- 00011 = 3
        (event_id, messagebox_id, message_type_id, message_id),   -- 00001 = 1
        (event_id, messagebox_id, message_type_id, task_id)       -- 00010 = 2
    )
),
-- Order counts and amounts attributed to message events, at the same five grouping levels.
tmp_table_click_message_box_ord AS (
    SELECT
        -- The five grouping columns come from subquery a.
        event_id
        ,messagebox_id
        ,message_type_id
        ,message_id
        ,task_id
        ,GROUPING_ID() AS lvl
        -- Distinct parent orders per gmv_type.
        ,COUNT(DISTINCT CASE WHEN b.gmv_type = 'order' THEN b.parent_sale_ord_id ELSE NULL END) AS parent_ord_nums
        ,COUNT(DISTINCT CASE WHEN b.gmv_type = 'paid' THEN b.parent_sale_ord_id ELSE NULL END) AS parent_paid_nums
        ,COUNT(DISTINCT CASE WHEN b.gmv_type = 'order_paid' THEN b.parent_sale_ord_id ELSE NULL END) AS parent_gmv_amount_nums
        -- Child order rows per gmv_type (not de-duplicated).
        ,COUNT(CASE WHEN b.gmv_type = 'order' THEN b.sale_ord_id ELSE NULL END) AS son_ord_nums
        ,COUNT(CASE WHEN b.gmv_type = 'paid' THEN b.sale_ord_id ELSE NULL END) AS son_paid_nums
        ,COUNT(CASE WHEN b.gmv_type = 'order_paid' THEN b.sale_ord_id ELSE NULL END) AS son_gmv_amount_nums
        -- Amounts per gmv_type.
        ,SUM(CASE WHEN b.gmv_type = 'order' THEN b.gmv_amount ELSE 0 END) AS order_amount
        ,SUM(CASE WHEN b.gmv_type = 'paid' THEN b.gmv_amount ELSE 0 END) AS paid_amount
        ,SUM(CASE WHEN b.gmv_type = 'order_paid' THEN b.gmv_amount ELSE 0 END) AS order_paid_amount
        -- Distinct accounts that have a 'valid' order row.
        ,COUNT(DISTINCT CASE WHEN b.gmv_type = 'valid' THEN b.user_log_acct ELSE NULL END) AS valid_order_uv
    FROM
        (
            -- Message events with their dimensions; sale_ord_id of this table is
            -- exposed as parent_sale_ord_id and used as the join key to orders.
            SELECT
                event_id
                -- NOTE(review): the delimiter '_' is restored (the source text shows an
                -- empty delimiter, presumably stripped by markdown); confirm.
                ,CASE WHEN event_id = 'THA_MyMessage_MessageBox' THEN event_param ELSE SPLIT(event_param, '_')[0] END AS messagebox_id
                ,CASE WHEN event_id = 'THA_MyMessage_MessageBox' THEN NULL ELSE SPLIT(event_param, '_')[1] END AS message_type_id
                ,CASE WHEN event_id = 'THA_MyMessage_MessageBox' THEN NULL ELSE SPLIT(event_param, '_')[2] END AS message_id
                ,CASE WHEN event_id = 'THA_MyMessage_MessageBox' THEN NULL ELSE SPLIT(event_param, '_')[3] END AS task_id
                ,sale_ord_id AS parent_sale_ord_id
            FROM
                gdm.gdm_m14_thai_sku_all_opt_app
            WHERE
                dt = '{dyncobj.dimdt}'
                -- NOTE(review): relies on implicit string-to-number coercion.
                AND regexp_replace(app_version, '\\\.', '') >= 2140
                AND regexp_replace(build_version, '\\\.', '') >= 2140
                AND event_id IN ('THA_MyMessage_MessageBox', 'THA_MessageCenter_Message', 'THA_PushMessage_OpenMessage')
        ) a
    LEFT JOIN
        (
            -- Order amounts summed per (child order, parent order, gmv_type, account).
            SELECT
                sale_ord_id
                ,parent_sale_ord_id
                ,user_log_acct
                ,SUM(gmv_amount) AS gmv_amount
                ,gmv_type
            FROM
                adm.adm_s04_glb_trade_ord_det_sum
            WHERE
                dt = '{dyncobj.dimdt}'
                AND gmv_type IN ('order', 'paid', 'order_paid', 'valid')
            GROUP BY
                sale_ord_id
                ,parent_sale_ord_id
                ,gmv_type
                ,user_log_acct
        ) b
        ON a.parent_sale_ord_id = b.parent_sale_ord_id
    GROUP BY
        event_id
        ,messagebox_id
        ,message_type_id
        ,message_id
        ,task_id
    GROUPING SETS (
        (event_id),
        (event_id, messagebox_id),
        (event_id, messagebox_id, message_type_id),
        (event_id, messagebox_id, message_type_id, message_id),
        (event_id, messagebox_id, message_type_id, task_id)
    )
),
-- New-user UV: accounts registered on the run date that have a 'valid' order
-- linked to a message event, at the same five grouping levels.
tmp_table_click_message_box_new_users AS (
    SELECT
        event_id
        ,messagebox_id
        ,message_type_id
        ,message_id
        ,task_id
        ,GROUPING_ID() AS lvl
        ,COUNT(DISTINCT b.user_log_acct) AS new_uv
    FROM
        (
            -- Message events with their dimensions; sale_ord_id of this table is
            -- exposed as parent_sale_ord_id and used as the join key to orders.
            SELECT
                event_id
                -- NOTE(review): the delimiter '_' is restored (the source text shows an
                -- empty delimiter, presumably stripped by markdown); confirm.
                ,CASE WHEN event_id = 'THA_MyMessage_MessageBox' THEN event_param ELSE SPLIT(event_param, '_')[0] END AS messagebox_id
                ,CASE WHEN event_id = 'THA_MyMessage_MessageBox' THEN NULL ELSE SPLIT(event_param, '_')[1] END AS message_type_id
                ,CASE WHEN event_id = 'THA_MyMessage_MessageBox' THEN NULL ELSE SPLIT(event_param, '_')[2] END AS message_id
                ,CASE WHEN event_id = 'THA_MyMessage_MessageBox' THEN NULL ELSE SPLIT(event_param, '_')[3] END AS task_id
                ,sale_ord_id AS parent_sale_ord_id
            FROM
                gdm.gdm_m14_thai_sku_all_opt_app
            WHERE
                dt = '{dyncobj.dimdt}'
                -- NOTE(review): relies on implicit string-to-number coercion.
                AND regexp_replace(app_version, '\\\.', '') >= 2140
                AND regexp_replace(build_version, '\\\.', '') >= 2140
                AND event_id IN ('THA_MyMessage_MessageBox', 'THA_MessageCenter_Message', 'THA_PushMessage_OpenMessage')
        ) a
    -- Written as INNER JOIN: the join to c below is keyed on b.user_log_acct, so
    -- event rows without a matching order can never survive it anyway.
    INNER JOIN
        (
            -- Accounts with a 'valid' order row on the run date.
            SELECT
                parent_sale_ord_id
                ,user_log_acct
            FROM
                adm.adm_s04_glb_trade_ord_det_sum
            WHERE
                dt = '{dyncobj.dimdt}'
                AND gmv_type = 'valid'
        ) b
        ON a.parent_sale_ord_id = b.parent_sale_ord_id
    INNER JOIN
        (
            -- Accounts whose registration date (first 10 chars of user_reg_tm) is the run date.
            SELECT
                user_log_acct
            FROM
                gdm.gdm_m01_glb_userinfo_basic_da
            WHERE
                dt = '{dyncobj.dimdt}'
                AND SUBSTR(user_reg_tm, 1, 10) = '{dyncobj.dimdt}'
        ) c
        -- Account names are matched case-insensitively and ignoring surrounding whitespace.
        ON TRIM(LOWER(b.user_log_acct)) = TRIM(LOWER(c.user_log_acct))
    GROUP BY
        event_id
        ,messagebox_id
        ,message_type_id
        ,message_id
        ,task_id
    GROUPING SETS (
        (event_id),
        (event_id, messagebox_id),
        (event_id, messagebox_id, message_type_id),
        (event_id, messagebox_id, message_type_id, message_id),
        (event_id, messagebox_id, message_type_id, task_id)
    )
)
-- Write the run-date partition: click metrics (a) enriched with order metrics (b)
-- and new-user UV (c), one row per dimension combination and grouping level.
INSERT OVERWRITE TABLE {dyncobj.targetdb}.{dyncobj.targetable} PARTITION (dt = '{dyncobj.dimdt}')
SELECT
    a.event_id
    ,a.messagebox_id AS messagebox_id
    -- Display name for the known box ids; anything else stays NULL.
    ,CASE WHEN a.messagebox_id = '1' THEN '物流盒子'
          WHEN a.messagebox_id = '2' THEN '营销盒子'
          WHEN a.messagebox_id = '3' THEN '通知盒子'
          WHEN a.messagebox_id = '4' THEN '咚咚盒子'
          ELSE NULL END AS messagebox_type
    ,a.message_type_id
    ,a.message_id
    ,a.task_id
    ,a.click_uv
    ,a.click_pv
    ,b.parent_ord_nums
    ,b.parent_paid_nums
    ,b.parent_gmv_amount_nums
    ,b.son_ord_nums
    ,b.son_paid_nums
    ,b.son_gmv_amount_nums
    ,b.order_amount
    ,b.paid_amount
    ,b.order_paid_amount
    ,b.valid_order_uv
    ,c.new_uv
    -- Translate the GROUPING_ID() bitmask into a 1..5 level number.
    ,CASE WHEN a.lvl = 15 THEN 1   -- event_id
          WHEN a.lvl = 7 THEN 2    -- + messagebox_id
          WHEN a.lvl = 3 THEN 3    -- + message_type_id
          WHEN a.lvl = 1 THEN 4    -- + message_id
          WHEN a.lvl = 2 THEN 5    -- + task_id (without message_id)
          ELSE NULL END AS lvl
FROM
    tmp_table_click_message_box a
LEFT JOIN tmp_table_click_message_box_ord b
    -- Rolled-up dimensions are NULL, so the keys are compared with null-safe
    -- equality (<=>); lvl keeps rows of different grouping levels apart.
    ON a.event_id = b.event_id
    AND a.messagebox_id <=> b.messagebox_id
    AND a.message_type_id <=> b.message_type_id
    AND a.message_id <=> b.message_id
    AND a.task_id <=> b.task_id
    AND a.lvl = b.lvl
LEFT JOIN tmp_table_click_message_box_new_users c
    ON a.event_id = c.event_id
    AND a.messagebox_id <=> c.messagebox_id
    AND a.message_type_id <=> c.message_type_id
    AND a.message_id <=> c.message_id
    AND a.task_id <=> c.task_id
    -- Without the level match, rows whose keys are NULL at several levels
    -- would match each other and duplicate output rows.
    AND a.lvl = c.lvl;
`
代码不讲了,自己看
最后的最后
- 少落临时表,费时费力浪费IO
- 不要滥用CACHE,当然CACHE还是个好东西,注意CACHE打开方式
- WITH可弥补CACHE性能上的使用场景
- 还是那句话,SQL不是垒出来的!
来源:CSDN
作者:Cold丶kl
链接:https://blog.csdn.net/weixin_42792621/article/details/104451026