流程图：

hive查询语法：

1.基本函数：

2.条件查询

3.join关联查询

准备数据
a.txt
name nmb
a,1
b,2
c,3
d,4

b.txt
name nick
a,xx
b,yy
d,zz
e,pp

创建表：

create table t_a(name string,nmb int)
row format delimited
fields terminated by ',';

create table t_b(name string,nick string)
row format delimited
fields terminated by ',';

导入数据：
load data local inpath '/root/a.txt' into table t_a;
load data local inpath '/root/b.txt' into table t_b;

各类join
1.【内连接】
--笛卡尔积：
select a.*,b.*
from t_a a inner join t_b b; -- the `inner` keyword is optional; the select reads from the joined tables
左表的每一条与右表的每一条都进行连接
表连接就是把所有的表进行关联，如果没有告知如何连接，那么就会全部进行关联

--指定join条件
select a.*,b.*
from t_a a join t_b b on a.name=b.name;

2.左外连接（左连接）

select a.*,b.*
from t_a a left outer join t_b b on a.name=b.name;
左表中的数据全部都在查询结果中，但是右边没有符合左表那一条记录，就会出现null

3.右外连接（右连接）
select a.*,b.*
from t_a a right outer join t_b b on a.name=b.name ;
右表中数据全部存在，右表中的记录在左表中没有对应的条件，则右表中对应左表那一行会出现null

4.全外连接
-- Full outer join of t_a and t_b on name.
select a.*,b.*
from t_a a full outer join t_b b on a.name=b.name;
左表与右表中的数据根据on 条件全部保留，互相无法对应的使用null

5.左半连接（mysql中没有）
查询数据为左表，不能出现右表的数据，因此查询结果中去掉b
select a.*
from t_a a left semi join t_b b on a.name=b.name;

5.group by 分组聚合

--针对每一行进行运算
select ip,upper(url),access_time
from t_pv_log;

upper(url) 转换url为大写

--求每条url访问的总次数分组查询
select url,count(1) as cnt
from t_pv_log
group by url;

--求每个url对应ip的最大ip是多少
select url,max(ip)
from t_pv_log
group by url;

--求每个用户访问同一页面的所有记录中，时间最晚的一条
--出现两个判断条件时候，按两个条件进行分组查询
select url,ip,max(access_time)
from t_pv_log
group by url,ip;

--分组聚合函数综合示例：

--1）建表映射上述数据
create table t_access(ip string,url string,access_time string)
partitioned by (dt string) row format delimited fields terminated by ',';

--2)创建数据

--3）导入数据：
load data local inpath '/root/access.log.0101' into table t_access partition (dt='2017-1-1');
load data local inpath '/root/access.log.0102' into table t_access partition (dt='2017-1-2');
load data local inpath '/root/access.log.0103' into table t_access partition (dt='2017-1-3');

--4)查看表分区
show partitions t_access;

--5)求1号以后，每天访问xxx页面的总访问次数，及访问者中ip地址最大的

select dt,count(1),max(ip)
from t_pv_log
where url='http://www.baidu.com' --where是对表进行条件筛选
group by dt
having dt>'2017-1-1'; --having是对分组后进行过滤

--为了显示url，可以group by后面添加url

select dt,url,count(1),max(ip)
from t_pv_log
where url='http://www.baidu.com' -- where filters the table rows before grouping
group by dt,url
having dt>'2017-1-1'; -- having filters the groups after aggregation

--或者添加一个聚合函数
select dt,max(url),count(1),max(ip)
from t_pv_log
where url='http://www.baidu.com' --where是对表进行条件筛选
group by dt
having dt>'2017-1-1'; --having是对分组后进行过滤

6.子查询

--运行错误提示
error 错误
while 当
compiling 编译过程中
statement 语句
failed 失败的
semantic 语义
exception 异常
line 行
table 表
not found 找不到

tiny 微小的

char 字符

Loading 加载

data 数据

from 从

file 文件

fire 开火，发射

recognize 识别

<EOF>没有正确结束

processing 运行时

execution 执行

准备数据

--求1号以后，每天每个页面的总访问次数，及访问者中ip地址最大的，且只查询出访问次数>2的记录
--方式1：
select dt,url,count(1) as cnts,max(ip)
from t_pv_log
where dt>'2017-1-1' -- no ';' here: the statement continues with group by / having
group by dt,url
having cnts>2;

--使用子查询
--方式2：
-- The inner query aggregates per (dt,url); the outer query filters on the aggregated count.
select dt,url,cnts,max_ip
from
(
select dt,url,count(1) as cnts,max(ip) as max_ip
from t_pv_log
where dt>'2017-1-1' -- no ';' inside the subquery
group by dt,url
) tmp
where tmp.cnts>2;

-------------------------------------------------

--hive中的数据类型
--1.数字类型
tinyint
int
bigint
float
double
smallint

--2.日期类型
timestamp 时间戳
date 日期

--3.字符串类型
string

--4.混杂类型
boolean
binary

-------------------------------------

--5.复合类型

5.1 array数组类型

--5.复合类型
--1.array数组类型

数据：
战狼2,吴京:吴刚:龙母,2017-9-8

create table t_movie(movie_name string,actors array<string>,first_show date)
row format delimited fields terminated by ','
collection items terminated by ':' -- separator between the elements of the actors array
;

--选择数组中某个值
select movie_name,actors[0],first_show from t_movie; -- index the array column by its name

--需求：查询电影中包含吴刚的电影

select movie_name,actors,first_show
from t_movie where array_contains(actors,'吴刚')
;

--查询结果

--需求：每部电影有多少主演

--主要为求数组的长度

select movie_name,actors,first_show,size(actors) as actors_numb
from t_movie;

5.2 map类型

建表映射：

create table t_family(id int,name string,family_members map<string,string>,age int)
row format delimited fields terminated by ','
collection items terminated by '#' -- separator between map entries
map keys terminated by ':'; -- separator between a key and its value

导入数据

load data local inpath '/root/xxx.txt' into table t_family;

查询语句

--查询出每个人的父亲

select id,name,family_members["father"],age
from t_family;

--查询出每个人的父亲姐妹
select id,name,family_members["father"],family_members["sisters"],age
from t_family;

没有姐妹的查询结果为null

--查询每个人的亲属关系
--即查询map的key
select id,name,age,map_keys(family_members) as relations
from t_family;

--查出每个人亲人的名字
--即查询map的value
select id,name,age,map_values(family_members) as relations
from t_family;

--查询出每个人亲属的数量
select id,name,age,size(family_members)
from t_family;

--查出有兄弟的人及其兄弟的名字
--函数嵌套函数
--方式一：一句话写完

-- NOTE(review): the key 'bother' is probably meant to be 'brother'; confirm against the data file.
select id,name,age,family_members['bother']
from t_family where array_contains(map_keys(family_members),'bother');

--方式二：子查询

-- The inner query must project the brother value itself, because the outer query
-- can only see the columns the subquery exposes.
-- NOTE(review): the key 'bother' is probably meant to be 'brother'; confirm against the data file.
select id,name,brother
from
(
select id,name,age,family_members['bother'] as brother,map_keys(family_members) as relation
from t_family) tmp where array_contains(relation,'bother');

hive数据类型--struct

drop table if exists t_user; -- "if exists" is optional
create table t_user(id int,name string,info struct<age:int,sex:string,addr:string>)
row format delimited fields terminated by ','
collection items terminated by ':' -- separator between the struct fields
;

--查询每个人的id，name，和addr

select id,name,info.addr

from t_user;

hive 函数

学习方法：可以使用一个常量来进行测试

select substr("abcdef",0,3) ;

对于hive的函数，可以查看hive函数手册

截取substr()函数

select ip,substr(url,1,21) from t_access;

--类型转换函数

把一种类型转换成另一种类型，相当于java中的强转

select cast("8"as int);

select cast("2019-1-8" as date);

select current_timestamp;--是一个时间常量
转换成日期
select cast(current_timestamp as date);--结果为2017-5-9

================================================================

create table t_full(id int,birthday string,salary string)
row format delimited fields terminated by ',';

--把生日转换为日期
--并且重新创建一个表改变其数据类型
create table t_full_new 
as
select id,cast(birthday as date) as bir,cast(salary as float) as salary from t_full;

--数学运算符

round 四舍五入

ceil或者ceiling 向上取整

floor 向下取整

abs 取绝对值

greatest 获取最大值

least 获取最小值

select round(3.5); --结果为4
select round(6.3); --结果为6
select round(5.1356,3);--结果为5.136   保留位数的四舍五入

select ceil(2.6);--结果为3
select floor(5.1);--结果为5
select abs(-2.1);--结果为2.1
select greatest(5,7,9);--结果为9
select least(6,8,7);--结果为6

==================================================

嵌套函数，先把薪资类型转为double，然后使用greatest()函数求出最大值
select greatest(cast(s1 as double),cast(s2 as double),cast(s3 as double))from t_full;

greatest()函数为某一行的最大值
max()为求多行中的最大值，聚合函数
min()

select max(age) from t1 group by sex; -- group by the column; a quoted 'sex' would be a constant string

--字符串函数

--截取
select substr('abacdef',2);-- result: bacdef
select substr('abacdef',1,3);-- result: aba

--拼接
select concat('ab','12');-- result: ab12

--按分隔符拼接
select concat_ws(".",'192','168','46');-- result: 192.168.46

create table t_employ_ip(ip_seg1 string,ip_seg2 string,ip_seg3 string,ip_seg4 string,name string,sex string)
row format delimited fields terminated by ',';

--把ip合并后查询结果
select concat_ws(".",ip_seg1,ip_seg2,ip_seg3,ip_seg4),name,sex from t_employ_ip;

--求字符串的长度
select length('192.169.46.21');--结果为13

--切分split

--切割 用的比较多 返回一个数组
select split('张三:18:beijing',':');-- returns an array of strings
select split('张三:18:beijing',':')[1];-- second element of the split array, the age: 18
select split('192.168.46.21','\\.');--点为正则表达式中的特殊字符，\也是特殊含义，因此需要两个\

--转换大写 upper()
select upper('abc');--结果为ABC

--时间函数

--unix时间戳转换为字符串
取当前时间的秒数时间戳
select unix_timestamp();

select from_unixtime(unix_timestamp(),'yyyy-MM-dd HH:mm:ss');--156881133转换为2018-5-3 12:06:08

--字符串转换为unix时间戳
select unix_timestamp('2017/08/19 15:02:23','yyyy/MM/dd HH:mm:ss');-- returns seconds since the Unix epoch

--将字符串转换为日期date
select to_date('2018-05-25 15:23:58');--无需加时间格式，转换为2018-05-25

--行转列

-- Produces one output row per element of the exploded column.
-- NOTE(review): column name corrected from "sujects" to "subjects"; confirm against the t_stu_subject schema.
select id,name,tmp.sub
from t_stu_subject
lateral view
explode(subjects) tmp as sub;

select word ,count(1)
from
(select explode(split(sentence,' ')) as word from t_wc
) tmp
group by word;

来源：oschina

链接：https://my.oschina.net/u/4434424/blog/4311730

标签

Hive

BFE

Semantic

A2D

java

mysql

hive函数和HQL-《小牛学堂》

流程图：

--5.复合类型

5.1 array数组类型

5.2 map类型

hive 函数