hive常用函数


判空

nvl

select nvl(salary,0) from emp;

coalesce

coalesce(a,b,c,’ddd’)
解析:

// scala
if(a != null){
	a
}else if(b != null ){
	b
}else if(c != null){
	c
}else 'ddd'

对集合的操作

将String 转换成Map

str_to_map()

str_to_map(text,delimited1,delimited2)
-- 解析:
-- delimited1是kv kv kv 之间的分隔符
-- delimited2是k与v之间的分隔符

判断集合中是否包含某元素

array_contains()

-- 这是数仓项目中,对于DWS层访客用户表的装载语句中的一个条件判断
-- 判断是否为新增访客
-- 将is_new字段放入set中去重,然后判断是否存在'0',如果存在则返回true,否者返回false
if(array_contains(collect_set(is_new),'0'),'0','1')

将数据放入结构体中

named_struct

with t1 as
         (select mid_id,
                 page_id,
                 count(*)         page_count,
                 sum(during_time) during_time
          from dwd_page_log
          where dt = '2020-06-14'
          group by mid_id, page_id)
select mid_id,
       collect_set(named_struct('page_id', page_id, 'page_count', page_count, 'during_time', during_time))
from t1;

时间函数

  • select date_format(‘2017-3-4’,’yyyy-mm-dd HH:mm:ss’);
  • select date_add(‘2017-6-29’,4);
    terminal: 2017-7-3
  • select date_add(‘2017-6-27’,-4);
    terminal: 2017-6-23
  • select datediff(‘2017-7-3’,’2017-6-29’);
    terminal: 4
  • select datediff(‘2017-6-29’,’2017-7-3’);
    terminal: -4
  • select next_day(‘2020-06-14’,’MO’)
    返回下一周周一的日期
  • select date_add(next_day(‘2020-06-14’,’MO’),-7);
    当前周周一的日期
  • select regexp_replace(‘2019/07/03’,’/‘,’-‘);
    terminal: 2019-07-03

条件判断

case语句

case when then else end

-- 格式:
case
 when finish_time is not null and refund_time is not null and date_add(date_format(finish_time,'yyyy-MM-dd'),7) <= '2020-6-14' then date_format(finish_time,'yyyy-MM-dd')
 when refund_finish_time is not null then date_format(refund_finish_time,'yyyy-MM-dd')
 when cancel_time is not null then date_format(cancel_time,'yyyy-MM-dd')
 when expire_time is not null then date_format(expire_time,'yyyy-MM-dd')
 else '9999-99-99'
end;

select sum(case sex when '男'  then 1 else 0 end) male_count,
sum(case sex when '女' then 1 else 0 end) female_count
from emp group by deptno order by deptno;

if语句

-- 这是数仓项目中,对于DWS层访客用户表的装载语句中的一个条件判断
-- 判断是否为新增访客
-- 将is_new字段放入set中去重,然后判断是否存在'0',如果存在则返回true,否者返回false
-- 然后放入if中进行判断给与相应返回值
if(array_contains(collect_set(is_new),'0'),'0','1')

with as

with
tmp1 as
(
 select * from t1
)
tmp2 as
(
select * from t2
)
tmp3 as
(
select * from t3
)
insert overwirte table t4
select *
from tmp1
left join tmp2
on tmp1.id = tmp2.id
left join tmp3
on coalesce(tmp1.id,tmp2.id) = tmp3.id
-- on nvl(tmp1.id,tmp2.id) = tmp3.id

concat//行转列

第一步 t1

select concat_sw(',',xz,xx) xz_xx
from 
emp;

第二步t2

select 
t1.xz_xx, concat_sw('|',collect_set(t1.name))
from t1
group by t1.xz_xx;

第三步:整合

select 
t1.xz_xx, concat_sw('|',collect_set(t1.name))
from
(
select concat_sw(',',xz,xx) xz_xx
from 
emp;
)    t1
group by t1.xz_xx;

列转行

explode 将hive中复杂的array 或 map 拆分

example

create table if not exists movie_info(
movie_name string,
category array<string>
)
row format delimited fields terminated by '\t'
collection items terminated by ','

load data local inpath '/opt/module/datas/movie.txt' into table into movie_info;

select 
movie,
category_name
from
movie_info lateral view explode(category) table_tmp as category_name;


select 
movie_name,
category_name
from
movie_info cateral view explode(category) table_tmp as catgory_name;

窗口函数

over():

指定分析函数工作窗口大小,随着行变化

current row

n preceding: 向前移动n行

n following: 向后移动N行

unbounded 启点

unbounded preceding
unbounded following

lag(col,n);

lead(col,n);

ntile(n) 把有序分区的行分发到指定数据的组中,各个组有编号,编号从1开始。对于每一个ntile返回此行所属的组的编号。

create table if not exists business(
name string,
order_date date,
cost double
)
row format delimited fields terminated by ','

load data local inpath '/opt/module/data/business'
into table business;

查询2017-04购买过商品的顾客及总人数

select name,count(*) over()
from business
where substring(orderdate,1,7) = '2017-04'
group by name;

查询顾客购买明细,及月购买总额。

select *,sum(cost) over(partitioned by month(order_date))
from
business;

文章作者: tzkTangXS
版权声明: 本博客所有文章除特別声明外,均采用 CC BY 4.0 许可协议。转载请注明来源 tzkTangXS !
  目录