# 函数
# 一、常用函数
# 生成随机数
SELECT regexp_replace(reflect('java.util.UUID','randomUUID'), '-', '') as row_num
# COALESCE 返回第一个非空值
COALESCE(t2.category,1) category
# 数据保护 使用translate函数
select user_id,birthday,translate(birthday,'0123456789','1234567890') from d_users;
# 显示当前时间
select from_unixtime(unix_timestamp())
# 时间转换
regexp_replace(from_unixtime(UNIX_TIMESTAMP(paydate,'MMM/dd/yyyy hh:mm:ss a')),'-','/')
# 二、窗口函数
- Lead Lead的行数可选,若没有指定则为1行.如果当前行的下一行超出窗口结尾边界则返回Null.
- Lag Lag的行数可选,若没有指定则为1行.如果当前行的前一行已经超出窗口开始的边界则返回NULL.
- FIRST_VALUE
- LAST_VALUE
- COUNT,SUM,MIN,MAX,AVG 可用于Over字句的聚合函数
- RANK,ROW_NUMBER,DENSE_RANK,CUME_DIST,PERCENT_RANK,NTILE 分析函数
# Lag
该函数返回指定列前一行的值.你可以用一个整数来指定想要获取行的偏移位置,默认为1:
select ticker,date_,close,lag(close,1) over(partition by ticker order by date_) as yesterday_price from acadgild.stocks
这里使用lag显示了昨日的收盘价,lag与over函数组合使用,在over函数中你可以使用partition或order .
# Lead
该函数返回指定列后一行的值.你可以用一个整数来指定想要获取行的偏移位置,默认为1.这里用Lead来比较当天的收盘价与下一天的收盘价:
select ticker,date_,close,case(lead(close,1) over(partition by ticker)-close)>0 when true then "higher" when false then "lesser" end as Changes from acadgild.stocks
找出2015-12-01日所有用户对每一章节的阅读时间。
SELECT userid, bookid, chapterid, end_time - start_time as read_time
FROM
(
SELECT userid, bookid, chapterid, log_time as start_time,
lead(log_time,1,null) over(partition by userid, bookid order by log_time) as end_time
FROM user_read_log where pt='2015-12-01'
) t;
2
3
4
5
6
7
# FIRST_VALUE
返回窗口中的第一行:
select ticker,first_value(high) over(partition by ticker) as first_high from acadgild.stocks
# LAST_VALUE
与FIRST_VALUE相反,它返回窗口最后一行:
select ticker,last_value(high) over(partition by ticker) as last_high from acadgild.stocks
# Count
根据窗口(over子句)范围进行计数:
select ticker,count(ticker) over(partition by ticker) as cnt from acadgild.stocks
每个partition根据ticker进行计数
# Sum
窗口(over子句)中的值进行总计计算,下面的查询对ticker进行分区,然后汇总每隔分区的收盘价.
select ticker,sum(close) over(partition by ticker) as total from acadgild.stocks
如果你要对volume_for_the_day进行汇总统计,则可以使用以下查询
select ticker,date_,volume_for_the_day,sum(volume_for_the_day) over(partition by ticker order by date_) as running_total from acadgild.stocks
SELECT
cookieid,
createtime,
pv,
SUM(pv) OVER(PARTITION BY cookieid ORDER BY createtime) AS pv1, -- 默认为从起点到当前行
SUM(pv) OVER(PARTITION BY cookieid ORDER BY createtime ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS pv2, --从起点到当前行,结果同pv1
SUM(pv) OVER(PARTITION BY cookieid) AS pv3, --分组内所有行
SUM(pv) OVER(PARTITION BY cookieid ORDER BY createtime ROWS BETWEEN 3 PRECEDING AND CURRENT ROW) AS pv4, --当前行+往前3行
SUM(pv) OVER(PARTITION BY cookieid ORDER BY createtime ROWS BETWEEN 3 PRECEDING AND 1 FOLLOWING) AS pv5, --当前行+往前3行+往后1行
SUM(pv) OVER(PARTITION BY cookieid ORDER BY createtime ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) AS pv6 ---当前行+往后所有行
FROM bihell;
cookieid createtime pv pv1 pv2 pv3 pv4 pv5 pv6
-----------------------------------------------------------------------------
cookie1 2015-04-10 1 1 1 26 1 6 26
cookie1 2015-04-11 5 6 6 26 6 13 25
cookie1 2015-04-12 7 13 13 26 13 16 20
cookie1 2015-04-13 3 16 16 26 16 18 13
cookie1 2015-04-14 2 18 18 26 17 21 10
cookie1 2015-04-15 4 22 22 26 16 20 8
cookie1 2015-04-16 4 26 26 26 13 13 4
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
显示每行的比例
如果我们想知道volume_for_the_day在总volume的百分比要怎么做呢:
select ticker,date_,volume_for_the_day,(volume_for_the_day*100/(sum(volume_for_the_day) over(partition by ticker))) from acadgild.stocks
# Min
返回窗口(over子句)中最小的数值.
select ticker, min(close) over(partition by ticker) as minimum from acadgild.stocks
# Max
返回窗口(over子句)中最大的数值.
select ticker, max(close) over(partition by ticker) as maximum from acadgild.stocks
# AVG
返回窗口(over子句)中的平均值.
select ticker, avg(close) over(partition by ticker) as average from acadgild.stocks
# Rank
根据窗口的over子句,进行数据排序,如果两个值相同则获得相同的序号,而紧随其后的序号会被直接跳过. 原文中的示例不好,所以替换掉了...
select a1,a2,rank () over (order by a1) rank from test;
A1 A2 RANK
---------- ---------- ----------
1 3 1
2 4 2
3 2 3
3 5 3
4 2 5
# Row_number
根据窗口的over子句,进行数据排序,连续的序号,不管是否有相同值
select ticker,close,row_number() over(partition by ticker order by close) as num from acadgild.stocks
# Dense_rank
与rank类似但是不会跳过值
select a1,a2, dense_rank () over (order by a1) rank from test;
A1 A2 RANK
---------- ---------- ----------
1 3 1
2 4 2
3 2 3
3 5 3
4 2 4
# Cume_dist
返回累计分布(cumulative distribution),返回值范围为0~1.比如一共有10个值,那么第一个的值为1/10,第二个为2/10 ...第十个10/10.
select ticker,cume_dist() over(partition by ticker order by close) as cummulative from acadgild.stocks
# Percent_rank
返回百分比排序.计算规则为 (排名-1)/(总行数-1). 如果返回结果只有一行的话,那么percent_rank的结果就是0
select ticker,close,percent_rank() over(partition by ticker order by close) as closing from acadgild.stocks
# Ntile
对返回的数据进行分桶(bucket), 如果你指定Ntile(5)那么它会创建5个桶.然后前20%的记录分到第一个桶.以此类推最后20%分到第5个.
select ticker,ntile(5) over(partition by ticker order by close ) as bucket from acadgild.stocks
# 三、自定义函数
当 Hive 提供的内置函数无法满足你的业务处理需要时,此时就可以考虑使用用户自定义 函数(UDF:user-defined function)。官方文档 (opens new window)
# 3.1 自定义 UDF 函数
- 需求
自定义一个 UDF 实现计算给定字符串的长度 - 创建Maven工程Hive
- 导入依赖
<dependencies>
<dependency>
<groupId>org.apache.hive</groupId>
<artifactId>hive-exec</artifactId>
<version>3.1.2</version>
</dependency>
</dependencies>
2
3
4
5
6
7
- 创建类
MyStringLength
code/java/Hive/udf/src/main/java/udf/MyStringLength.java
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
/**
* @author haseochen
*/
/**
 * Hive generic UDF that returns the length of its single argument's
 * string representation (registered as {@code my_len} in the examples below).
 */
public class MyStringLength extends GenericUDF {
    /**
     * Validates the argument list and declares the return type.
     *
     * @param arguments object inspectors describing the input argument types
     * @return object inspector for the function's {@code int} return value
     * @throws UDFArgumentException if the argument count or category is wrong
     */
    @Override
    public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
        // Exactly one argument is expected.
        if (arguments.length != 1) {
            throw new UDFArgumentLengthException("Input Args Length Error!!!");
        }
        // Only primitive types (string, int, ...) are accepted; the value is
        // stringified in evaluate(), so any primitive works.
        if (!arguments[0].getCategory().equals(ObjectInspector.Category.PRIMITIVE)
        ) {
            throw new UDFArgumentTypeException(0, "Input Args Type Error!!!");
        }
        // The function itself returns an int, so hand back an int inspector.
        return PrimitiveObjectInspectorFactory.javaIntObjectInspector;
    }
    /**
     * Computes the length of the argument's string representation.
     * <p>
     * NOTE(review): a NULL input yields 0 rather than NULL; returning NULL
     * would be the more conventional Hive behavior — confirm this is intended.
     *
     * @param arguments the deferred input values (exactly one)
     * @return the length of {@code toString()} of the value, or 0 for NULL
     * @throws HiveException on evaluation failure
     */
    @Override
    public Object evaluate(DeferredObject[] arguments) throws
            HiveException {
        if (arguments[0].get() == null) {
            return 0;
        }
        return arguments[0].get().toString().length();
    }
    /**
     * Text shown for this UDF in EXPLAIN plans. The original implementation
     * returned an empty string, which hides the call site in query plans;
     * use the standard "name(child, ...)" rendering instead.
     */
    @Override
    public String getDisplayString(String[] children) {
        return getStandardDisplayString("my_len", children);
    }
}
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
- 打包然后上传到服务器
- 将 jar 包添加到 hive 的 classpath
hive (default)> add jar ./udf-1.0-SNAPSHOT.jar
Added [./udf-1.0-SNAPSHOT.jar] to class path
Added resources: [./udf-1.0-SNAPSHOT.jar]
2
3
- 创建临时函数与开发好的 java class 关联
hive (default)> create temporary function my_len as "udf.MyStringLength";
- 使用函数
hive (default)> select my_len('test') from test;
OK
_c0
4
2
3
4
# 3.2 自定义UDTF函数
code/java/Hive/udf/src/main/java/udtf/MyUDTF.java
# 3.3 自定义UDTF炸裂出两个列
``