# Functions

# 1. Common Functions

# Generate a random number

SELECT regexp_replace(reflect('java.util.UUID','randomUUID'), '-', '') as row_num

# COALESCE: return the first non-null value

COALESCE(t2.category,1) category
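A minimal sketch of how this fragment is typically used, assuming hypothetical tables t1 and t2 joined with a LEFT JOIN; when there is no match, t2.category is NULL and the default value 1 is returned instead:

select t1.item_id,
       COALESCE(t2.category, 1) as category  -- fall back to 1 when t2.category is NULL
from t1
left join t2 on t1.item_id = t2.item_id;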

# Data masking with the translate function

select user_id,birthday,translate(birthday,'0123456789','1234567890') from d_users;
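For illustration (the input value is hypothetical): translate replaces each character found in '0123456789' with the character at the same position in '1234567890', so every digit is shifted by one and the birthday keeps its format while the real value is hidden:

select translate('19900101', '0123456789', '1234567890');  -- returns '20011212'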

# Show the current time

select from_unixtime(unix_timestamp())

# Time conversion

regexp_replace(from_unixtime(UNIX_TIMESTAMP(paydate,'MMM/dd/yyyy hh:mm:ss a')),'-','/')
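A worked example with a hypothetical literal value, assuming paydate holds strings such as 'Dec/01/2015 10:30:45 PM': unix_timestamp parses the string with the given pattern, from_unixtime renders it back in the default 'yyyy-MM-dd HH:mm:ss' format, and regexp_replace swaps the dashes for slashes:

select regexp_replace(from_unixtime(unix_timestamp('Dec/01/2015 10:30:45 PM','MMM/dd/yyyy hh:mm:ss a')),'-','/');
-- 2015/12/01 22:30:45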

# 2. Window Functions

  • Lead: the number of rows to lead is optional, defaulting to 1. Returns NULL if the row after the current row falls beyond the end of the window.
  • Lag: the number of rows to lag is optional, defaulting to 1. Returns NULL if the row before the current row falls beyond the start of the window.
  • FIRST_VALUE
  • LAST_VALUE
  • COUNT, SUM, MIN, MAX, AVG: aggregate functions that can be used in an OVER clause
  • RANK, ROW_NUMBER, DENSE_RANK, CUME_DIST, PERCENT_RANK, NTILE: analytic functions

# Lag

This function returns the value of the specified column from the preceding row. You can pass an integer to specify the row offset; the default is 1:

select ticker,date_,close,lag(close,1) over(partition by ticker order by date_) as yesterday_price from acadgild.stocks

Here lag is used to show the previous day's closing price. lag is used together with the over clause, in which you can use partition and/or order.

# Lead

This function returns the value of the specified column from the following row. You can pass an integer to specify the row offset; the default is 1. Here Lead is used to compare the day's closing price with the next day's closing price:

select ticker,date_,close,case (lead(close,1) over(partition by ticker order by date_)-close)>0 when true then "higher" when false then "lesser" end as Changes from acadgild.stocks

Find how much time every user spent reading each chapter on 2015-12-01.

SELECT userid, bookid, chapterid, end_time - start_time as read_time
FROM
(
    SELECT userid, bookid, chapterid, log_time as start_time,
    lead(log_time,1,null) over(partition by userid, bookid order by log_time) as end_time
    FROM user_read_log where pt='2015-12-01') t;

# FIRST_VALUE

Returns the value of the first row in the window:

select ticker,first_value(high) over(partition by ticker) as first_high from acadgild.stocks

# LAST_VALUE

The opposite of FIRST_VALUE; it returns the value of the last row in the window:

select ticker,last_value(high) over(partition by ticker) as last_high from acadgild.stocks
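A note not in the original: if you add an ORDER BY inside the OVER clause, the default window frame ends at the current row, so last_value would then return the current row's value rather than the partition's last. A minimal sketch with an explicit frame, assuming you want the most recent high per ticker ordered by date_:

select ticker, last_value(high) over(partition by ticker order by date_ rows between unbounded preceding and unbounded following) as latest_high from acadgild.stocks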

# Count

Counts rows within the window (the OVER clause):

select ticker,count(ticker) over(partition by ticker) as cnt from acadgild.stocks

Each partition, one per ticker, is counted separately.

# Sum

Totals the values within the window (the OVER clause). The query below partitions by ticker and then sums each partition's closing prices.

select ticker,sum(close) over(partition by ticker) as total from acadgild.stocks

If you want a running total of volume_for_the_day, you can use the following query:

select ticker,date_,volume_for_the_day,sum(volume_for_the_day) over(partition by ticker order by date_) as running_total from acadgild.stocks
SELECT 
    cookieid,  
    createtime,  
    pv,  
    SUM(pv) OVER(PARTITION BY cookieid ORDER BY createtime) AS pv1, -- default frame: from the start of the partition to the current row
    SUM(pv) OVER(PARTITION BY cookieid ORDER BY createtime ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS pv2, -- from the start to the current row, same result as pv1
    SUM(pv) OVER(PARTITION BY cookieid) AS pv3, -- all rows in the partition
    SUM(pv) OVER(PARTITION BY cookieid ORDER BY createtime ROWS BETWEEN 3 PRECEDING AND CURRENT ROW) AS pv4,  -- current row + previous 3 rows
    SUM(pv) OVER(PARTITION BY cookieid ORDER BY createtime ROWS BETWEEN 3 PRECEDING AND 1 FOLLOWING) AS pv5,  -- current row + previous 3 rows + next 1 row
    SUM(pv) OVER(PARTITION BY cookieid ORDER BY createtime ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) AS pv6  -- current row + all following rows
FROM bihell;  
   
cookieid createtime     pv      pv1     pv2     pv3     pv4     pv5      pv6   
-----------------------------------------------------------------------------  
cookie1  2015-04-10      1       1       1       26      1       6       26  
cookie1  2015-04-11      5       6       6       26      6       13      25  
cookie1  2015-04-12      7       13      13      26      13      16      20  
cookie1  2015-04-13      3       16      16      26      16      18      13  
cookie1  2015-04-14      2       18      18      26      17      21      10  
cookie1  2015-04-15      4       22      22      26      16      20      8  
cookie1  2015-04-16      4       26      26      26      13      13      4  

Showing each row's proportion

What if we want to know volume_for_the_day as a percentage of the total volume:

select ticker,date_,volume_for_the_day,(volume_for_the_day*100/(sum(volume_for_the_day) over(partition by ticker))) from acadgild.stocks

# Min

Returns the smallest value within the window (the OVER clause):

select ticker, min(close) over(partition by ticker) as minimum from acadgild.stocks

# Max

Returns the largest value within the window (the OVER clause):

select ticker, max(close) over(partition by ticker) as maximum from acadgild.stocks

# AVG

Returns the average of the values within the window (the OVER clause):

select ticker, avg(close) over(partition by ticker) as average from acadgild.stocks

# Rank

Sorts the data according to the window's OVER clause. Rows with equal values receive the same rank, and the following rank is skipped. The example in the original article wasn't good, so it has been replaced...

 select a1,a2,rank () over (order by a1) rank from test;
A1 A2 RANK
---------- ---------- ----------
1 3 1
2 4 2
3 2 3
3 5 3
4 2 5

# Row_number

Sorts the data according to the window's OVER clause and assigns consecutive numbers, regardless of ties:

select ticker,close,row_number() over(partition by ticker order by close) as num from acadgild.stocks

# Dense_rank

Similar to rank, but ranks are not skipped:

select a1,a2, dense_rank () over (order by a1) rank from test;
A1 A2 RANK
---------- ---------- ----------
1 3 1
2 4 2
3 2 3
3 5 3
4 2 4

# Cume_dist

Returns the cumulative distribution, a value between 0 and 1. For example, with 10 values in total, the first is 1/10, the second 2/10, ... and the tenth 10/10.

select ticker,cume_dist() over(partition by ticker order by close) as cummulative from acadgild.stocks

# Percent_rank

Returns the percentile rank, computed as (rank - 1) / (total rows - 1). If the result set contains only one row, percent_rank is 0.

select ticker,close,percent_rank() over(partition by ticker order by close) as closing from acadgild.stocks

# Ntile

Splits the returned rows into buckets. If you specify ntile(5), it creates 5 buckets: the first 20% of the rows go into the first bucket, and so on, with the last 20% going into the fifth.

select ticker,ntile(5) over(partition by ticker order by close ) as bucket from acadgild.stocks

# 3. User-Defined Functions

When the built-in functions that Hive provides cannot meet your business needs, you can consider writing a user-defined function (UDF: user-defined function). See the official documentation.

# 3.1 Custom UDF

  1. Requirement
    Write a UDF that computes the length of a given string.
  2. Create a Maven project named Hive
  3. Add the dependency
    <dependencies>
        <dependency>
            <groupId>org.apache.hive</groupId>
            <artifactId>hive-exec</artifactId>
            <version>3.1.2</version>
        </dependency>
    </dependencies>
  4. Create the class MyStringLength

code/java/Hive/udf/src/main/java/udf/MyStringLength.java

package udf;

import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;

/**
 * @author haseochen
 */
public class MyStringLength extends GenericUDF {

    /**
     *
     * @param arguments ObjectInspectors for the input argument types
     * @return ObjectInspector for the return type
     * @throws UDFArgumentException
     */
    @Override
    public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {


        // Check the number of input arguments
        if (arguments.length != 1) {
            throw new UDFArgumentLengthException("Input Args Length Error!!!");
        }
        // Check the type of the input argument
        if (!arguments[0].getCategory().equals(ObjectInspector.Category.PRIMITIVE)) {
            throw new UDFArgumentTypeException(0, "Input Args Type Error!!!");

        }
        // The function itself returns an int, so return an int ObjectInspector
        return PrimitiveObjectInspectorFactory.javaIntObjectInspector;
    }

    /**
     * The function's core logic
     * @param arguments the input arguments
     * @return the computed result
     * @throws HiveException
     */
    @Override
    public Object evaluate(DeferredObject[] arguments) throws
            HiveException {
        if(arguments[0].get() == null){
            return 0;
        }
        return arguments[0].get().toString().length();
    }
    @Override
    public String getDisplayString(String[] children) {
        return ""; }
}
  5. Package the jar and upload it to the server
  6. Add the jar to Hive's classpath
hive (default)> add jar ./udf-1.0-SNAPSHOT.jar
Added [./udf-1.0-SNAPSHOT.jar] to class path
Added resources: [./udf-1.0-SNAPSHOT.jar]
  7. Create a temporary function associated with the compiled Java class
hive (default)> create temporary function my_len as "udf.MyStringLength";
  8. Use the function
hive (default)> select my_len('test') from test;
OK
_c0
4

# 3.2 Custom UDTF

code/java/Hive/udf/src/main/java/udtf/MyUDTF.java
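The Java implementation is only referenced by the path above and is not included here. As a rough usage sketch (table and column names are hypothetical), a UDTF built from that class is registered like the UDF and is usually called either directly in the SELECT list or through LATERAL VIEW:

add jar ./udf-1.0-SNAPSHOT.jar;
create temporary function my_udtf as "udtf.MyUDTF";

-- a UDTF emits zero or more rows per input row
select my_udtf(line) from raw_lines;

-- or joined back to the source rows with LATERAL VIEW
select r.id, t.word from raw_lines r lateral view my_udtf(r.line) t as word;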

# 3.3 Custom UDTF that explodes into two columns

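The original entry here is empty. As a hypothetical call pattern (function, table, and column names are assumptions), a UDTF that emits two columns is used the same way; LATERAL VIEW simply lists both output column aliases:

select r.id, t.k, t.v from raw_lines r lateral view my_pair_udtf(r.line) t as k, v;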
