# 查询

# 数据类型

官方文档 LanguageManual Types (opens new window)

# 复杂类型使用示例

CREATE TABLE movies(
    participants ARRAY<string>,
    release_dates MAP<string,timestamp>,
    studio_addr STRUICT<state:string,city:string,zip:string,streetnbr:int,streetname:string,unit:string>,
    complex_participants MAP<string,STRUCT<address:string,attributes MAP<string,string>>>
    misc UNIONTYPE<int,string,ARRAY<double>>
);
1
2
3
4
5
6
7
select movie_name,
  release_dates["USA"],
  studio_addr.zip,
  complex_participants["Leonardo Dicaprio"].attributes["fav_color"],
  misc
from movies;
1
2
3
4
5
6

# Array

创建包含Array类型的表

create table mobilephones (
    id string,
    title string,
    cost float,
    colors array<string>,
    screen_size array<float>
);

insert into table mobilephones
select 
    "redminote7", "Redmi Note 7", 300,
    array("white", "silver", "black"), array(float(4.5))
UNION ALL
select 
    "motoGplus", "Moto G Plus", 200, array("black", "gold"),
    array(float(4.5), float(5.5));

select * from mobilephones;

select id, colors from mobilephones;

select id, colors[0] from mobilephones;
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22

执行数组类型字段的分隔符并导入数据

create table mobilephones (
id string,
title string,
cost float,
colors array<string>,
screen_size array<float>
)
row format delimited fields terminated by ','
collection items terminated by '#';

load data local inpath 'mobilephones.csv'
into table mobilephones;
1
2
3
4
5
6
7
8
9
10
11
12

查询数组

SELECT name,subordinates[0] FROM employees;
1

collect_set (会去重) collect_list(不去重)

    select cookie_id,collect_set(ad_id) as orders
    from click_log
    group by cookie_id;

-- 输出
    cookie_id        orders
    11        ["ad_101","ad_104"]
    22       ["ad_104","ad_102","ad_103"]
1
2
3
4
5
6
7
8

impala中不支持数组,但是可以用group_concat函数达到同样的效果

select 
    cookie_id,group_concat(ad_id,'|') as  orders
from click_log
group by cookie_id;
1
2
3
4

LATERAL VIEW

这里catalogs是数组. 通过LATERAL VIEW 语句让数组横向展示.

select click.cookie_id,ad.catalog from click_log click
left outer join (
  select ad_ikd,catalog from ad_list LATERAL VIEW OUTER expolode(catalogs) t AS catalog
) ad
on (click.ad_id = ad.ad_id);

数组排序

select ad_id,sort_array(catalogs) from ad_list;

查询数组是否包含某值

select ad_id,catalogs from ad_list where array_contains(catalogs,'catalog1');

# Map

create table mobilephones (
    id string,
    title string,
    cost float,
    colors array<string>,
    screen_size array<float>,
    features map<string, boolean>
)
row format delimited fields terminated by ','
collection items terminated by '#'
map keys terminated by ':';

Edit the file to add features

load data local inpath 'mobilephones.csv'
into table mobilephones;

select id, features[' camera'] from mobilephones;
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18

创建Map

-- 使用 map 函数,格式: (key1, value1, key2, value2, ...)
select map('key1',1,'key2',2)

-- 使用 str_to_map 函数,格式:str_to_map(text[, delimiter1, delimiter2])
-- Delimiter1用来分割键值对,默认是逗号','
-- Delimiter2用来区分键和值,默认为冒号':'
select str_to_map('aaa:11&bbb:22', '&', ':')
1
2
3
4
5
6
7

取key,value: map_keysmap_values

使用: maplateral view

查看大小:size(Map<K.V>)

查看是否包含某个key:array_contains(a,'test') from t1

查找key中包含item8的行

select * from f_orders where array_contains(map_keys(items),'item8');

侧向显示items

select user_id,order_id,item,amount from f_orders LATERAL VIEW explode(items) t AS item,amount;

侧向展示,并显示在原来map中的位置,貌似只能用数组

select username, pos, ts, page_id from ts_int
lateral view posexplode(visits) t as pos, ts, page_id;

将两个字段之前相差的数字进行行转列

select  t.f1
    ,t.start_r - pe.i as seq_no 

from   (select 'ABC' as f1,62971 as start_r,62937 as end_r)                t 
    lateral view posexplode(split(space(start_r - end_r),' ')) pe as i,s
;

# Struct

create table mobilephones (
    id string,
    title string,
    cost float,
    colors array<string>,
    screen_size array<float>,
    features map<string, boolean>,
    information struct<battery:string,camera:string>
)
row format delimited fields terminated by ','
collection items terminated by '#'
map keys terminated by ':';

--Edit the file to add information

load data local inpath 'mobilephones.csv'
into table mobilephones;

select id, features, information
from mobilephones;
select id, features['camera'], information.battery
from mobilephones;
select id, features['camera'] as CameraPresent, information.battery
from mobilephones;
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24

# 查询

# Grouping Sets,Cube,Rollup

Grouping Sets

    SELECT a, b, c, SUM(d) FROM t1 GROUP BY a, b WITH CUBE
    --相当于
    SELECT a, b, SUM(c) FROM t1 GROUP BY a, b
    UNION ALL
    SELECT a, NULL, SUM(c) FROM t1 GROUP BY a
1
2
3
4
5
    SELECT a, b, SUM(c) FROM t1 GROUP BY a, b GROUPING SETS (a,b,())
    --相当于
    SELECT a, NULL, SUM(c) FROM t1 GROUP BY a
    UNION ALL
    SELECT NULL, b, SUM(c) FROM t1 GROUP BY b
    UNION ALL
    SELECT NULL, NULL, SUM(c) FROM t1
1
2
3
4
5
6
7

Cube

    SELECT a, b, c, SUM(d) FROM t1 GROUP BY a, b WITH CUBE
    --相当于
    SELECT a, b, c, SUM(d) FROM t1 GROUP BY a, b, c GROUPING SETS
    ((a,b,c),(a,b),(b,c),(a,c),a,b,c,())
1
2
3
4

Rollup

    SELECT a, b, c, SUM(d) FROM t1 GROUP BY a, b WITH ROLLUP
    --相当于
    SELECT a, b, c, SUM(d) FROM t1 GROUP BY a, b, c GROUPING SETS
    ((a,b,c),(a,b),a,())
1
2
3
4

# 显示函数

SHOW FUNCTIONS;
DESCRIBE FUNCTION length;

# Lateral View

Lateral View通过UDTF函数作为输入,然后提供组合的查询结果。

--语法
SELECT a,b,columnAlias
FROM baseTable
LATERAL VIEW UDTF(expression) tableAlias AS columnAlias;

--例子
SELECT a,b,col1,col2
FROM baseTable
LATERAL VIEW UDTF(x )t1 AS col1
LATERAL VIEW UDTF(col1)52 AS col2;
1
2
3
4
5
6
7
8
9
10

# RLIKE 语句

RLIKE可以让我们可以在Hive中使用Java正则表达式进行查询。

SELECT name,address.street
FROM employees WHERE address.street RLIKE '.*(Chicago|Ontario).*';
1
2

通过正则选择字段

SELECT语句支持通过正则表达式选择字段,用的是Java的正则语法,可以通过该网站进行正则的验证http://www.fileformat.info/tool/regex.htm (opens new window),以下语句表示选择除ds和hr之外的所有字段。

SELECT `(ds|hr)?+.+` FROM sales
1

TIP

实际应用中不建议使用,维护和理解都会有问题。

# JOIN

HIVE的JOIN类型有很多,Inner Join、LEFT OUTER Join、RIGHT OUTER Join、FULL OUTER Join (opens new window)、LEFT SEMI Join、Cartesian Product Join、Map-side Join。挑几个说下。

# LEFT SEMI JOIN

返回符合ON谓词条件的左边表的记录。比Inner JOIN效率高

SELECT a.val FROM a LEFT SEMI JOIN b ON (a.key = b.key);

--与下面语句等价
SELECT a.val FROM a WHERE a.key IN (SELECT b.key FROM b) - Not Supported
SELECT a.val FROM a WHERE EXISTS (SELECT 1 FROM b WHERE b.key = a.key) - Not Supported
1
2
3
4
5

# Cartesian Product Join

左右两边笛卡尔积

SELECT * FROM CUSTOMERS JOIN ORDERS;
1

# Map-side Join

根据mapjoin的计算原理,MAPJION会把小表全部读入内存中,在map阶段直接拿另外一个表的数据和内存中表数据做匹配。

SELECT /*+MAPJOIN(o)*/ c.ID,c.NAME,o.AMOUNT
FROM CUSTOMERS c JOIN ORDERS o
ON (c.ID=o.CUSTOMER_ID)
1
2
3

# 其他

# Job合并文件

set hive.merge.smallfiles.avgsize=256000000;当输出平均大小小于该值,启用新的job合并文件。
set hive.merge.size.per.task = 64000000;合并后的文件大小。

# 限制用户查询

设置以下选项用户查询数据的时候必须指定分区和返回数量

配置文件中

<property>
  <name>hive.mapred.mode</name>
  <value>strict</value>
</property>

或者

set hive.mapred.mode=strict;

# 压缩

设置压缩

-- SEQUENCEFILE
set hive.exec.compress.output=true;
set mapred.output.compress=true;
set mapred.output.compression.codec=org.apache.hadoop.io.compress.SnappyCodec;
set mapred.output.compression.type=BLOCK;
select count(1) from odl_order_eb_snappy;
-- ORC
set hive.exec.orc.default.compress=SNAPPY
1
2
3
4
5
6
7
8

或者

<property>
  <name>hive.exec.compress.output</name>
  <value>true</value>
</property>
<property>
  <name>mapred.output.compression.codec</name>
  <value>org.apache.hadoop.io.compress.SnappyCodec</value>
</property>
<property>
  <name>mapred.output.compression.type</name>
  <value>BLOCK</value>
</property>
更新时间: 9/28/2021, 6:54:05 PM