# Presto
# 一、服务端安装
官方文档Deploying Presto (opens new window)
下载最新版本presto-server-0.247.tar.gz (opens new window)并解压
需要配置以下几项,详细解说见官方文档,这里只是本机快速配置
Node Properties: 每个节点的环境配置
JVM Config: 命令行的Java虚拟机选项
Config Properties: Presto服务器的配置
Catalog Properties: 连接器(数据源)的配置
# 1.1 Node Properties
文件位置etc/node.properties
node.environment=production
node.id=ffffffff-ffff-ffff-ffff-ffffffffffff
node.data-dir=/var/presto/data
1
2
3
2
3
# 1.2 JVM Config
文件位置etc/jvm.config
-server
-Xmx16G
-XX:+UseG1GC
-XX:G1HeapRegionSize=32M
-XX:+UseGCOverheadLimit
-XX:+ExplicitGCInvokesConcurrent
-XX:+HeapDumpOnOutOfMemoryError
-XX:+ExitOnOutOfMemoryError
1
2
3
4
5
6
7
8
2
3
4
5
6
7
8
# 1.3 Config Properties
文件位置etc/config.properties
coordinator=true
node-scheduler.include-coordinator=true
http-server.http.port=8080
query.max-memory=5GB
query.max-memory-per-node=1GB
query.max-total-memory-per-node=2GB
discovery-server.enabled=true
discovery.uri=http://example.net:8080
1
2
3
4
5
6
7
8
2
3
4
5
6
7
8
# 1.4 Catalog Properties
文件位置etc/catalog/xx.properties
详细内容参考官方Connectors (opens new window)链接
# 1.4.1 Kudu Connector
文件位置 etc/catalog/kudu.properties
内容如下:
connector.name=kudu
kudu.client.master-addresses=localhost
kudu.schema-emulation.enabled=true
kudu.schema-emulation.prefix=
1
2
3
4
2
3
4
查询例子
- 创建表
CREATE TABLE kudu.default.users (
user_id int WITH (primary_key = true),
first_name varchar,
last_name varchar
) WITH (
partition_by_hash_columns = ARRAY['user_id'],
partition_by_hash_buckets = 2
);
1
2
3
4
5
6
7
8
2
3
4
5
6
7
8
- 查看表Schema
DESCRIBE kudu.default.users;
1
- 插入数据
INSERT INTO kudu.default.users VALUES (1, 'Donald', 'Duck'), (2, 'Mickey', 'Mouse');
1
- 查询数据
SELECT * FROM kudu.default.users;
1
# 1.5 运行
后台运行
bin/launcher start
1
前台运行
bin/launcher run
1
停止
bin/launcher stop
1
# 1.6 命令行客户端安装
下载最新的客户端[Jar]https://repo1.maven.org/maven2/com/facebook/presto/presto-cli/0.247/presto-cli-0.247-executable.jar)包
下载好以后给予执行权限chmod +x
然后执行即可。--catalog
是连接器的名称
./presto --server localhost:8080 --catalog kudu --schema default
1
# 二、常用SQL
# 2.1 计算中位数
SELECT id, approx_percentile(value, 0.5)
FROM (
VALUES
(1, 2),
(1, 3),
(1, 7),
(1, 8),
(1, 4),
(2, 20),
(2, 30),
(2, 40)
) AS t(id, value)
GROUP BY id;
1
2
3
4
5
6
7
8
9
10
11
12
13
2
3
4
5
6
7
8
9
10
11
12
13
# 2.2 String日期转Date
Date and Time Functions and Operators (opens new window)
select from_iso8601_date('2021-03-03') + interval '2' day
date_diff('second',date_parse(bcc.live_start_date,'%Y-%m-%d %H:%i:%s'),date_parse(bcc.live_end_date,'%Y-%m-%d %H:%i:%s'))
1
2
2
# 2.3 导出数据
sql="
select distinct username from odl_user_uc where length(username) >3 and not regexp_like(username,'\w+')
"
presto --server namenode1:8093 --catalog hive --schema default --execute "${sql}" --output-format CSV_HEADER > test
1
2
3
4
2
3
4
# 2.4 行专列
select
t8.*
,order_id
from tablexxx t8
CROSS JOIN UNNEST(split(first_case_order_id,',')) as order_id(order_id)
1
2
3
4
5
2
3
4
5
-- hive
SELECT student,score FROM tests LATERAL VIEW explode(scores)t AS score
-- presto
SELECT student, score
FROM tests
CROSS JOIN UNNEST(scores) AS t (score);
1
2
3
4
5
6
7
2
3
4
5
6
7
# 2.5 日期相减
date_diff('day',date('2021-01-01'),date('2022-01-10'))
1
# 2.6 行转列
select
a.classscheduleid
,array_join(array_distinct(ARRAY_AGG(a.teacherid)),',') as teacherid
,array_join(array_distinct(ARRAY_AGG(b.realname)),',') as realname
,array_join(array_distinct(ARRAY_AGG(b.nationality)),',') as nationality
from odl_new_schedule_teachers_class a
left join odl_teacherpersonalinfo_class b on a.teacherid=b.teacherid
where a.isdeleted=false
and a.classscheduleid=6799756
group by a.classscheduleid
classscheduleid | teacherid | realname | nationality
-----------------+--------------------+---------------------+-------------
6799756 | 104490086,38422508 | username1,username2 | JP,CN
1
2
3
4
5
6
7
8
9
10
11
12
13
14
2
3
4
5
6
7
8
9
10
11
12
13
14
# 2.7 时间戳转换为时间
presto:default> SELECT date_format(from_unixtime(1612345678), '%Y-%m-%d') as date;
date
------------
2021-02-03
presto:default> SELECT date_format(from_unixtime(1612345678), '%Y-%m-%d %H:%i:%s') as datetime;
datetime
---------------------
2021-02-03 17:47:58
1
2
3
4
5
6
7
8
9
2
3
4
5
6
7
8
9
# 三、Tips
# 3.1 防止因为和hive不兼容跑数的时候报错
一般都是meta错误
set session hive.collect_column_statistics_on_write = false;
1