# Presto

# 一、服务端安装

官方文档Deploying Presto (opens new window)

下载最新版本presto-server-0.247.tar.gz (opens new window)并解压

需要配置以下几项,详细解说见官方文档,这里只是本机快速配置

  • Node Properties: 每个节点的环境配置

  • JVM Config: 命令行的Java虚拟机选项

  • Config Properties: Presto服务器的配置

  • Catalog Properties: 连接器(数据源)的配置

# 1.1 Node Properties

文件位置etc/node.properties

node.environment=production
node.id=ffffffff-ffff-ffff-ffff-ffffffffffff
node.data-dir=/var/presto/data
1
2
3

# 1.2 JVM Config

文件位置etc/jvm.config

-server
-Xmx16G
-XX:+UseG1GC
-XX:G1HeapRegionSize=32M
-XX:+UseGCOverheadLimit
-XX:+ExplicitGCInvokesConcurrent
-XX:+HeapDumpOnOutOfMemoryError
-XX:+ExitOnOutOfMemoryError
1
2
3
4
5
6
7
8

# 1.3 Config Properties

文件位置etc/config.properties

coordinator=true
node-scheduler.include-coordinator=true
http-server.http.port=8080
query.max-memory=5GB
query.max-memory-per-node=1GB
query.max-total-memory-per-node=2GB
discovery-server.enabled=true
discovery.uri=http://example.net:8080
1
2
3
4
5
6
7
8

# 1.4 Catalog Properties

文件位置etc/catalog/xx.properties

详细内容参考官方Connectors (opens new window)链接

# 1.4.1 Kudu Connector

文件位置 etc/catalog/kudu.properties

内容如下:

connector.name=kudu
kudu.client.master-addresses=localhost
kudu.schema-emulation.enabled=true
kudu.schema-emulation.prefix=
1
2
3
4

查询例子

  1. 创建表
CREATE TABLE kudu.default.users (
  user_id int WITH (primary_key = true),
  first_name varchar,
  last_name varchar
) WITH (
  partition_by_hash_columns = ARRAY['user_id'],
  partition_by_hash_buckets = 2
);
1
2
3
4
5
6
7
8
  1. 查看表Schema
DESCRIBE kudu.default.users;
1
  1. 插入数据
INSERT INTO kudu.default.users VALUES (1, 'Donald', 'Duck'), (2, 'Mickey', 'Mouse');
1
  1. 查询数据
SELECT * FROM kudu.default.users;
1

# 1.5 运行

后台运行

bin/launcher start
1

前台运行

bin/launcher run
1

停止

bin/launcher stop
1

# 1.6 命令行客户端安装

下载最新的客户端[Jar]https://repo1.maven.org/maven2/com/facebook/presto/presto-cli/0.247/presto-cli-0.247-executable.jar)包

下载好以后给予执行权限chmod +x 然后执行即可。--catalog 是连接器的名称

./presto --server localhost:8080 --catalog kudu --schema default
1

# 二、常用SQL

# 2.1 计算中位数

SELECT id, approx_percentile(value, 0.5)
FROM (
    VALUES
        (1, 2),
        (1, 3),
        (1, 7),
        (1, 8),
        (1, 4),
        (2, 20),
        (2, 30),
        (2, 40)
) AS t(id, value)
GROUP BY id;
1
2
3
4
5
6
7
8
9
10
11
12
13

# 2.2 String日期转Date

Date and Time Functions and Operators (opens new window)

select from_iso8601_date('2021-03-03') + interval '2' day
date_diff('second',date_parse(bcc.live_start_date,'%Y-%m-%d %H:%i:%s'),date_parse(bcc.live_end_date,'%Y-%m-%d %H:%i:%s'))
1
2

# 2.3 导出数据

sql="
select distinct username from odl_user_uc where length(username) >3 and not regexp_like(username,'\w+')
"
presto --server namenode1:8093 --catalog hive --schema default --execute "${sql}" --output-format CSV_HEADER > test
1
2
3
4

# 2.4 行专列

    select 
        t8.*
        ,order_id
    from tablexxx t8
    CROSS JOIN UNNEST(split(first_case_order_id,','))  as order_id(order_id)
1
2
3
4
5
-- hive 
SELECT student,score FROM tests LATERAL VIEW explode(scores)t AS score

-- presto
SELECT student, score
FROM tests
CROSS JOIN UNNEST(scores) AS t (score);
1
2
3
4
5
6
7

# 2.5 日期相减

date_diff('day',date('2021-01-01'),date('2022-01-10'))
1

# 2.6 行转列

select 
    a.classscheduleid
    ,array_join(array_distinct(ARRAY_AGG(a.teacherid)),',') as teacherid
    ,array_join(array_distinct(ARRAY_AGG(b.realname)),',') as realname
    ,array_join(array_distinct(ARRAY_AGG(b.nationality)),',') as nationality
from odl_new_schedule_teachers_class a
left join odl_teacherpersonalinfo_class b on a.teacherid=b.teacherid
where a.isdeleted=false
and a.classscheduleid=6799756
group by a.classscheduleid

 classscheduleid |     teacherid      |      realname       | nationality
-----------------+--------------------+---------------------+-------------
         6799756 | 104490086,38422508 | username1,username2 | JP,CN
1
2
3
4
5
6
7
8
9
10
11
12
13
14

# 2.7 时间戳转换为时间

presto:default> SELECT date_format(from_unixtime(1612345678), '%Y-%m-%d') as date;
    date
------------
 2021-02-03

presto:default> SELECT date_format(from_unixtime(1612345678), '%Y-%m-%d %H:%i:%s') as datetime;
      datetime
---------------------
 2021-02-03 17:47:58
1
2
3
4
5
6
7
8
9

# 三、Tips

# 3.1 防止因为和hive不兼容跑数的时候报错

一般都是meta错误

set session hive.collect_column_statistics_on_write = false;
1
更新时间: 6/30/2023, 4:08:06 PM