首先,如果您还没有安装 Docker 和 Docker Compose,请先安装它们。接下来,创建一个名为 docker-compose.yaml 的文件,内容如下。
# Minimal demo stack with two services: a Spark + Iceberg notebook container
# and a PostgreSQL container.
version: "3"

services:
  spark-iceberg:
    image: tabulario/spark-iceberg
    # Start the database container before Spark.
    depends_on:
      - postgres
    container_name: spark-iceberg
    environment:
      - SPARK_HOME=/opt/spark
      # Interpreter used by PySpark (the variable name must be PYSPARK_PYTHON).
      - PYSPARK_PYTHON=/usr/bin/python3.9
      - PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/opt/spark/bin
    volumes:
      # Host directories mounted into the container.
      - ./warehouse:/home/iceberg/warehouse
      - ./notebooks:/home/iceberg/notebooks/notebooks
    ports:
      # Quoted so the HOST:CONTAINER pairs are always read as strings.
      - "8888:8888"    # notebook server
      - "8080:8080"    # Spark driver
      - "18080:18080"  # Spark history
  postgres:
    image: postgres:13.4-bullseye
    container_name: postgres
    environment:
      # Demo-only credentials; do not reuse outside a local sandbox.
      - POSTGRES_USER=admin
      - POSTGRES_PASSWORD=password
      - POSTGRES_DB=demo_catalog
    volumes:
      # Keep the database files on the host.
      - ./postgres/data:/var/lib/postgresql/data
# Create and start the services from docker-compose.yaml in the background.
docker-compose up --detach
# Run pyspark-notebook inside the spark-iceberg container with an interactive terminal.
docker exec --interactive --tty spark-iceberg pyspark-notebook
一个运行在 http://localhost:8888 的功能齐全的 notebook 服务
一个运行在 http://localhost:8080 的 Spark driver UI
一个运行在 http://localhost:18080 的 Spark history UI
这个 docker-compose 文件提供的运行环境远不是一个大规模的生产级数据仓库,但它确实可以让你演示 Iceberg 的诸多特性。让我们快速介绍一下这个最小化的运行环境:
-- Schema and partition evolution on the taxis table.
-- The statements are ordered so that each one only references columns that
-- already exist: both renames run before anything refers to `fare` or
-- `distance`.

-- Rename columns.
ALTER TABLE taxis RENAME COLUMN fare_amount TO fare;
ALTER TABLE taxis RENAME COLUMN trip_distance TO distance;

-- Document, retype and reposition the renamed distance column.
ALTER TABLE taxis ALTER COLUMN distance COMMENT 'The elapsed trip distance in miles reported by the taximeter.';
ALTER TABLE taxis ALTER COLUMN distance TYPE double;
ALTER TABLE taxis ALTER COLUMN distance AFTER fare;

-- Add a new column placed directly after distance.
ALTER TABLE taxis
ADD COLUMN fare_per_distance_unit float AFTER distance;

-- Add VendorID as a partition field.
ALTER TABLE taxis
ADD PARTITION FIELD VendorID;
// Row count of the table as it is now.
spark.read.table("taxis").count() // 2,853,020

// Milliseconds in one day, and the current time in epoch milliseconds.
val ONE_DAY_MS = 86400000;
val NOW = System.currentTimeMillis()

// Read the table with the "as-of-timestamp" option set to 24 hours ago
// and count the rows.
(spark
  .read
  .option("as-of-timestamp", NOW - ONE_DAY_MS)
  .table("taxis")
  .count()) // 2,798,371
-- Roll the taxis table back using a timestamp.
CALL catalog_name.system.rollback_to_timestamp(
  table => 'taxis',
  timestamp => TIMESTAMP '2021-12-31 00:00:00.000'
)
-- Roll the taxis table back to a specific snapshot; replace <SNAPSHOT> with a snapshot ID.
CALL demo.system.rollback_to_snapshot(
  table => 'taxis',
  snapshot_id => <SNAPSHOT>
)
-- Delete every row whose distance is above 2.0 or whose
-- fare_per_distance_unit is above 4.0.
DELETE FROM taxis
WHERE distance > 2.0
   OR fare_per_distance_unit > 4.0
-- Insert-only merge: copy rows from the staging table into the production
-- table when no production row has the same id. Matched rows are untouched.
MERGE INTO prod.nyc.taxis pt
USING (SELECT * FROM staging.nyc.taxis) st
ON pt.id = st.id
WHEN NOT MATCHED THEN INSERT *;
-- Build prod.nyc.vendor2 from the taxis rows where vendor_id = '2'.
-- NOTE(review): `[CREATE|REPLACE]` looks like placeholder notation rather than
-- literal SQL; presumably the reader picks CREATE or REPLACE. Confirm.
-- NOTE(review): this filter uses `vendor_id`, while the partition field added
-- to taxis earlier in this file is named `VendorID`. Verify the column name.
[CREATE|REPLACE] TABLE prod.nyc.vendor2 AS
SELECT * FROM taxis
WHERE vendor_id = '2'
来源 | https://tabular.io/blog/docker-spark-and-iceberg
作者 | Sam Redai & Kyle Bendickson
翻译 | liliwei

