From de34439f6ca43a62646e26f92e63440e1edcefd7 Mon Sep 17 00:00:00 2001
From: Fokko Driesprong
Date: Wed, 31 Jan 2024 20:31:49 +0100
Subject: [PATCH] Add example with rename column

In Iceberg the tables are projected using field-IDs. Even if the column
is renamed (and Iceberg is lazy, so the table is not rewritten), it
should still read the original column.
---
 .../iceberg/docker-compose/provision.py      | 39 +++++++++++++++++-
 tests/integration/iceberg/test_table_load.py |  6 ++-
 2 files changed, 40 insertions(+), 5 deletions(-)

diff --git a/tests/integration/iceberg/docker-compose/provision.py b/tests/integration/iceberg/docker-compose/provision.py
index cce6362fdc..5ab42108b6 100644
--- a/tests/integration/iceberg/docker-compose/provision.py
+++ b/tests/integration/iceberg/docker-compose/provision.py
@@ -325,7 +325,7 @@

 spark.sql(
     """
-    CREATE OR REPLACE TABLE default.add_new_column
+    CREATE OR REPLACE TABLE default.test_add_new_column
     USING iceberg
     AS SELECT
     1 AS idx
@@ -336,5 +336,38 @@
 """
 )

-spark.sql("ALTER TABLE default.add_new_column ADD COLUMN name STRING")
-spark.sql("INSERT INTO default.add_new_column VALUES (3, 'abc'), (4, 'def')")
+spark.sql("ALTER TABLE default.test_add_new_column ADD COLUMN name STRING")
+spark.sql("INSERT INTO default.test_add_new_column VALUES (3, 'abc'), (4, 'def')")
+
+# In Iceberg the data and schema evolve independently. We can add a column
+# that should show up when querying the data, but is not yet represented in a Parquet file
+
+spark.sql(
+    """
+    CREATE OR REPLACE TABLE default.test_new_column_with_no_data
+    USING iceberg
+    AS SELECT
+    1 AS idx
+    UNION ALL SELECT
+    2 AS idx
+    UNION ALL SELECT
+    3 AS idx
+"""
+)
+
+spark.sql("ALTER TABLE default.test_new_column_with_no_data ADD COLUMN name STRING")
+
+spark.sql(
+    """
+    CREATE OR REPLACE TABLE default.test_table_rename
+    USING iceberg
+    AS SELECT
+    1 AS idx
+    UNION ALL SELECT
+    2 AS idx
+    UNION ALL SELECT
+    3 AS idx
+"""
+)
+
+spark.sql("ALTER TABLE default.test_table_rename RENAME COLUMN idx TO pos")
diff --git a/tests/integration/iceberg/test_table_load.py b/tests/integration/iceberg/test_table_load.py
index 4b7692e3c4..5c3616e67d 100644
--- a/tests/integration/iceberg/test_table_load.py
+++ b/tests/integration/iceberg/test_table_load.py
@@ -21,7 +21,7 @@ def test_daft_iceberg_table_open(local_iceberg_tables):


 WORKING_SHOW_COLLECT = [
-    # "test_all_types",  # ValueError: DaftError::ArrowError Not yet implemented: Deserializing type Decimal(10, 2) from parquet
+    "test_all_types",  # ValueError: DaftError::ArrowError Not yet implemented: Deserializing type Decimal(10, 2) from parquet
     "test_limit",
     "test_null_nan",
     "test_null_nan_rewritten",
@@ -37,7 +37,9 @@ def test_daft_iceberg_table_open(local_iceberg_tables):

     # "test_table_sanitized_character",  # Bug in scan().to_arrow().to_arrow()
     "test_table_version",  # we have bugs when loading no files
     "test_uuid_and_fixed_unpartitioned",
-    "add_new_column",
+    "test_add_new_column",
+    "test_new_column_with_no_data",
+    "test_table_rename",
 ]