From 1165630b909ac76aa5e391a243eb75da1052a937 Mon Sep 17 00:00:00 2001 From: DB Tsai Date: Wed, 26 Jul 2023 14:26:40 -0700 Subject: [PATCH 1/9] Change to use zstd by default --- core/src/main/java/org/apache/iceberg/TableProperties.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core/src/main/java/org/apache/iceberg/TableProperties.java b/core/src/main/java/org/apache/iceberg/TableProperties.java index b14354def6ac..d9fd8addc042 100644 --- a/core/src/main/java/org/apache/iceberg/TableProperties.java +++ b/core/src/main/java/org/apache/iceberg/TableProperties.java @@ -142,12 +142,12 @@ private TableProperties() {} public static final String PARQUET_COMPRESSION = "write.parquet.compression-codec"; public static final String DELETE_PARQUET_COMPRESSION = "write.delete.parquet.compression-codec"; - public static final String PARQUET_COMPRESSION_DEFAULT = "gzip"; + public static final String PARQUET_COMPRESSION_DEFAULT = "zstd"; public static final String PARQUET_COMPRESSION_LEVEL = "write.parquet.compression-level"; public static final String DELETE_PARQUET_COMPRESSION_LEVEL = "write.delete.parquet.compression-level"; - public static final String PARQUET_COMPRESSION_LEVEL_DEFAULT = null; + public static final String PARQUET_COMPRESSION_LEVEL_DEFAULT = null; // For zstd, it is default to "3" public static final String PARQUET_ROW_GROUP_CHECK_MIN_RECORD_COUNT = "write.parquet.row-group-check-min-record-count"; From 65e8610ef4e98000b318a46600aad541295f25d0 Mon Sep 17 00:00:00 2001 From: DB Tsai Date: Wed, 26 Jul 2023 14:41:39 -0700 Subject: [PATCH 2/9] update doc --- docs/configuration.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/configuration.md b/docs/configuration.md index 7fa2d94adf91..6b4d8c8c00e4 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -55,8 +55,8 @@ Iceberg tables support table properties to configure table behavior, like the de | write.parquet.page-size-bytes | 1048576 (1 MB) | Parquet page size | | write.parquet.page-row-limit | 20000 | Parquet page row limit | | write.parquet.dict-size-bytes | 2097152 (2 MB) | Parquet dictionary page size | -| write.parquet.compression-codec | gzip | Parquet compression codec: zstd, brotli, lz4, gzip, snappy, uncompressed | -| write.parquet.compression-level | null | Parquet compression level | +| write.parquet.compression-codec | zstd | Parquet compression codec: zstd, brotli, lz4, gzip, snappy, uncompressed | +| write.parquet.compression-level | null | Parquet compression level (zstd internally uses 3 as default if not set) | | write.parquet.bloom-filter-enabled.column.col1 | (not set) | Hint to parquet to write a bloom filter for the column: col1 | | write.parquet.bloom-filter-max-bytes | 1048576 (1 MB) | The maximum number of bytes for a bloom filter bitset | | write.avro.compression-codec | gzip | Avro compression codec: gzip(deflate with 9 level), zstd, snappy, uncompressed | From 2b88a7b0e338bd4a785690791b435792d3e4f447 Mon Sep 17 00:00:00 2001 From: DB Tsai Date: Wed, 26 Jul 2023 15:13:28 -0700 Subject: [PATCH 3/9] update --- .palantir/revapi.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.palantir/revapi.yml b/.palantir/revapi.yml index 5ac91ec0a96f..86cdf47bd4d0 100644 --- a/.palantir/revapi.yml +++ b/.palantir/revapi.yml @@ -774,6 +774,11 @@ acceptedBreaks: - code: "java.method.removed" old: "method org.apache.iceberg.view.ViewBuilder org.apache.iceberg.view.ViewBuilder::withQueryColumnNames(java.util.List)" justification: "Acceptable break due to updating View APIs and the View Spec" + org.apache.iceberg:iceberg-core: + - code: "java.field.constantValueChanged" + old: "field org.apache.iceberg.TableProperties.PARQUET_COMPRESSION_DEFAULT" + new: "field org.apache.iceberg.TableProperties.PARQUET_COMPRESSION_DEFAULT" + justification: "{Changing the default compression codec from gzip to zstd}" apache-iceberg-0.14.0: org.apache.iceberg:iceberg-api: - code: "java.class.defaultSerializationChanged" From 844e78e64e25c1d1ddc6c1b9a2430b9a9cc9f8d1 Mon Sep 17 00:00:00 2001 From: DB Tsai Date: Wed, 26 Jul 2023 15:52:58 -0700 Subject: [PATCH 4/9] style --- core/src/main/java/org/apache/iceberg/TableProperties.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/core/src/main/java/org/apache/iceberg/TableProperties.java b/core/src/main/java/org/apache/iceberg/TableProperties.java index d9fd8addc042..962bc851de87 100644 --- a/core/src/main/java/org/apache/iceberg/TableProperties.java +++ b/core/src/main/java/org/apache/iceberg/TableProperties.java @@ -147,7 +147,8 @@ private TableProperties() {} public static final String PARQUET_COMPRESSION_LEVEL = "write.parquet.compression-level"; public static final String DELETE_PARQUET_COMPRESSION_LEVEL = "write.delete.parquet.compression-level"; - public static final String PARQUET_COMPRESSION_LEVEL_DEFAULT = null; // For zstd, it is default to "3" + public static final String PARQUET_COMPRESSION_LEVEL_DEFAULT = + null; // For zstd, it is default to "3" public static final String PARQUET_ROW_GROUP_CHECK_MIN_RECORD_COUNT = "write.parquet.row-group-check-min-record-count"; From f2309ad261923f536b823cfde438f8be460675c6 Mon Sep 17 00:00:00 2001 From: DB Tsai Date: Thu, 27 Jul 2023 10:06:47 -0700 Subject: [PATCH 5/9] Fix test failures --- .../iceberg/flink/source/TestMetadataTableReadableMetrics.java | 2 +- .../iceberg/flink/source/TestMetadataTableReadableMetrics.java | 2 +- .../iceberg/flink/source/TestMetadataTableReadableMetrics.java | 2 +- .../iceberg/spark/source/TestMetadataTableReadableMetrics.java | 2 +- .../iceberg/spark/source/TestMetadataTableReadableMetrics.java | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/TestMetadataTableReadableMetrics.java b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/TestMetadataTableReadableMetrics.java index cb6fda18a1ee..87402787276f 100644 --- a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/TestMetadataTableReadableMetrics.java +++ b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/TestMetadataTableReadableMetrics.java @@ -291,7 +291,7 @@ public void testSelectNestedValues() throws Exception { public void testNestedValues() throws Exception { createNestedTable(); - Row leafDoubleCol = Row.of(53L, 3L, 1L, 1L, 0.0D, 0.0D); + Row leafDoubleCol = Row.of(46L, 3L, 1L, 1L, 0.0D, 0.0D); Row leafLongCol = Row.of(54L, 3L, 1L, null, 0L, 1L); Row metrics = Row.of(Row.of(leafDoubleCol, leafLongCol)); diff --git a/flink/v1.16/flink/src/test/java/org/apache/iceberg/flink/source/TestMetadataTableReadableMetrics.java b/flink/v1.16/flink/src/test/java/org/apache/iceberg/flink/source/TestMetadataTableReadableMetrics.java index cb6fda18a1ee..87402787276f 100644 --- a/flink/v1.16/flink/src/test/java/org/apache/iceberg/flink/source/TestMetadataTableReadableMetrics.java +++ b/flink/v1.16/flink/src/test/java/org/apache/iceberg/flink/source/TestMetadataTableReadableMetrics.java @@ -291,7 +291,7 @@ public void testSelectNestedValues() throws Exception { public void testNestedValues() throws Exception { createNestedTable(); - Row leafDoubleCol = Row.of(53L, 3L, 1L, 1L, 0.0D, 0.0D); + Row leafDoubleCol = Row.of(46L, 3L, 1L, 1L, 0.0D, 0.0D); Row leafLongCol = Row.of(54L, 3L, 1L, null, 0L, 1L); Row metrics = Row.of(Row.of(leafDoubleCol, leafLongCol)); diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestMetadataTableReadableMetrics.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestMetadataTableReadableMetrics.java index cb6fda18a1ee..87402787276f 100644 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestMetadataTableReadableMetrics.java +++ b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestMetadataTableReadableMetrics.java @@ -291,7 +291,7 @@ public void testSelectNestedValues() throws Exception { public void testNestedValues() throws Exception { createNestedTable(); - Row leafDoubleCol = Row.of(53L, 3L, 1L, 1L, 0.0D, 0.0D); + Row leafDoubleCol = Row.of(46L, 3L, 1L, 1L, 0.0D, 0.0D); Row leafLongCol = Row.of(54L, 3L, 1L, null, 0L, 1L); Row metrics = Row.of(Row.of(leafDoubleCol, leafLongCol)); diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestMetadataTableReadableMetrics.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestMetadataTableReadableMetrics.java index 416d5eed5b65..5285158ec99d 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestMetadataTableReadableMetrics.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestMetadataTableReadableMetrics.java @@ -326,7 +326,7 @@ public void testSelectNestedValues() throws Exception { public void testNestedValues() throws Exception { createNestedTable(); - Object[] leafDoubleCol = row(53L, 3L, 1L, 1L, 0.0D, 0.0D); + Object[] leafDoubleCol = row(46L, 3L, 1L, 1L, 0.0D, 0.0D); Object[] leafLongCol = row(54L, 3L, 1L, null, 0L, 1L); Object[] metrics = row(leafDoubleCol, leafLongCol); diff --git a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestMetadataTableReadableMetrics.java b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestMetadataTableReadableMetrics.java index f65da4574284..9fb754776818 100644 --- a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestMetadataTableReadableMetrics.java +++ b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestMetadataTableReadableMetrics.java @@ -353,7 +353,7 @@ public void testSelectNestedValues() throws Exception { public void testNestedValues() throws Exception { createNestedTable(); - Object[] leafDoubleCol = row(53L, 3L, 1L, 1L, 0.0D, 0.0D); + Object[] leafDoubleCol = row(46L, 3L, 1L, 1L, 0.0D, 0.0D); Object[] leafLongCol = row(54L, 3L, 1L, null, 0L, 1L); Object[] metrics = row(leafDoubleCol, leafLongCol); From f18498463aded08dd38d656e02bdbc99de76ef29 Mon Sep 17 00:00:00 2001 From: DB Tsai Date: Thu, 27 Jul 2023 13:05:48 -0700 Subject: [PATCH 6/9] change data size in metric since the compression codec is changed --- .../TestMetadataTableReadableMetrics.java | 18 +++++++++--------- .../TestMetadataTableReadableMetrics.java | 18 +++++++++--------- .../TestMetadataTableReadableMetrics.java | 18 +++++++++--------- 3 files changed, 27 insertions(+), 27 deletions(-) diff --git a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/TestMetadataTableReadableMetrics.java b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/TestMetadataTableReadableMetrics.java index 87402787276f..f05bf2fcd9e5 100644 --- a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/TestMetadataTableReadableMetrics.java +++ b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/TestMetadataTableReadableMetrics.java @@ -219,27 +219,27 @@ public void testPrimitiveColumns() throws Exception { Row binaryCol = Row.of( - 59L, + 52L, 4L, 2L, null, Base64.getDecoder().decode("1111"), Base64.getDecoder().decode("2222")); - Row booleanCol = Row.of(44L, 4L, 0L, null, false, true); - Row decimalCol = Row.of(97L, 4L, 1L, null, new BigDecimal("1.00"), new BigDecimal("2.00")); - Row doubleCol = Row.of(99L, 4L, 0L, 1L, 1.0D, 2.0D); + Row booleanCol = Row.of(32L, 4L, 0L, null, false, true); + Row decimalCol = Row.of(85L, 4L, 1L, null, new BigDecimal("1.00"), new BigDecimal("2.00")); + Row doubleCol = Row.of(85L, 4L, 0L, 1L, 1.0D, 2.0D); Row fixedCol = Row.of( - 55L, + 44L, 4L, 2L, null, Base64.getDecoder().decode("1111"), Base64.getDecoder().decode("2222")); - Row floatCol = Row.of(90L, 4L, 0L, 2L, 0f, 0f); - Row intCol = Row.of(91L, 4L, 0L, null, 1, 2); - Row longCol = Row.of(91L, 4L, 0L, null, 1L, 2L); - Row stringCol = Row.of(99L, 4L, 0L, null, "1", "2"); + Row floatCol = Row.of(71L, 4L, 0L, 2L, 0f, 0f); + Row intCol = Row.of(71L, 4L, 0L, null, 1, 2); + Row longCol = Row.of(79L, 4L, 0L, null, 1L, 2L); + Row stringCol = Row.of(79L, 4L, 0L, null, "1", "2"); List expected = Lists.newArrayList( diff --git a/flink/v1.16/flink/src/test/java/org/apache/iceberg/flink/source/TestMetadataTableReadableMetrics.java b/flink/v1.16/flink/src/test/java/org/apache/iceberg/flink/source/TestMetadataTableReadableMetrics.java index 87402787276f..f05bf2fcd9e5 100644 --- a/flink/v1.16/flink/src/test/java/org/apache/iceberg/flink/source/TestMetadataTableReadableMetrics.java +++ b/flink/v1.16/flink/src/test/java/org/apache/iceberg/flink/source/TestMetadataTableReadableMetrics.java @@ -219,27 +219,27 @@ public void testPrimitiveColumns() throws Exception { Row binaryCol = Row.of( - 59L, + 52L, 4L, 2L, null, Base64.getDecoder().decode("1111"), Base64.getDecoder().decode("2222")); - Row booleanCol = Row.of(44L, 4L, 0L, null, false, true); - Row decimalCol = Row.of(97L, 4L, 1L, null, new BigDecimal("1.00"), new BigDecimal("2.00")); - Row doubleCol = Row.of(99L, 4L, 0L, 1L, 1.0D, 2.0D); + Row booleanCol = Row.of(32L, 4L, 0L, null, false, true); + Row decimalCol = Row.of(85L, 4L, 1L, null, new BigDecimal("1.00"), new BigDecimal("2.00")); + Row doubleCol = Row.of(85L, 4L, 0L, 1L, 1.0D, 2.0D); Row fixedCol = Row.of( - 55L, + 44L, 4L, 2L, null, Base64.getDecoder().decode("1111"), Base64.getDecoder().decode("2222")); - Row floatCol = Row.of(90L, 4L, 0L, 2L, 0f, 0f); - Row intCol = Row.of(91L, 4L, 0L, null, 1, 2); - Row longCol = Row.of(91L, 4L, 0L, null, 1L, 2L); - Row stringCol = Row.of(99L, 4L, 0L, null, "1", "2"); + Row floatCol = Row.of(71L, 4L, 0L, 2L, 0f, 0f); + Row intCol = Row.of(71L, 4L, 0L, null, 1, 2); + Row longCol = Row.of(79L, 4L, 0L, null, 1L, 2L); + Row stringCol = Row.of(79L, 4L, 0L, null, "1", "2"); List expected = Lists.newArrayList( diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestMetadataTableReadableMetrics.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestMetadataTableReadableMetrics.java index 87402787276f..f05bf2fcd9e5 100644 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestMetadataTableReadableMetrics.java +++ b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestMetadataTableReadableMetrics.java @@ -219,27 +219,27 @@ public void testPrimitiveColumns() throws Exception { Row binaryCol = Row.of( - 59L, + 52L, 4L, 2L, null, Base64.getDecoder().decode("1111"), Base64.getDecoder().decode("2222")); - Row booleanCol = Row.of(44L, 4L, 0L, null, false, true); - Row decimalCol = Row.of(97L, 4L, 1L, null, new BigDecimal("1.00"), new BigDecimal("2.00")); - Row doubleCol = Row.of(99L, 4L, 0L, 1L, 1.0D, 2.0D); + Row booleanCol = Row.of(32L, 4L, 0L, null, false, true); + Row decimalCol = Row.of(85L, 4L, 1L, null, new BigDecimal("1.00"), new BigDecimal("2.00")); + Row doubleCol = Row.of(85L, 4L, 0L, 1L, 1.0D, 2.0D); Row fixedCol = Row.of( - 55L, + 44L, 4L, 2L, null, Base64.getDecoder().decode("1111"), Base64.getDecoder().decode("2222")); - Row floatCol = Row.of(90L, 4L, 0L, 2L, 0f, 0f); - Row intCol = Row.of(91L, 4L, 0L, null, 1, 2); - Row longCol = Row.of(91L, 4L, 0L, null, 1L, 2L); - Row stringCol = Row.of(99L, 4L, 0L, null, "1", "2"); + Row floatCol = Row.of(71L, 4L, 0L, 2L, 0f, 0f); + Row intCol = Row.of(71L, 4L, 0L, null, 1, 2); + Row longCol = Row.of(79L, 4L, 0L, null, 1L, 2L); + Row stringCol = Row.of(79L, 4L, 0L, null, "1", "2"); List expected = Lists.newArrayList( From fbacf2c5870b8b1d53128340605c859f2b437ee3 Mon Sep 17 00:00:00 2001 From: DB Tsai Date: Tue, 1 Aug 2023 10:33:34 -0700 Subject: [PATCH 7/9] addressed feedback --- core/src/main/java/org/apache/iceberg/TableProperties.java | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/core/src/main/java/org/apache/iceberg/TableProperties.java b/core/src/main/java/org/apache/iceberg/TableProperties.java index 962bc851de87..23512a486ef8 100644 --- a/core/src/main/java/org/apache/iceberg/TableProperties.java +++ b/core/src/main/java/org/apache/iceberg/TableProperties.java @@ -147,8 +147,7 @@ private TableProperties() {} public static final String PARQUET_COMPRESSION_LEVEL = "write.parquet.compression-level"; public static final String DELETE_PARQUET_COMPRESSION_LEVEL = "write.delete.parquet.compression-level"; - public static final String PARQUET_COMPRESSION_LEVEL_DEFAULT = - null; // For zstd, it is default to "3" + public static final String PARQUET_COMPRESSION_LEVEL_DEFAULT = null; public static final String PARQUET_ROW_GROUP_CHECK_MIN_RECORD_COUNT = "write.parquet.row-group-check-min-record-count"; From df93fc5a1fab65a5f7583ae9b75babad7e13af61 Mon Sep 17 00:00:00 2001 From: DB Tsai Date: Tue, 1 Aug 2023 10:37:31 -0700 Subject: [PATCH 8/9] addressed feedback --- docs/configuration.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/configuration.md b/docs/configuration.md index 6b4d8c8c00e4..60e026760f6a 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -56,7 +56,7 @@ Iceberg tables support table properties to configure table behavior, like the de | write.parquet.page-row-limit | 20000 | Parquet page row limit | | write.parquet.dict-size-bytes | 2097152 (2 MB) | Parquet dictionary page size | | write.parquet.compression-codec | zstd | Parquet compression codec: zstd, brotli, lz4, gzip, snappy, uncompressed | -| write.parquet.compression-level | null | Parquet compression level (zstd internally uses 3 as default if not set) | +| write.parquet.compression-level | null | Parquet compression level | | write.parquet.bloom-filter-enabled.column.col1 | (not set) | Hint to parquet to write a bloom filter for the column: col1 | | write.parquet.bloom-filter-max-bytes | 1048576 (1 MB) | The maximum number of bytes for a bloom filter bitset | | write.avro.compression-codec | gzip | Avro compression codec: gzip(deflate with 9 level), zstd, snappy, uncompressed | From bda3c0354bfd56acf1cbe943e78d1e52b036898b Mon Sep 17 00:00:00 2001 From: DB Tsai Date: Thu, 3 Aug 2023 16:22:13 -0700 Subject: [PATCH 9/9] address feedback --- .palantir/revapi.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.palantir/revapi.yml b/.palantir/revapi.yml index 86cdf47bd4d0..332b6234d48f 100644 --- a/.palantir/revapi.yml +++ b/.palantir/revapi.yml @@ -778,7 +778,7 @@ acceptedBreaks: - code: "java.field.constantValueChanged" old: "field org.apache.iceberg.TableProperties.PARQUET_COMPRESSION_DEFAULT" new: "field org.apache.iceberg.TableProperties.PARQUET_COMPRESSION_DEFAULT" - justification: "{Changing the default compression codec from gzip to zstd}" + justification: "Changing the default compression codec from gzip to zstd" apache-iceberg-0.14.0: org.apache.iceberg:iceberg-api: - code: "java.class.defaultSerializationChanged"