From cf80a839f8ea487cb2030f88cfcf685ab0e92ab8 Mon Sep 17 00:00:00 2001 From: Tomohiro Tanaka Date: Thu, 16 Nov 2023 20:36:09 +0900 Subject: [PATCH 1/2] Docs: Nit: Fix parquet default codec --- docs/configuration.md | 90 +++++++++++++++++++++---------------------- 1 file changed, 45 insertions(+), 45 deletions(-) diff --git a/docs/configuration.md b/docs/configuration.md index e364b9c939f4..2da8ac082314 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -47,51 +47,51 @@ Iceberg tables support table properties to configure table behavior, like the de ### Write properties -| Property | Default | Description | -|------------------------------------------------------|-----------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| write.format.default | parquet | Default file format for the table; parquet, avro, or orc | -| write.delete.format.default | data file format | Default delete file format for the table; parquet, avro, or orc | -| write.parquet.row-group-size-bytes | 134217728 (128 MB) | Parquet row group size | -| write.parquet.page-size-bytes | 1048576 (1 MB) | Parquet page size | -| write.parquet.page-row-limit | 20000 | Parquet page row limit | -| write.parquet.dict-size-bytes | 2097152 (2 MB) | Parquet dictionary page size | -| write.parquet.compression-codec | gzip | Parquet compression codec: zstd, brotli, lz4, gzip, snappy, uncompressed | -| write.parquet.compression-level | null | Parquet compression level | -| write.parquet.bloom-filter-enabled.column.col1 | (not set) | Hint to parquet to write a bloom filter for the column: col1 | -| write.parquet.bloom-filter-max-bytes | 1048576 (1 MB) | The maximum number of bytes for a bloom filter bitset | -| write.avro.compression-codec | gzip | Avro compression codec: gzip(deflate with 9 level), zstd, snappy, uncompressed | -| write.avro.compression-level | null | Avro compression level | -| write.orc.stripe-size-bytes | 67108864 (64 MB) | Define the default ORC stripe size, in bytes | -| write.orc.block-size-bytes | 268435456 (256 MB) | Define the default file system block size for ORC files | -| write.orc.compression-codec | zlib | ORC compression codec: zstd, lz4, lzo, zlib, snappy, none | -| write.orc.compression-strategy | speed | ORC compression strategy: speed, compression | -| write.orc.bloom.filter.columns | (not set) | Comma separated list of column names for which a Bloom filter must be created | -| write.orc.bloom.filter.fpp | 0.05 | False positive probability for Bloom filter (must > 0.0 and < 1.0) | -| write.location-provider.impl | null | Optional custom implementation for LocationProvider | -| write.metadata.compression-codec | none | Metadata compression codec; none or gzip | -| write.metadata.metrics.max-inferred-column-defaults | 100 | Defines the maximum number of columns for which metrics are collected | -| write.metadata.metrics.default | truncate(16) | Default metrics mode for all columns in the table; none, counts, truncate(length), or full | -| write.metadata.metrics.column.col1 | (not set) | Metrics mode for column 'col1' to allow per-column tuning; none, counts, truncate(length), or full | -| write.target-file-size-bytes | 536870912 (512 MB) | Controls the size of files generated to target about this many bytes | -| write.delete.target-file-size-bytes | 67108864 (64 MB) | Controls the size of delete files generated to target about this many bytes | -| write.distribution-mode | none | Defines distribution of write data: __none__: don't shuffle rows; __hash__: hash distribute by partition key ; __range__: range distribute by partition key or sort key if table has an SortOrder | -| write.delete.distribution-mode | hash | Defines distribution of write delete data | -| write.update.distribution-mode | hash | Defines distribution of write update data | -| write.merge.distribution-mode | none | Defines distribution of write merge data | -| write.wap.enabled | false | Enables write-audit-publish writes | -| write.summary.partition-limit | 0 | Includes partition-level summary stats in snapshot summaries if the changed partition count is less than this limit | -| write.metadata.delete-after-commit.enabled | false | Controls whether to delete the oldest **tracked** version metadata files after commit | -| write.metadata.previous-versions-max | 100 | The max number of previous version metadata files to keep before deleting after commit | -| write.spark.fanout.enabled | false | Enables the fanout writer in Spark that does not require data to be clustered; uses more memory | -| write.object-storage.enabled | false | Enables the object storage location provider that adds a hash component to file paths | -| write.data.path | table location + /data | Base location for data files | -| write.metadata.path | table location + /metadata | Base location for metadata files | -| write.delete.mode | copy-on-write | Mode used for delete commands: copy-on-write or merge-on-read (v2 only) | -| write.delete.isolation-level | serializable | Isolation level for delete commands: serializable or snapshot | -| write.update.mode | copy-on-write | Mode used for update commands: copy-on-write or merge-on-read (v2 only) | -| write.update.isolation-level | serializable | Isolation level for update commands: serializable or snapshot | -| write.merge.mode | copy-on-write | Mode used for merge commands: copy-on-write or merge-on-read (v2 only) | -| write.merge.isolation-level | serializable | Isolation level for merge commands: serializable or snapshot | +| Property | Default | Description | +|------------------------------------------------------|----------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| write.format.default | parquet | Default file format for the table; parquet, avro, or orc | +| write.delete.format.default | data file format | Default delete file format for the table; parquet, avro, or orc | +| write.parquet.row-group-size-bytes | 134217728 (128 MB) | Parquet row group size | +| write.parquet.page-size-bytes | 1048576 (1 MB) | Parquet page size | +| write.parquet.page-row-limit | 20000 | Parquet page row limit | +| write.parquet.dict-size-bytes | 2097152 (2 MB) | Parquet dictionary page size | +| write.parquet.compression-codec | zstd | Parquet compression codec: zstd, brotli, lz4, gzip, snappy, uncompressed | +| write.parquet.compression-level | null | Parquet compression level | +| write.parquet.bloom-filter-enabled.column.col1 | (not set) | Hint to parquet to write a bloom filter for the column: col1 | +| write.parquet.bloom-filter-max-bytes | 1048576 (1 MB) | The maximum number of bytes for a bloom filter bitset | +| write.avro.compression-codec | gzip | Avro compression codec: gzip(deflate with 9 level), zstd, snappy, uncompressed | +| write.avro.compression-level | null | Avro compression level | +| write.orc.stripe-size-bytes | 67108864 (64 MB) | Define the default ORC stripe size, in bytes | +| write.orc.block-size-bytes | 268435456 (256 MB) | Define the default file system block size for ORC files | +| write.orc.compression-codec | zlib | ORC compression codec: zstd, lz4, lzo, zlib, snappy, none | +| write.orc.compression-strategy | speed | ORC compression strategy: speed, compression | +| write.orc.bloom.filter.columns | (not set) | Comma separated list of column names for which a Bloom filter must be created | +| write.orc.bloom.filter.fpp | 0.05 | False positive probability for Bloom filter (must > 0.0 and < 1.0) | +| write.location-provider.impl | null | Optional custom implementation for LocationProvider | +| write.metadata.compression-codec | none | Metadata compression codec; none or gzip | +| write.metadata.metrics.max-inferred-column-defaults | 100 | Defines the maximum number of columns for which metrics are collected | +| write.metadata.metrics.default | truncate(16) | Default metrics mode for all columns in the table; none, counts, truncate(length), or full | +| write.metadata.metrics.column.col1 | (not set) | Metrics mode for column 'col1' to allow per-column tuning; none, counts, truncate(length), or full | +| write.target-file-size-bytes | 536870912 (512 MB) | Controls the size of files generated to target about this many bytes | +| write.delete.target-file-size-bytes | 67108864 (64 MB) | Controls the size of delete files generated to target about this many bytes | +| write.distribution-mode | none | Defines distribution of write data: __none__: don't shuffle rows; __hash__: hash distribute by partition key ; __range__: range distribute by partition key or sort key if table has an SortOrder | +| write.delete.distribution-mode | hash | Defines distribution of write delete data | +| write.update.distribution-mode | hash | Defines distribution of write update data | +| write.merge.distribution-mode | none | Defines distribution of write merge data | +| write.wap.enabled | false | Enables write-audit-publish writes | +| write.summary.partition-limit | 0 | Includes partition-level summary stats in snapshot summaries if the changed partition count is less than this limit | +| write.metadata.delete-after-commit.enabled | false | Controls whether to delete the oldest **tracked** version metadata files after commit | +| write.metadata.previous-versions-max | 100 | The max number of previous version metadata files to keep before deleting after commit | +| write.spark.fanout.enabled | false | Enables the fanout writer in Spark that does not require data to be clustered; uses more memory | +| write.object-storage.enabled | false | Enables the object storage location provider that adds a hash component to file paths | +| write.data.path | table location + /data | Base location for data files | +| write.metadata.path | table location + /metadata | Base location for metadata files | +| write.delete.mode | copy-on-write | Mode used for delete commands: copy-on-write or merge-on-read (v2 only) | +| write.delete.isolation-level | serializable | Isolation level for delete commands: serializable or snapshot | +| write.update.mode | copy-on-write | Mode used for update commands: copy-on-write or merge-on-read (v2 only) | +| write.update.isolation-level | serializable | Isolation level for update commands: serializable or snapshot | +| write.merge.mode | copy-on-write | Mode used for merge commands: copy-on-write or merge-on-read (v2 only) | +| write.merge.isolation-level | serializable | Isolation level for merge commands: serializable or snapshot | ### Table behavior properties From d916ecd1fcc2bdc50c262a9a24ba70657c6b29c2 Mon Sep 17 00:00:00 2001 From: Tomohiro Tanaka Date: Fri, 17 Nov 2023 00:19:30 +0900 Subject: [PATCH 2/2] Remove unnecessary spaces --- docs/configuration.md | 90 +++++++++++++++++++++---------------------- 1 file changed, 45 insertions(+), 45 deletions(-) diff --git a/docs/configuration.md b/docs/configuration.md index 2da8ac082314..ead5c763ab2b 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -47,51 +47,51 @@ Iceberg tables support table properties to configure table behavior, like the de ### Write properties -| Property | Default | Description | -|------------------------------------------------------|----------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| write.format.default | parquet | Default file format for the table; parquet, avro, or orc | -| write.delete.format.default | data file format | Default delete file format for the table; parquet, avro, or orc | -| write.parquet.row-group-size-bytes | 134217728 (128 MB) | Parquet row group size | -| write.parquet.page-size-bytes | 1048576 (1 MB) | Parquet page size | -| write.parquet.page-row-limit | 20000 | Parquet page row limit | -| write.parquet.dict-size-bytes | 2097152 (2 MB) | Parquet dictionary page size | -| write.parquet.compression-codec | zstd | Parquet compression codec: zstd, brotli, lz4, gzip, snappy, uncompressed | -| write.parquet.compression-level | null | Parquet compression level | -| write.parquet.bloom-filter-enabled.column.col1 | (not set) | Hint to parquet to write a bloom filter for the column: col1 | -| write.parquet.bloom-filter-max-bytes | 1048576 (1 MB) | The maximum number of bytes for a bloom filter bitset | -| write.avro.compression-codec | gzip | Avro compression codec: gzip(deflate with 9 level), zstd, snappy, uncompressed | -| write.avro.compression-level | null | Avro compression level | -| write.orc.stripe-size-bytes | 67108864 (64 MB) | Define the default ORC stripe size, in bytes | -| write.orc.block-size-bytes | 268435456 (256 MB) | Define the default file system block size for ORC files | -| write.orc.compression-codec | zlib | ORC compression codec: zstd, lz4, lzo, zlib, snappy, none | -| write.orc.compression-strategy | speed | ORC compression strategy: speed, compression | -| write.orc.bloom.filter.columns | (not set) | Comma separated list of column names for which a Bloom filter must be created | -| write.orc.bloom.filter.fpp | 0.05 | False positive probability for Bloom filter (must > 0.0 and < 1.0) | -| write.location-provider.impl | null | Optional custom implementation for LocationProvider | -| write.metadata.compression-codec | none | Metadata compression codec; none or gzip | -| write.metadata.metrics.max-inferred-column-defaults | 100 | Defines the maximum number of columns for which metrics are collected | -| write.metadata.metrics.default | truncate(16) | Default metrics mode for all columns in the table; none, counts, truncate(length), or full | -| write.metadata.metrics.column.col1 | (not set) | Metrics mode for column 'col1' to allow per-column tuning; none, counts, truncate(length), or full | -| write.target-file-size-bytes | 536870912 (512 MB) | Controls the size of files generated to target about this many bytes | -| write.delete.target-file-size-bytes | 67108864 (64 MB) | Controls the size of delete files generated to target about this many bytes | -| write.distribution-mode | none | Defines distribution of write data: __none__: don't shuffle rows; __hash__: hash distribute by partition key ; __range__: range distribute by partition key or sort key if table has an SortOrder | -| write.delete.distribution-mode | hash | Defines distribution of write delete data | -| write.update.distribution-mode | hash | Defines distribution of write update data | -| write.merge.distribution-mode | none | Defines distribution of write merge data | -| write.wap.enabled | false | Enables write-audit-publish writes | -| write.summary.partition-limit | 0 | Includes partition-level summary stats in snapshot summaries if the changed partition count is less than this limit | -| write.metadata.delete-after-commit.enabled | false | Controls whether to delete the oldest **tracked** version metadata files after commit | -| write.metadata.previous-versions-max | 100 | The max number of previous version metadata files to keep before deleting after commit | -| write.spark.fanout.enabled | false | Enables the fanout writer in Spark that does not require data to be clustered; uses more memory | -| write.object-storage.enabled | false | Enables the object storage location provider that adds a hash component to file paths | -| write.data.path | table location + /data | Base location for data files | -| write.metadata.path | table location + /metadata | Base location for metadata files | -| write.delete.mode | copy-on-write | Mode used for delete commands: copy-on-write or merge-on-read (v2 only) | -| write.delete.isolation-level | serializable | Isolation level for delete commands: serializable or snapshot | -| write.update.mode | copy-on-write | Mode used for update commands: copy-on-write or merge-on-read (v2 only) | -| write.update.isolation-level | serializable | Isolation level for update commands: serializable or snapshot | -| write.merge.mode | copy-on-write | Mode used for merge commands: copy-on-write or merge-on-read (v2 only) | -| write.merge.isolation-level | serializable | Isolation level for merge commands: serializable or snapshot | +| Property | Default | Description | +|------------------------------------------------------|-----------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| write.format.default | parquet | Default file format for the table; parquet, avro, or orc | +| write.delete.format.default | data file format | Default delete file format for the table; parquet, avro, or orc | +| write.parquet.row-group-size-bytes | 134217728 (128 MB) | Parquet row group size | +| write.parquet.page-size-bytes | 1048576 (1 MB) | Parquet page size | +| write.parquet.page-row-limit | 20000 | Parquet page row limit | +| write.parquet.dict-size-bytes | 2097152 (2 MB) | Parquet dictionary page size | +| write.parquet.compression-codec | zstd | Parquet compression codec: zstd, brotli, lz4, gzip, snappy, uncompressed | +| write.parquet.compression-level | null | Parquet compression level | +| write.parquet.bloom-filter-enabled.column.col1 | (not set) | Hint to parquet to write a bloom filter for the column: col1 | +| write.parquet.bloom-filter-max-bytes | 1048576 (1 MB) | The maximum number of bytes for a bloom filter bitset | +| write.avro.compression-codec | gzip | Avro compression codec: gzip(deflate with 9 level), zstd, snappy, uncompressed | +| write.avro.compression-level | null | Avro compression level | +| write.orc.stripe-size-bytes | 67108864 (64 MB) | Define the default ORC stripe size, in bytes | +| write.orc.block-size-bytes | 268435456 (256 MB) | Define the default file system block size for ORC files | +| write.orc.compression-codec | zlib | ORC compression codec: zstd, lz4, lzo, zlib, snappy, none | +| write.orc.compression-strategy | speed | ORC compression strategy: speed, compression | +| write.orc.bloom.filter.columns | (not set) | Comma separated list of column names for which a Bloom filter must be created | +| write.orc.bloom.filter.fpp | 0.05 | False positive probability for Bloom filter (must > 0.0 and < 1.0) | +| write.location-provider.impl | null | Optional custom implementation for LocationProvider | +| write.metadata.compression-codec | none | Metadata compression codec; none or gzip | +| write.metadata.metrics.max-inferred-column-defaults | 100 | Defines the maximum number of columns for which metrics are collected | +| write.metadata.metrics.default | truncate(16) | Default metrics mode for all columns in the table; none, counts, truncate(length), or full | +| write.metadata.metrics.column.col1 | (not set) | Metrics mode for column 'col1' to allow per-column tuning; none, counts, truncate(length), or full | +| write.target-file-size-bytes | 536870912 (512 MB) | Controls the size of files generated to target about this many bytes | +| write.delete.target-file-size-bytes | 67108864 (64 MB) | Controls the size of delete files generated to target about this many bytes | +| write.distribution-mode | none | Defines distribution of write data: __none__: don't shuffle rows; __hash__: hash distribute by partition key ; __range__: range distribute by partition key or sort key if table has an SortOrder | +| write.delete.distribution-mode | hash | Defines distribution of write delete data | +| write.update.distribution-mode | hash | Defines distribution of write update data | +| write.merge.distribution-mode | none | Defines distribution of write merge data | +| write.wap.enabled | false | Enables write-audit-publish writes | +| write.summary.partition-limit | 0 | Includes partition-level summary stats in snapshot summaries if the changed partition count is less than this limit | +| write.metadata.delete-after-commit.enabled | false | Controls whether to delete the oldest **tracked** version metadata files after commit | +| write.metadata.previous-versions-max | 100 | The max number of previous version metadata files to keep before deleting after commit | +| write.spark.fanout.enabled | false | Enables the fanout writer in Spark that does not require data to be clustered; uses more memory | +| write.object-storage.enabled | false | Enables the object storage location provider that adds a hash component to file paths | +| write.data.path | table location + /data | Base location for data files | +| write.metadata.path | table location + /metadata | Base location for metadata files | +| write.delete.mode | copy-on-write | Mode used for delete commands: copy-on-write or merge-on-read (v2 only) | +| write.delete.isolation-level | serializable | Isolation level for delete commands: serializable or snapshot | +| write.update.mode | copy-on-write | Mode used for update commands: copy-on-write or merge-on-read (v2 only) | +| write.update.isolation-level | serializable | Isolation level for update commands: serializable or snapshot | +| write.merge.mode | copy-on-write | Mode used for merge commands: copy-on-write or merge-on-read (v2 only) | +| write.merge.isolation-level | serializable | Isolation level for merge commands: serializable or snapshot | ### Table behavior properties