From da2ad389fd9ba8222f6fb3f57922209c239a7045 Mon Sep 17 00:00:00 2001
From: Anton Okolnychyi
Date: Wed, 20 Sep 2023 14:30:31 -0700
Subject: [PATCH] Core: Don't persist counts for paths and positions in
 position delete files (#8590)

---
 .../java/org/apache/iceberg/MetricsUtil.java  | 30 ++++++++++---
 .../iceberg/deletes/PositionDeleteWriter.java |  6 +--
 .../iceberg/io/TestFileWriterFactory.java     | 44 +++++++++++++++++++
 3 files changed, 71 insertions(+), 9 deletions(-)

diff --git a/core/src/main/java/org/apache/iceberg/MetricsUtil.java b/core/src/main/java/org/apache/iceberg/MetricsUtil.java
index 2cd001b5c46f..2d23121bb0f9 100644
--- a/core/src/main/java/org/apache/iceberg/MetricsUtil.java
+++ b/core/src/main/java/org/apache/iceberg/MetricsUtil.java
@@ -41,18 +41,36 @@ public class MetricsUtil {
   private MetricsUtil() {}
 
   /**
-   * Copies a metrics object without lower and upper bounds for given fields.
+   * Copies a metrics object without value, NULL and NaN counts for given fields.
    *
-   * @param excludedFieldIds field IDs for which the lower and upper bounds must be dropped
+   * @param excludedFieldIds field IDs for which the counts must be dropped
+   * @return a new metrics object without counts for given fields
+   */
+  public static Metrics copyWithoutFieldCounts(Metrics metrics, Set<Integer> excludedFieldIds) {
+    return new Metrics(
+        metrics.recordCount(),
+        metrics.columnSizes(),
+        copyWithoutKeys(metrics.valueCounts(), excludedFieldIds),
+        copyWithoutKeys(metrics.nullValueCounts(), excludedFieldIds),
+        copyWithoutKeys(metrics.nanValueCounts(), excludedFieldIds),
+        metrics.lowerBounds(),
+        metrics.upperBounds());
+  }
+
+  /**
+   * Copies a metrics object without counts and bounds for given fields.
+   *
+   * @param excludedFieldIds field IDs for which the counts and bounds must be dropped
    * @return a new metrics object without lower and upper bounds for given fields
    */
-  public static Metrics copyWithoutFieldBounds(Metrics metrics, Set<Integer> excludedFieldIds) {
+  public static Metrics copyWithoutFieldCountsAndBounds(
+      Metrics metrics, Set<Integer> excludedFieldIds) {
     return new Metrics(
         metrics.recordCount(),
         metrics.columnSizes(),
-        metrics.valueCounts(),
-        metrics.nullValueCounts(),
-        metrics.nanValueCounts(),
+        copyWithoutKeys(metrics.valueCounts(), excludedFieldIds),
+        copyWithoutKeys(metrics.nullValueCounts(), excludedFieldIds),
+        copyWithoutKeys(metrics.nanValueCounts(), excludedFieldIds),
         copyWithoutKeys(metrics.lowerBounds(), excludedFieldIds),
         copyWithoutKeys(metrics.upperBounds(), excludedFieldIds));
   }
diff --git a/core/src/main/java/org/apache/iceberg/deletes/PositionDeleteWriter.java b/core/src/main/java/org/apache/iceberg/deletes/PositionDeleteWriter.java
index 4f799b434993..c8193755f5ba 100644
--- a/core/src/main/java/org/apache/iceberg/deletes/PositionDeleteWriter.java
+++ b/core/src/main/java/org/apache/iceberg/deletes/PositionDeleteWriter.java
@@ -47,7 +47,7 @@
  * records, consider using {@link SortingPositionOnlyDeleteWriter} instead.
  */
 public class PositionDeleteWriter<T> implements FileWriter<PositionDelete<T>, DeleteWriteResult> {
-  private static final Set<Integer> SINGLE_REFERENCED_FILE_BOUNDS_ONLY =
+  private static final Set<Integer> FILE_AND_POS_FIELD_IDS =
       ImmutableSet.of(DELETE_FILE_PATH.fieldId(), DELETE_FILE_POS.fieldId());
 
   private final FileAppender<StructLike> appender;
@@ -121,9 +121,9 @@ public DeleteWriteResult result() {
   private Metrics metrics() {
     Metrics metrics = appender.metrics();
     if (referencedDataFiles.size() > 1) {
-      return MetricsUtil.copyWithoutFieldBounds(metrics, SINGLE_REFERENCED_FILE_BOUNDS_ONLY);
+      return MetricsUtil.copyWithoutFieldCountsAndBounds(metrics, FILE_AND_POS_FIELD_IDS);
     } else {
-      return metrics;
+      return MetricsUtil.copyWithoutFieldCounts(metrics, FILE_AND_POS_FIELD_IDS);
     }
   }
 }
diff --git a/data/src/test/java/org/apache/iceberg/io/TestFileWriterFactory.java b/data/src/test/java/org/apache/iceberg/io/TestFileWriterFactory.java
index 7910c666b45d..e25a179edbfc 100644
--- a/data/src/test/java/org/apache/iceberg/io/TestFileWriterFactory.java
+++ b/data/src/test/java/org/apache/iceberg/io/TestFileWriterFactory.java
@@ -46,6 +46,7 @@
 import org.apache.iceberg.parquet.Parquet;
 import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList;
 import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
+import org.apache.iceberg.types.Types;
 import org.apache.iceberg.util.CharSequenceSet;
 import org.apache.iceberg.util.Pair;
 import org.apache.iceberg.util.StructLikeSet;
@@ -232,14 +233,20 @@ public void testPositionDeleteWriter() throws IOException {
     if (fileFormat == FileFormat.AVRO) {
       Assert.assertNull(deleteFile.lowerBounds());
       Assert.assertNull(deleteFile.upperBounds());
+      Assert.assertNull(deleteFile.columnSizes());
     } else {
       Assert.assertEquals(1, referencedDataFiles.size());
       Assert.assertEquals(2, deleteFile.lowerBounds().size());
       Assert.assertTrue(deleteFile.lowerBounds().containsKey(DELETE_FILE_PATH.fieldId()));
       Assert.assertEquals(2, deleteFile.upperBounds().size());
       Assert.assertTrue(deleteFile.upperBounds().containsKey(DELETE_FILE_PATH.fieldId()));
+      Assert.assertEquals(2, deleteFile.columnSizes().size());
     }
 
+    Assert.assertNull(deleteFile.valueCounts());
+    Assert.assertNull(deleteFile.nullValueCounts());
+    Assert.assertNull(deleteFile.nanValueCounts());
+
     // verify the written delete file
     GenericRecord deleteRecord = GenericRecord.create(DeleteSchemaUtil.pathPosSchema());
     List<Record> expectedDeletes =
@@ -281,6 +288,34 @@ public void testPositionDeleteWriterWithRow() throws IOException {
     DeleteFile deleteFile = result.first();
     CharSequenceSet referencedDataFiles = result.second();
 
+    if (fileFormat == FileFormat.AVRO) {
+      Assert.assertNull(deleteFile.lowerBounds());
+      Assert.assertNull(deleteFile.upperBounds());
+      Assert.assertNull(deleteFile.columnSizes());
+      Assert.assertNull(deleteFile.valueCounts());
+      Assert.assertNull(deleteFile.nullValueCounts());
+      Assert.assertNull(deleteFile.nanValueCounts());
+    } else {
+      Assert.assertEquals(1, referencedDataFiles.size());
+      Assert.assertEquals(4, deleteFile.lowerBounds().size());
+      Assert.assertTrue(deleteFile.lowerBounds().containsKey(DELETE_FILE_PATH.fieldId()));
+      Assert.assertTrue(deleteFile.lowerBounds().containsKey(DELETE_FILE_POS.fieldId()));
+      for (Types.NestedField column : table.schema().columns()) {
+        Assert.assertTrue(deleteFile.lowerBounds().containsKey(column.fieldId()));
+      }
+      Assert.assertEquals(4, deleteFile.upperBounds().size());
+      Assert.assertTrue(deleteFile.upperBounds().containsKey(DELETE_FILE_PATH.fieldId()));
+      Assert.assertTrue(deleteFile.upperBounds().containsKey(DELETE_FILE_POS.fieldId()));
+      for (Types.NestedField column : table.schema().columns()) {
+        Assert.assertTrue(deleteFile.upperBounds().containsKey(column.fieldId()));
+      }
+      // ORC also contains metrics for the deleted row struct, not just actual data fields
+      Assert.assertTrue(deleteFile.columnSizes().size() >= 4);
+      Assert.assertTrue(deleteFile.valueCounts().size() >= 2);
+      Assert.assertTrue(deleteFile.nullValueCounts().size() >= 2);
+      Assert.assertNull(deleteFile.nanValueCounts());
+    }
+
     // verify the written delete file
     GenericRecord deletedRow = GenericRecord.create(table.schema());
     Schema positionDeleteSchema = DeleteSchemaUtil.posDeleteSchema(table.schema());
@@ -336,6 +371,15 @@ public void testPositionDeleteWriterMultipleDataFiles() throws IOException {
     Assert.assertEquals(2, referencedDataFiles.size());
     Assert.assertNull(deleteFile.lowerBounds());
     Assert.assertNull(deleteFile.upperBounds());
+    Assert.assertNull(deleteFile.valueCounts());
+    Assert.assertNull(deleteFile.nullValueCounts());
+    Assert.assertNull(deleteFile.nanValueCounts());
+
+    if (fileFormat == FileFormat.AVRO) {
+      Assert.assertNull(deleteFile.columnSizes());
+    } else {
+      Assert.assertEquals(2, deleteFile.columnSizes().size());
+    }
 
     // commit the data and delete files
     table
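For reviewers who want to see the two helpers side by side, the sketch below is a minimal, hypothetical usage example and is not part of the patch: the PositionDeleteMetricsSketch class and all metric values in it are made up, while MetricsUtil, Metrics, and the MetadataColumns.DELETE_FILE_PATH / DELETE_FILE_POS constants are the existing Iceberg APIs the change builds on. It mirrors what PositionDeleteWriter#metrics() now does: with a single referenced data file only the path/pos counts are dropped, and with several referenced data files both the counts and the bounds are dropped.

// Hypothetical illustration only; not part of the commit above.
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.util.Map;
import java.util.Set;
import org.apache.iceberg.MetadataColumns;
import org.apache.iceberg.Metrics;
import org.apache.iceberg.MetricsUtil;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet;

public class PositionDeleteMetricsSketch {

  public static void main(String[] args) {
    int pathId = MetadataColumns.DELETE_FILE_PATH.fieldId();
    int posId = MetadataColumns.DELETE_FILE_POS.fieldId();
    Set<Integer> fileAndPosFieldIds = ImmutableSet.of(pathId, posId);

    // Metrics a position delete file might carry for its file_path and pos columns
    // (record count, counts, and bounds below are made-up values).
    Map<Integer, Long> counts = ImmutableMap.of(pathId, 100L, posId, 100L);
    Map<Integer, ByteBuffer> bounds =
        ImmutableMap.of(
            pathId,
            ByteBuffer.wrap("s3://bucket/data/file-a.parquet".getBytes(StandardCharsets.UTF_8)),
            posId,
            ByteBuffer.allocate(8));
    Metrics metrics =
        new Metrics(100L, counts, counts, ImmutableMap.of(), ImmutableMap.of(), bounds, bounds);

    // Single referenced data file: drop the path/pos counts but keep their bounds,
    // which still identify the data file the deletes apply to.
    Metrics singleFile = MetricsUtil.copyWithoutFieldCounts(metrics, fileAndPosFieldIds);
    System.out.println("value counts: " + singleFile.valueCounts());
    System.out.println("lower bounds: " + singleFile.lowerBounds());

    // Multiple referenced data files: drop both the counts and the bounds for path/pos.
    Metrics multiFile = MetricsUtil.copyWithoutFieldCountsAndBounds(metrics, fileAndPosFieldIds);
    System.out.println("value counts: " + multiFile.valueCounts());
    System.out.println("lower bounds: " + multiFile.lowerBounds());
  }
}

In both cases the record count and column sizes are left untouched, which is consistent with the columnSizes assertions added to TestFileWriterFactory above.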