Core: Don't persist counts for paths and positions in position delete files (#8590)
aokolnychyi authored Sep 20, 2023
1 parent 0d79372 commit da2ad38
Showing 3 changed files with 71 additions and 9 deletions.
30 changes: 24 additions & 6 deletions core/src/main/java/org/apache/iceberg/MetricsUtil.java
@@ -41,18 +41,36 @@ public class MetricsUtil {
   private MetricsUtil() {}
 
   /**
-   * Copies a metrics object without lower and upper bounds for given fields.
+   * Copies a metrics object without value, NULL and NaN counts for given fields.
    *
-   * @param excludedFieldIds field IDs for which the lower and upper bounds must be dropped
+   * @param excludedFieldIds field IDs for which the counts must be dropped
+   * @return a new metrics object without counts for given fields
+   */
+  public static Metrics copyWithoutFieldCounts(Metrics metrics, Set<Integer> excludedFieldIds) {
+    return new Metrics(
+        metrics.recordCount(),
+        metrics.columnSizes(),
+        copyWithoutKeys(metrics.valueCounts(), excludedFieldIds),
+        copyWithoutKeys(metrics.nullValueCounts(), excludedFieldIds),
+        copyWithoutKeys(metrics.nanValueCounts(), excludedFieldIds),
+        metrics.lowerBounds(),
+        metrics.upperBounds());
+  }
+
+  /**
+   * Copies a metrics object without counts and bounds for given fields.
+   *
+   * @param excludedFieldIds field IDs for which the counts and bounds must be dropped
    * @return a new metrics object without lower and upper bounds for given fields
    */
-  public static Metrics copyWithoutFieldBounds(Metrics metrics, Set<Integer> excludedFieldIds) {
+  public static Metrics copyWithoutFieldCountsAndBounds(
+      Metrics metrics, Set<Integer> excludedFieldIds) {
     return new Metrics(
         metrics.recordCount(),
         metrics.columnSizes(),
-        metrics.valueCounts(),
-        metrics.nullValueCounts(),
-        metrics.nanValueCounts(),
+        copyWithoutKeys(metrics.valueCounts(), excludedFieldIds),
+        copyWithoutKeys(metrics.nullValueCounts(), excludedFieldIds),
+        copyWithoutKeys(metrics.nanValueCounts(), excludedFieldIds),
         copyWithoutKeys(metrics.lowerBounds(), excludedFieldIds),
         copyWithoutKeys(metrics.upperBounds(), excludedFieldIds));
   }
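For context, a minimal usage sketch of the two helpers above. It is not part of the commit: the MetricsCopyExample class, the metric values, and the file path are hypothetical, while the Metrics constructor, MetadataColumns field IDs, and MetricsUtil methods are the ones shown in the diff. copyWithoutFieldCounts drops the value, NULL, and NaN counts for the excluded fields but keeps their bounds; copyWithoutFieldCountsAndBounds drops the bounds as well.

import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.util.Set;
import org.apache.iceberg.MetadataColumns;
import org.apache.iceberg.Metrics;
import org.apache.iceberg.MetricsUtil;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet;

public class MetricsCopyExample {
  public static void main(String[] args) {
    int pathId = MetadataColumns.DELETE_FILE_PATH.fieldId();
    int posId = MetadataColumns.DELETE_FILE_POS.fieldId();
    Set<Integer> pathAndPos = ImmutableSet.of(pathId, posId);

    // Hypothetical path bound for a delete file that references a single data file.
    ByteBuffer path =
        ByteBuffer.wrap("s3://bucket/data/file-a.parquet".getBytes(StandardCharsets.UTF_8));

    // Hypothetical metrics for a position delete file with 100 deleted positions.
    Metrics metrics =
        new Metrics(
            100L, // record count
            ImmutableMap.of(pathId, 1_000L, posId, 400L), // column sizes
            ImmutableMap.of(pathId, 100L, posId, 100L), // value counts
            ImmutableMap.of(pathId, 0L, posId, 0L), // null value counts
            ImmutableMap.of(), // NaN value counts
            ImmutableMap.of(pathId, path), // lower bounds
            ImmutableMap.of(pathId, path)); // upper bounds

    // Single referenced data file: keep the path/pos bounds (they identify the
    // referenced file) but drop the counts, which carry no useful information.
    Metrics forSingleFile = MetricsUtil.copyWithoutFieldCounts(metrics, pathAndPos);

    // Multiple referenced data files: drop both counts and bounds for path/pos.
    Metrics forManyFiles = MetricsUtil.copyWithoutFieldCountsAndBounds(metrics, pathAndPos);

    System.out.println(forSingleFile.lowerBounds()); // still contains the path bound
    System.out.println(forManyFiles.lowerBounds()); // path bound removed
  }
}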
6 changes: 3 additions & 3 deletions core/src/main/java/org/apache/iceberg/deletes/PositionDeleteWriter.java
@@ -47,7 +47,7 @@
  * records, consider using {@link SortingPositionOnlyDeleteWriter} instead.
  */
 public class PositionDeleteWriter<T> implements FileWriter<PositionDelete<T>, DeleteWriteResult> {
-  private static final Set<Integer> SINGLE_REFERENCED_FILE_BOUNDS_ONLY =
+  private static final Set<Integer> FILE_AND_POS_FIELD_IDS =
       ImmutableSet.of(DELETE_FILE_PATH.fieldId(), DELETE_FILE_POS.fieldId());
 
   private final FileAppender<StructLike> appender;
@@ -121,9 +121,9 @@ public DeleteWriteResult result() {
   private Metrics metrics() {
     Metrics metrics = appender.metrics();
     if (referencedDataFiles.size() > 1) {
-      return MetricsUtil.copyWithoutFieldBounds(metrics, SINGLE_REFERENCED_FILE_BOUNDS_ONLY);
+      return MetricsUtil.copyWithoutFieldCountsAndBounds(metrics, FILE_AND_POS_FIELD_IDS);
     } else {
-      return metrics;
+      return MetricsUtil.copyWithoutFieldCounts(metrics, FILE_AND_POS_FIELD_IDS);
     }
   }
 }
44 changes: 44 additions & 0 deletions (position delete writer test file)
@@ -46,6 +46,7 @@
 import org.apache.iceberg.parquet.Parquet;
 import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList;
 import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
+import org.apache.iceberg.types.Types;
 import org.apache.iceberg.util.CharSequenceSet;
 import org.apache.iceberg.util.Pair;
 import org.apache.iceberg.util.StructLikeSet;
@@ -232,14 +233,20 @@ public void testPositionDeleteWriter() throws IOException {
     if (fileFormat == FileFormat.AVRO) {
       Assert.assertNull(deleteFile.lowerBounds());
       Assert.assertNull(deleteFile.upperBounds());
+      Assert.assertNull(deleteFile.columnSizes());
     } else {
       Assert.assertEquals(1, referencedDataFiles.size());
       Assert.assertEquals(2, deleteFile.lowerBounds().size());
       Assert.assertTrue(deleteFile.lowerBounds().containsKey(DELETE_FILE_PATH.fieldId()));
       Assert.assertEquals(2, deleteFile.upperBounds().size());
       Assert.assertTrue(deleteFile.upperBounds().containsKey(DELETE_FILE_PATH.fieldId()));
+      Assert.assertEquals(2, deleteFile.columnSizes().size());
     }
 
+    Assert.assertNull(deleteFile.valueCounts());
+    Assert.assertNull(deleteFile.nullValueCounts());
+    Assert.assertNull(deleteFile.nanValueCounts());
+
     // verify the written delete file
     GenericRecord deleteRecord = GenericRecord.create(DeleteSchemaUtil.pathPosSchema());
     List<Record> expectedDeletes =
@@ -281,6 +288,34 @@ public void testPositionDeleteWriterWithRow() throws IOException {
     DeleteFile deleteFile = result.first();
     CharSequenceSet referencedDataFiles = result.second();
 
+    if (fileFormat == FileFormat.AVRO) {
+      Assert.assertNull(deleteFile.lowerBounds());
+      Assert.assertNull(deleteFile.upperBounds());
+      Assert.assertNull(deleteFile.columnSizes());
+      Assert.assertNull(deleteFile.valueCounts());
+      Assert.assertNull(deleteFile.nullValueCounts());
+      Assert.assertNull(deleteFile.nanValueCounts());
+    } else {
+      Assert.assertEquals(1, referencedDataFiles.size());
+      Assert.assertEquals(4, deleteFile.lowerBounds().size());
+      Assert.assertTrue(deleteFile.lowerBounds().containsKey(DELETE_FILE_PATH.fieldId()));
+      Assert.assertTrue(deleteFile.lowerBounds().containsKey(DELETE_FILE_POS.fieldId()));
+      for (Types.NestedField column : table.schema().columns()) {
+        Assert.assertTrue(deleteFile.lowerBounds().containsKey(column.fieldId()));
+      }
+      Assert.assertEquals(4, deleteFile.upperBounds().size());
+      Assert.assertTrue(deleteFile.upperBounds().containsKey(DELETE_FILE_PATH.fieldId()));
+      Assert.assertTrue(deleteFile.upperBounds().containsKey(DELETE_FILE_POS.fieldId()));
+      for (Types.NestedField column : table.schema().columns()) {
+        Assert.assertTrue(deleteFile.upperBounds().containsKey(column.fieldId()));
+      }
+      // ORC also contains metrics for the deleted row struct, not just actual data fields
+      Assert.assertTrue(deleteFile.columnSizes().size() >= 4);
+      Assert.assertTrue(deleteFile.valueCounts().size() >= 2);
+      Assert.assertTrue(deleteFile.nullValueCounts().size() >= 2);
+      Assert.assertNull(deleteFile.nanValueCounts());
+    }
+
     // verify the written delete file
     GenericRecord deletedRow = GenericRecord.create(table.schema());
     Schema positionDeleteSchema = DeleteSchemaUtil.posDeleteSchema(table.schema());
@@ -336,6 +371,15 @@ public void testPositionDeleteWriterMultipleDataFiles() throws IOException {
     Assert.assertEquals(2, referencedDataFiles.size());
     Assert.assertNull(deleteFile.lowerBounds());
     Assert.assertNull(deleteFile.upperBounds());
+    Assert.assertNull(deleteFile.valueCounts());
+    Assert.assertNull(deleteFile.nullValueCounts());
+    Assert.assertNull(deleteFile.nanValueCounts());
+
+    if (fileFormat == FileFormat.AVRO) {
+      Assert.assertNull(deleteFile.columnSizes());
+    } else {
+      Assert.assertEquals(2, deleteFile.columnSizes().size());
+    }
 
     // commit the data and delete files
     table
