From ad602a379584512d1d96eda557c20cf2af21d1b2 Mon Sep 17 00:00:00 2001 From: Amogh Jahagirdar Date: Tue, 17 Oct 2023 12:13:57 -0700 Subject: [PATCH] Core: Ignore split offsets when the last split offset is past the file length (#8860) --- .../main/java/org/apache/iceberg/BaseFile.java | 10 ++++++++++ .../java/org/apache/iceberg/TableTestBase.java | 4 ++-- .../org/apache/iceberg/TestManifestReader.java | 17 +++++++++++++++++ 3 files changed, 29 insertions(+), 2 deletions(-) diff --git a/core/src/main/java/org/apache/iceberg/BaseFile.java b/core/src/main/java/org/apache/iceberg/BaseFile.java index 793301e55744..31e91457ad40 100644 --- a/core/src/main/java/org/apache/iceberg/BaseFile.java +++ b/core/src/main/java/org/apache/iceberg/BaseFile.java @@ -460,6 +460,16 @@ public ByteBuffer keyMetadata() { @Override public List splitOffsets() { + if (splitOffsets == null || splitOffsets.length == 0) { + return null; + } + + // If the last split offset is past the file size this means the split offsets are corrupted and + // should not be used + if (splitOffsets[splitOffsets.length - 1] >= fileSizeInBytes) { + return null; + } + return ArrayUtil.toUnmodifiableLongList(splitOffsets); } diff --git a/core/src/test/java/org/apache/iceberg/TableTestBase.java b/core/src/test/java/org/apache/iceberg/TableTestBase.java index 3922df8eb2ba..d50845933eb7 100644 --- a/core/src/test/java/org/apache/iceberg/TableTestBase.java +++ b/core/src/test/java/org/apache/iceberg/TableTestBase.java @@ -113,7 +113,7 @@ public class TableTestBase { .withFileSizeInBytes(10) .withPartitionPath("data_bucket=2") // easy way to set partition data for now .withRecordCount(1) - .withSplitOffsets(ImmutableList.of(2L, 2_000_000L)) + .withSplitOffsets(ImmutableList.of(2L, 8L)) .build(); static final DeleteFile FILE_C2_DELETES = FileMetadata.deleteFileBuilder(SPEC) @@ -129,7 +129,7 @@ public class TableTestBase { .withFileSizeInBytes(10) .withPartitionPath("data_bucket=3") // easy way to set partition data for now .withRecordCount(1) - .withSplitOffsets(ImmutableList.of(3L, 3_000L, 3_000_000L)) + .withSplitOffsets(ImmutableList.of(0L, 3L, 6L)) .build(); static final DeleteFile FILE_D2_DELETES = FileMetadata.deleteFileBuilder(SPEC) diff --git a/core/src/test/java/org/apache/iceberg/TestManifestReader.java b/core/src/test/java/org/apache/iceberg/TestManifestReader.java index f1674af6c22a..3bb9bf1a4c1d 100644 --- a/core/src/test/java/org/apache/iceberg/TestManifestReader.java +++ b/core/src/test/java/org/apache/iceberg/TestManifestReader.java @@ -25,6 +25,7 @@ import java.util.stream.Collectors; import org.apache.iceberg.ManifestEntry.Status; import org.apache.iceberg.expressions.Expressions; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; import org.apache.iceberg.relocated.com.google.common.collect.Iterables; import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.relocated.com.google.common.collect.Streams; @@ -159,4 +160,20 @@ public void testDeleteFilePositions() throws IOException { } } } + + @Test + public void testDataFileSplitOffsetsNullWhenInvalid() throws IOException { + DataFile invalidOffset = + DataFiles.builder(SPEC) + .withPath("/path/to/invalid-offsets.parquet") + .withFileSizeInBytes(10) + .withRecordCount(1) + .withSplitOffsets(ImmutableList.of(2L, 1000L)) // Offset 1000 is out of bounds + .build(); + ManifestFile manifest = writeManifest(1000L, invalidOffset); + try (ManifestReader reader = ManifestFiles.read(manifest, FILE_IO)) { + DataFile file = Iterables.getOnlyElement(reader); + Assertions.assertThat(file.splitOffsets()).isNull(); + } + } }