Core: Ignore split offsets when the last split offset is past the file length #8860

Merged: 1 commit, Oct 17, 2023
10 changes: 10 additions & 0 deletions core/src/main/java/org/apache/iceberg/BaseFile.java
@@ -460,6 +460,16 @@ public ByteBuffer keyMetadata() {

  @Override
  public List<Long> splitOffsets() {
    if (splitOffsets == null || splitOffsets.length == 0) {
      return null;
    }

    // If the last split offset is past the file size this means the split offsets are corrupted and
Contributor:
[doubt] Wondering if throwing an exception or having a precondition would be helpful to identify the buggy writer?

amogh-jahagirdar (Contributor, Author), Oct 17, 2023:
Do you mean throwing at read time (i.e. instead of returning null, we throw)? If so, I don't think we want to do that, because it would unnecessarily break future readers, and split offsets are optional anyway; this approach takes the stance of detecting the corruption and making sure the read logic doesn't leverage the corrupted metadata.

If you mean throwing at the time of writing the manifest entry (a precondition check in the constructor of BaseFile), I went back and forth on this. The problem is that after someone upgrades, any process that rewrites a set of files (including some corrupted entries) would fail due to the precondition. The benefit is that it would prevent spreading the existing corruption, which is nice, but at the cost of failing operations. Since the corrupted split offsets will be ignored at read time anyway, failing at write time seems needless.

To prevent spreading previously corrupted state, the split offsets could be recomputed at manifest-write time whenever the corruption is detected (a sort of "fix-up" process). That requires more investigation, though; I'm not sure how feasible it is or what the performance implications are (e.g. for Parquet we'd need to go through the block metadata again).

Let me know what you think!
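
For illustration of the fix-up idea above, a minimal sketch of recomputing split offsets from a Parquet footer's row-group metadata; the class and method names are assumptions, not code from this PR.

    import java.util.ArrayList;
    import java.util.Collections;
    import java.util.List;
    import org.apache.parquet.hadoop.metadata.BlockMetaData;
    import org.apache.parquet.hadoop.metadata.ParquetMetadata;

    // Hypothetical helper, not part of this PR: rebuild split offsets from a Parquet
    // footer when the persisted offsets are detected as corrupt at manifest-write time.
    class SplitOffsetFixup {
      static List<Long> recomputeSplitOffsets(ParquetMetadata footer) {
        List<Long> offsets = new ArrayList<>(footer.getBlocks().size());
        for (BlockMetaData block : footer.getBlocks()) {
          // each row group starts at the position of its first column chunk
          offsets.add(block.getStartingPos());
        }
        Collections.sort(offsets);
        return offsets;
      }
    }

The cost mentioned above is exactly this footer re-read: the writer would have to go through the block metadata again for every file it rewrites.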

singhpk234 (Contributor), Oct 17, 2023:
Agree with you!

I was mostly coming from the point of view of a buggy writer that has already committed this metadata (one that didn't use the core library, since we expose split offsets via ParquetMetadata, or one that purposefully passed wrong offsets). Such writers will never be caught because we silently skip the malformed offsets. I was wondering whether a warning log during reads would help, so readers know about the corruption and that reads will be a bit less optimized, and so it could help in backtracking the buggy writer. Thoughts?
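
For illustration of the warning suggested above, a minimal sketch of what a read-time log could look like inside BaseFile.splitOffsets(); the logger field and message are assumptions, not part of this change.

    import org.slf4j.Logger;
    import org.slf4j.LoggerFactory;

    // Hypothetical addition to BaseFile, not part of this PR.
    private static final Logger LOG = LoggerFactory.getLogger(BaseFile.class);

    // ... inside splitOffsets():
    if (splitOffsets[splitOffsets.length - 1] >= fileSizeInBytes) {
      LOG.warn(
          "Ignoring corrupt split offsets for {}: last offset {} is past the file length {}",
          filePath,
          splitOffsets[splitOffsets.length - 1],
          fileSizeInBytes);
      return null;
    }

This keeps the read path non-throwing while leaving a trace that points back at the writer that produced the bad metadata.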

Contributor:
Looks like split offsets being ordered in ascending order is not enforced during reads either; we just swallow it, so we should be fine in this case as well then :P

      if (splitOffsets != null && ArrayUtil.isStrictlyAscending(splitOffsets)) {
        return () ->
            new OffsetsAwareSplitScanTaskIterator<>(
                self(), length(), splitOffsets, this::newSplitTask);
      } else {
        return () ->
            new FixedSizeSplitScanTaskIterator<>(
                self(), length(), targetSplitSize, this::newSplitTask);
      }
    }

    // should not be used
    if (splitOffsets[splitOffsets.length - 1] >= fileSizeInBytes) {
      return null;
    }

    return ArrayUtil.toUnmodifiableLongList(splitOffsets);
  }

4 changes: 2 additions & 2 deletions core/src/test/java/org/apache/iceberg/TableTestBase.java
@@ -113,7 +113,7 @@ public class TableTestBase {
          .withFileSizeInBytes(10)
          .withPartitionPath("data_bucket=2") // easy way to set partition data for now
          .withRecordCount(1)
-         .withSplitOffsets(ImmutableList.of(2L, 2_000_000L))
+         .withSplitOffsets(ImmutableList.of(2L, 8L))
          .build();
  static final DeleteFile FILE_C2_DELETES =
      FileMetadata.deleteFileBuilder(SPEC)
@@ -129,7 +129,7 @@ public class TableTestBase {
          .withFileSizeInBytes(10)
          .withPartitionPath("data_bucket=3") // easy way to set partition data for now
          .withRecordCount(1)
-         .withSplitOffsets(ImmutableList.of(3L, 3_000L, 3_000_000L))
+         .withSplitOffsets(ImmutableList.of(0L, 3L, 6L))
          .build();
  static final DeleteFile FILE_D2_DELETES =
      FileMetadata.deleteFileBuilder(SPEC)
17 changes: 17 additions & 0 deletions core/src/test/java/org/apache/iceberg/TestManifestReader.java
@@ -25,6 +25,7 @@
import java.util.stream.Collectors;
import org.apache.iceberg.ManifestEntry.Status;
import org.apache.iceberg.expressions.Expressions;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList;
import org.apache.iceberg.relocated.com.google.common.collect.Iterables;
import org.apache.iceberg.relocated.com.google.common.collect.Lists;
import org.apache.iceberg.relocated.com.google.common.collect.Streams;
@@ -159,4 +159,20 @@ public void testDeleteFilePositions() throws IOException {
      }
    }
  }

  @Test
  public void testDataFileSplitOffsetsNullWhenInvalid() throws IOException {
    DataFile invalidOffset =
        DataFiles.builder(SPEC)
            .withPath("/path/to/invalid-offsets.parquet")
            .withFileSizeInBytes(10)
            .withRecordCount(1)
            .withSplitOffsets(ImmutableList.of(2L, 1000L)) // Offset 1000 is out of bounds
            .build();
    ManifestFile manifest = writeManifest(1000L, invalidOffset);
    try (ManifestReader<DataFile> reader = ManifestFiles.read(manifest, FILE_IO)) {
      DataFile file = Iterables.getOnlyElement(reader);
      Assertions.assertThat(file.splitOffsets()).isNull();
    }
  }
}