Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Core: fix reading of split offsets in manifests #8834

Merged
merged 7 commits into from
Oct 16, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 1 addition & 8 deletions core/src/main/java/org/apache/iceberg/BaseFile.java
Original file line number Diff line number Diff line change
Expand Up @@ -82,9 +82,6 @@ public PartitionData copy() {
// cached schema
private transient Schema avroSchema = null;

// lazy variables
private transient volatile List<Long> splitOffsetList = null;

/** Used by Avro reflection to instantiate this class when reading manifest files. */
BaseFile(Schema avroSchema) {
this.avroSchema = avroSchema;
Expand Down Expand Up @@ -463,11 +460,7 @@ public ByteBuffer keyMetadata() {

@Override
public List<Long> splitOffsets() {
if (splitOffsetList == null && splitOffsets != null) {
this.splitOffsetList = ArrayUtil.toUnmodifiableLongList(splitOffsets);
}

return splitOffsetList;
return ArrayUtil.toUnmodifiableLongList(splitOffsets);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🤦 Good catch.

}

long[] splitOffsetArray() {
Expand Down
3 changes: 3 additions & 0 deletions core/src/test/java/org/apache/iceberg/TableTestBase.java
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ public class TableTestBase {
.withFileSizeInBytes(10)
.withPartitionPath("data_bucket=1") // easy way to set partition data for now
.withRecordCount(1)
.withSplitOffsets(ImmutableList.of(1L))
.build();
static final DeleteFile FILE_B_DELETES =
FileMetadata.deleteFileBuilder(SPEC)
Expand All @@ -112,6 +113,7 @@ public class TableTestBase {
.withFileSizeInBytes(10)
.withPartitionPath("data_bucket=2") // easy way to set partition data for now
.withRecordCount(1)
.withSplitOffsets(ImmutableList.of(2L, 2_000_000L))
.build();
static final DeleteFile FILE_C2_DELETES =
FileMetadata.deleteFileBuilder(SPEC)
Expand All @@ -127,6 +129,7 @@ public class TableTestBase {
.withFileSizeInBytes(10)
.withPartitionPath("data_bucket=3") // easy way to set partition data for now
.withRecordCount(1)
.withSplitOffsets(ImmutableList.of(3L, 3_000L, 3_000_000L))
.build();
static final DeleteFile FILE_D2_DELETES =
FileMetadata.deleteFileBuilder(SPEC)
Expand Down
19 changes: 13 additions & 6 deletions core/src/test/java/org/apache/iceberg/TestManifestReader.java
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@
*/
package org.apache.iceberg;

import static org.assertj.core.api.Assertions.assertThat;

import java.io.IOException;
import java.util.List;
import java.util.stream.Collectors;
Expand All @@ -28,6 +30,7 @@
import org.apache.iceberg.relocated.com.google.common.collect.Streams;
import org.apache.iceberg.types.Types;
import org.assertj.core.api.Assertions;
import org.assertj.core.api.recursive.comparison.RecursiveComparisonConfiguration;
import org.junit.Assert;
import org.junit.Assume;
import org.junit.Test;
Expand All @@ -41,6 +44,12 @@ public static Object[] parameters() {
return new Object[] {1, 2};
}

private static final RecursiveComparisonConfiguration FILE_COMPARISON_CONFIG =
RecursiveComparisonConfiguration.builder()
.withIgnoredFields(
"dataSequenceNumber", "fileOrdinal", "fileSequenceNumber", "fromProjectionPos")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So this will compare split offsets now?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes, it will compare all fields from the input data files, except for these excluded fields which aren't expected to match

.build();

public TestManifestReader(int formatVersion) {
super(formatVersion);
}
Expand All @@ -61,16 +70,14 @@ public void testReaderWithFilterWithoutSelect() throws IOException {
ManifestFile manifest = writeManifest(1000L, FILE_A, FILE_B, FILE_C);
try (ManifestReader<DataFile> reader =
ManifestFiles.read(manifest, FILE_IO).filterRows(Expressions.equal("id", 0))) {
List<String> files =
Streams.stream(reader).map(file -> file.path().toString()).collect(Collectors.toList());
List<DataFile> files = Streams.stream(reader).collect(Collectors.toList());

// note that all files are returned because the reader returns data files that may match, and
// the partition is
// bucketing by data, which doesn't help filter files
Assert.assertEquals(
"Should read the expected files",
Lists.newArrayList(FILE_A.path(), FILE_B.path(), FILE_C.path()),
files);
assertThat(files)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe this can be simplified to

    assertThat(files)
          .usingRecursiveComparison()
          .ignoringFields(
              "dataSequenceNumber", "fileOrdinal", "fileSequenceNumber", "fromProjectionPos")
          .isEqualTo(Lists.newArrayList(FILE_A, FILE_B, FILE_C));

that way you don't need the static final field for the comparison config

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My thought was the config variable could be reused in other tests

.usingRecursiveComparison(FILE_COMPARISON_CONFIG)
.isEqualTo(Lists.newArrayList(FILE_A, FILE_B, FILE_C));
}
}

Expand Down