From fb5a22cb5cb6ea1ceacd89d7fb81bec85b586c37 Mon Sep 17 00:00:00 2001
From: Pavan Lanka
Date: Tue, 5 Oct 2021 10:13:53 -0700
Subject: [PATCH] Fixed #4 - Obtain ORC stripe offsets from the writer instead
 of reopening the written file

splitOffsets() previously reopened the just-written file with an ORC
Reader only to read back the stripe offsets. The ORC Writer already
exposes stripe metadata once the file is closed, so ask the writer
directly and skip the extra file open and footer read.
---
 .../java/org/apache/iceberg/orc/OrcFileAppender.java |  7 +++----
 .../org/apache/iceberg/spark/data/TestOrcWrite.java  | 11 +++++++++++
 2 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/orc/src/main/java/org/apache/iceberg/orc/OrcFileAppender.java b/orc/src/main/java/org/apache/iceberg/orc/OrcFileAppender.java
index f407bdcf436c..a72a959ce1ec 100644
--- a/orc/src/main/java/org/apache/iceberg/orc/OrcFileAppender.java
+++ b/orc/src/main/java/org/apache/iceberg/orc/OrcFileAppender.java
@@ -37,7 +37,6 @@
 import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
 import org.apache.iceberg.relocated.com.google.common.collect.Lists;
 import org.apache.orc.OrcFile;
-import org.apache.orc.Reader;
 import org.apache.orc.StripeInformation;
 import org.apache.orc.TypeDescription;
 import org.apache.orc.Writer;
@@ -147,11 +146,11 @@ public long length() {
   @Override
   public List<Long> splitOffsets() {
     Preconditions.checkState(isClosed, "File is not yet closed");
-    try (Reader reader = ORC.newFileReader(file.toInputFile(), conf)) {
-      List<StripeInformation> stripes = reader.getStripes();
+    try {
+      List<StripeInformation> stripes = writer.getStripes();
       return Collections.unmodifiableList(Lists.transform(stripes, StripeInformation::getOffset));
     } catch (IOException e) {
-      throw new RuntimeIOException(e, "Can't close ORC reader %s", file.location());
+      throw new RuntimeIOException(e, "Cannot obtain stripe information from writer for %s", file.location());
     }
   }
 
diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/data/TestOrcWrite.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/data/TestOrcWrite.java
index 1e51a088390e..b9cbedb3de9e 100644
--- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/data/TestOrcWrite.java
+++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/data/TestOrcWrite.java
@@ -22,11 +22,17 @@
 
 import java.io.File;
 import java.io.IOException;
+import java.util.stream.Collectors;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
 import org.apache.iceberg.Files;
 import org.apache.iceberg.Schema;
 import org.apache.iceberg.io.FileAppender;
 import org.apache.iceberg.orc.ORC;
 import org.apache.iceberg.types.Types;
+import org.apache.orc.OrcFile;
+import org.apache.orc.Reader;
+import org.apache.orc.StripeInformation;
 import org.apache.spark.sql.catalyst.InternalRow;
 import org.junit.Assert;
 import org.junit.Rule;
@@ -55,5 +61,10 @@ public void splitOffsets() throws IOException {
     writer.addAll(rows);
     writer.close();
     Assert.assertNotNull("Split offsets not present", writer.splitOffsets());
+    // writer-reported offsets must match the offsets an ORC reader sees in the file
+    try (Reader reader = OrcFile.createReader(new Path(testFile.toURI()), OrcFile.readerOptions(new Configuration()))) {
+      Assert.assertEquals(reader.getStripes().stream().map(StripeInformation::getOffset).collect(Collectors.toList()),
+          writer.splitOffsets());
+    }
   }
 }
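
Illustration (editor's note, not part of the patch): the fix relies on
the ORC core Writer exposing stripe metadata once the file is closed.
Below is a minimal standalone sketch of that API against the ORC core
library directly; the class name, output path, and one-column schema are
hypothetical, chosen only for the demo.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.orc.OrcFile;
import org.apache.orc.Reader;
import org.apache.orc.StripeInformation;
import org.apache.orc.TypeDescription;
import org.apache.orc.Writer;

public class StripeOffsetsDemo {
  public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();
    Path path = new Path("/tmp/stripe-offsets-demo.orc"); // hypothetical output location
    TypeDescription schema = TypeDescription.fromString("struct<id:bigint>");

    // Write some rows through the ORC core writer.
    Writer writer = OrcFile.createWriter(path, OrcFile.writerOptions(conf).setSchema(schema));
    VectorizedRowBatch batch = schema.createRowBatch();
    LongColumnVector id = (LongColumnVector) batch.cols[0];
    for (long i = 0; i < 10_000; i++) {
      id.vector[batch.size++] = i;
      if (batch.size == batch.getMaxSize()) {
        writer.addRowBatch(batch);
        batch.reset();
      }
    }
    if (batch.size > 0) {
      writer.addRowBatch(batch);
    }
    writer.close();

    // After close(), the writer itself knows the stripe layout, so the
    // offsets are available without reopening the file (a file this small
    // usually produces a single stripe).
    for (StripeInformation stripe : writer.getStripes()) {
      System.out.println("writer stripe offset: " + stripe.getOffset());
    }

    // For comparison only: reopening the file, as splitOffsets() did before
    // this patch, yields the same offsets but costs an extra file open and
    // footer read.
    try (Reader reader = OrcFile.createReader(path, OrcFile.readerOptions(conf))) {
      for (StripeInformation stripe : reader.getStripes()) {
        System.out.println("reader stripe offset: " + stripe.getOffset());
      }
    }
  }
}

Running the sketch prints matching offsets from both sides, which is the
invariant the new assertion in TestOrcWrite.splitOffsets() checks.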