
Commit

Flink: Read parquet BINARY column as String for expected string type
fengjiajie committed Oct 13, 2023
1 parent 6530a3e commit c446df6
Showing 6 changed files with 189 additions and 3 deletions.
@@ -262,7 +262,11 @@ public ParquetValueReader<?> primitive(
     switch (primitive.getPrimitiveTypeName()) {
       case FIXED_LEN_BYTE_ARRAY:
       case BINARY:
-        return new ParquetValueReaders.ByteArrayReader(desc);
+        if (expected.typeId() == Types.StringType.get().typeId()) {
+          return new StringReader(desc);
+        } else {
+          return new ParquetValueReaders.ByteArrayReader(desc);
+        }
       case INT32:
         if (expected.typeId() == org.apache.iceberg.types.Type.TypeID.LONG) {
           return new ParquetValueReaders.IntAsLongReader(desc);
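
The branch above hands string projections to a StringReader whose definition is not part of this hunk. A minimal sketch of what such a reader might look like, assuming it extends Iceberg's ParquetValueReaders.PrimitiveReader and produces Flink StringData (the class in the repository may differ in detail):

// Illustrative sketch only, not the exact class from this commit.
// Reads a Parquet BINARY value and exposes it to Flink as StringData,
// wrapping the backing heap array when possible to avoid a copy.
import java.nio.ByteBuffer;
import org.apache.flink.table.data.StringData;
import org.apache.iceberg.parquet.ParquetValueReaders;
import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.io.api.Binary;

class StringReader extends ParquetValueReaders.PrimitiveReader<StringData> {
  StringReader(ColumnDescriptor desc) {
    super(desc);
  }

  @Override
  public StringData read(StringData ignored) {
    Binary binary = column.nextBinary();
    ByteBuffer buffer = binary.toByteBuffer();
    if (buffer.hasArray()) {
      // Reference the array region of the buffer directly.
      return StringData.fromBytes(
          buffer.array(), buffer.arrayOffset() + buffer.position(), buffer.remaining());
    } else {
      return StringData.fromBytes(binary.getBytes());
    }
  }
}

Dispatching on expected.typeId() keeps the Parquet physical type (BINARY) separate from the projected Iceberg type, mirroring the existing INT32 branch that returns IntAsLongReader for a long projection.
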
@@ -23,12 +23,14 @@
 import java.io.File;
 import java.io.IOException;
 import java.nio.ByteBuffer;
+import java.nio.charset.StandardCharsets;
 import java.util.Iterator;
 import java.util.List;
 import org.apache.avro.generic.GenericData;
 import org.apache.avro.generic.GenericRecord;
 import org.apache.avro.generic.GenericRecordBuilder;
 import org.apache.flink.table.data.RowData;
+import org.apache.flink.table.data.binary.BinaryStringData;
 import org.apache.flink.table.types.logical.LogicalType;
 import org.apache.hadoop.fs.Path;
 import org.apache.iceberg.Files;
@@ -98,6 +100,62 @@ public void testTwoLevelList() throws IOException {
     }
   }
 
+  @Test
+  public void testReadBinaryFieldAsString() throws IOException {
+    Schema schemaForWriteBinary = new Schema(optional(1, "strbytes", Types.BinaryType.get()));
+    org.apache.avro.Schema avroSchema = AvroSchemaUtil.convert(schemaForWriteBinary.asStruct());
+
+    File testFile = temp.newFile();
+    Assert.assertTrue(testFile.delete());
+
+    ParquetWriter<GenericRecord> writer =
+        AvroParquetWriter.<GenericRecord>builder(new Path(testFile.toURI()))
+            .withDataModel(GenericData.get())
+            .withSchema(avroSchema)
+            .build();
+
+    String expectedString = "hello";
+
+    GenericRecordBuilder recordBuilder = new GenericRecordBuilder(avroSchema);
+    ByteBuffer expectedBinary = ByteBuffer.wrap(expectedString.getBytes(StandardCharsets.UTF_8));
+    recordBuilder.set("strbytes", expectedBinary);
+    GenericData.Record expectedRecord = recordBuilder.build();
+
+    writer.write(expectedRecord);
+    writer.close();
+
+    // read as string
+    Schema schemaForReadBinaryAsString =
+        new Schema(optional(1, "strbytes", Types.StringType.get()));
+    try (CloseableIterable<RowData> reader =
+        Parquet.read(Files.localInput(testFile))
+            .project(schemaForReadBinaryAsString)
+            .createReaderFunc(
+                type -> FlinkParquetReaders.buildReader(schemaForReadBinaryAsString, type))
+            .build()) {
+      Iterator<RowData> rows = reader.iterator();
+      Assert.assertTrue("Should have at least one row", rows.hasNext());
+      RowData rowData = rows.next();
+      Assert.assertTrue(rowData.getString(0) instanceof BinaryStringData);
+      Assert.assertEquals(expectedString, rowData.getString(0).toString());
+      Assert.assertFalse("Should not have more than one row", rows.hasNext());
+    }
+
+    // read as byte[]
+    try (CloseableIterable<RowData> reader =
+        Parquet.read(Files.localInput(testFile))
+            .project(schemaForWriteBinary)
+            .createReaderFunc(type -> FlinkParquetReaders.buildReader(schemaForWriteBinary, type))
+            .build()) {
+      Iterator<RowData> rows = reader.iterator();
+      Assert.assertTrue("Should have at least one row", rows.hasNext());
+      RowData rowData = rows.next();
+      Assert.assertArrayEquals(
+          expectedString.getBytes(StandardCharsets.UTF_8), rowData.getBinary(0));
+      Assert.assertFalse("Should not have more than one row", rows.hasNext());
+    }
+  }
+
   private void writeAndValidate(Iterable<Record> iterable, Schema schema) throws IOException {
     File testFile = temp.newFile();
     Assert.assertTrue("Delete should succeed", testFile.delete());
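
The string-path assertion in the new test assumes that UTF-8 bytes round-trip cleanly through Flink's StringData. A hypothetical standalone check of that assumption (not part of the commit) could be:

// Hypothetical snippet, not from this commit: StringData built from UTF-8
// bytes compares equal, as a Java String, to the original value.
import java.nio.charset.StandardCharsets;
import org.apache.flink.table.data.StringData;

public class StringDataRoundTrip {
  public static void main(String[] args) {
    String expected = "hello";
    StringData actual = StringData.fromBytes(expected.getBytes(StandardCharsets.UTF_8));
    System.out.println(expected.equals(actual.toString())); // prints: true
  }
}
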
