From f9a9427dac16b7d59ee3e061cc1131b8bf1ba09d Mon Sep 17 00:00:00 2001 From: Bryan Keller Date: Sun, 4 Feb 2024 08:22:25 -0800 Subject: [PATCH 01/15] Kafka Connect: Record converters and delta writers --- .../connect/events/TableReference.java | 7 +- .../iceberg/connect/IcebergSinkConfig.java | 29 +- .../iceberg/connect/IcebergSinkConnector.java | 5 +- .../connect/data/BaseDeltaTaskWriter.java | 102 +++ .../iceberg/connect/data/IcebergWriter.java | 63 +- .../iceberg/connect/data/Operation.java | 25 + .../connect/data/PartitionedDeltaWriter.java | 93 ++ .../iceberg/connect/data/RecordConverter.java | 508 +++++++++++ .../connect/data/RecordProjection.java | 199 +++++ .../iceberg/connect/data/RecordWrapper.java | 83 ++ .../data/UnpartitionedDeltaWriter.java | 66 ++ .../iceberg/connect/data/Utilities.java | 72 +- .../connect/IcebergSinkConnectorTest.java | 4 +- .../data/PartitionedDeltaWriterTest.java | 70 ++ .../connect/data/RecordConverterTest.java | 828 ++++++++++++++++++ .../data/UnpartitionedDeltaWriterTest.java | 62 ++ 16 files changed, 2171 insertions(+), 45 deletions(-) create mode 100644 kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/BaseDeltaTaskWriter.java create mode 100644 kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/Operation.java create mode 100644 kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/PartitionedDeltaWriter.java create mode 100644 kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/RecordConverter.java create mode 100644 kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/RecordProjection.java create mode 100644 kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/RecordWrapper.java create mode 100644 kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/UnpartitionedDeltaWriter.java create mode 100644 kafka-connect/kafka-connect/src/test/java/org/apache/iceberg/connect/data/PartitionedDeltaWriterTest.java create mode 100644 kafka-connect/kafka-connect/src/test/java/org/apache/iceberg/connect/data/RecordConverterTest.java create mode 100644 kafka-connect/kafka-connect/src/test/java/org/apache/iceberg/connect/data/UnpartitionedDeltaWriterTest.java diff --git a/kafka-connect/kafka-connect-events/src/main/java/org/apache/iceberg/connect/events/TableReference.java b/kafka-connect/kafka-connect-events/src/main/java/org/apache/iceberg/connect/events/TableReference.java index d1400f58f74c..50eaa1050485 100644 --- a/kafka-connect/kafka-connect-events/src/main/java/org/apache/iceberg/connect/events/TableReference.java +++ b/kafka-connect/kafka-connect-events/src/main/java/org/apache/iceberg/connect/events/TableReference.java @@ -18,10 +18,9 @@ */ package org.apache.iceberg.connect.events; -import static java.util.stream.Collectors.toList; - import java.util.Arrays; import java.util.List; +import java.util.stream.Collectors; import org.apache.avro.Schema; import org.apache.avro.generic.IndexedRecord; import org.apache.avro.util.Utf8; @@ -96,7 +95,9 @@ public void put(int i, Object v) { return; case NAMESPACE: this.namespace = - v == null ? null : ((List) v).stream().map(Utf8::toString).collect(toList()); + v == null + ? null + : ((List) v).stream().map(Utf8::toString).collect(Collectors.toList()); return; case NAME: this.name = v == null ? null : v.toString(); diff --git a/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/IcebergSinkConfig.java b/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/IcebergSinkConfig.java index aa1ecdd5d1ba..dd9f9e401446 100644 --- a/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/IcebergSinkConfig.java +++ b/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/IcebergSinkConfig.java @@ -18,8 +18,6 @@ */ package org.apache.iceberg.connect; -import static java.util.stream.Collectors.toList; - import java.io.InputStream; import java.nio.file.Files; import java.nio.file.Paths; @@ -28,6 +26,7 @@ import java.util.Map; import java.util.Properties; import java.util.regex.Pattern; +import java.util.stream.Collectors; import org.apache.iceberg.IcebergBuild; import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; @@ -72,7 +71,9 @@ public class IcebergSinkConfig extends AbstractConfig { private static final String TABLES_DEFAULT_COMMIT_BRANCH = "iceberg.tables.default-commit-branch"; private static final String TABLES_DEFAULT_ID_COLUMNS = "iceberg.tables.default-id-columns"; private static final String TABLES_DEFAULT_PARTITION_BY = "iceberg.tables.default-partition-by"; - // FIXME: add config for CDC and upsert mode + private static final String TABLES_CDC_FIELD_PROP = "iceberg.tables.cdc-field"; + private static final String TABLES_UPSERT_MODE_ENABLED_PROP = + "iceberg.tables.upsert-mode-enabled"; private static final String TABLES_AUTO_CREATE_ENABLED_PROP = "iceberg.tables.auto-create-enabled"; private static final String TABLES_EVOLVE_SCHEMA_ENABLED_PROP = @@ -151,6 +152,18 @@ private static ConfigDef newConfigDef() { null, Importance.MEDIUM, "Default partition spec to use when creating tables, comma-separated"); + configDef.define( + TABLES_CDC_FIELD_PROP, + ConfigDef.Type.STRING, + null, + Importance.MEDIUM, + "Source record field that identifies the type of operation (insert, update, or delete)"); + configDef.define( + TABLES_UPSERT_MODE_ENABLED_PROP, + ConfigDef.Type.BOOLEAN, + false, + Importance.MEDIUM, + "Set to true to treat all appends as upserts, false otherwise"); configDef.define( TABLES_AUTO_CREATE_ENABLED_PROP, ConfigDef.Type.BOOLEAN, @@ -365,7 +378,7 @@ static List stringToList(String value, String regex) { return ImmutableList.of(); } - return Arrays.stream(value.split(regex)).map(String::trim).collect(toList()); + return Arrays.stream(value.split(regex)).map(String::trim).collect(Collectors.toList()); } public String controlTopic() { @@ -409,6 +422,14 @@ public String hadoopConfDir() { return getString(HADDOP_CONF_DIR_PROP); } + public String tablesCdcField() { + return getString(TABLES_CDC_FIELD_PROP); + } + + public boolean upsertModeEnabled() { + return getBoolean(TABLES_UPSERT_MODE_ENABLED_PROP); + } + public boolean autoCreateEnabled() { return getBoolean(TABLES_AUTO_CREATE_ENABLED_PROP); } diff --git a/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/IcebergSinkConnector.java b/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/IcebergSinkConnector.java index 485b209302d5..8be8518f4407 100644 --- a/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/IcebergSinkConnector.java +++ b/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/IcebergSinkConnector.java @@ -18,11 +18,10 @@ */ package org.apache.iceberg.connect; -import static java.util.stream.Collectors.toList; - import java.util.List; import java.util.Map; import java.util.UUID; +import java.util.stream.Collectors; import java.util.stream.IntStream; import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.kafka.common.config.ConfigDef; @@ -60,7 +59,7 @@ public List> taskConfigs(int maxTasks) { map.put(IcebergSinkConfig.INTERNAL_TRANSACTIONAL_SUFFIX_PROP, txnSuffix + i); return map; }) - .collect(toList()); + .collect(Collectors.toList()); } @Override diff --git a/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/BaseDeltaTaskWriter.java b/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/BaseDeltaTaskWriter.java new file mode 100644 index 000000000000..09cedea693f1 --- /dev/null +++ b/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/BaseDeltaTaskWriter.java @@ -0,0 +1,102 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.connect.data; + +import java.io.IOException; +import java.util.Set; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.PartitionKey; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.StructLike; +import org.apache.iceberg.data.InternalRecordWrapper; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.io.BaseTaskWriter; +import org.apache.iceberg.io.FileAppenderFactory; +import org.apache.iceberg.io.FileIO; +import org.apache.iceberg.io.OutputFileFactory; +import org.apache.iceberg.relocated.com.google.common.collect.Sets; +import org.apache.iceberg.types.TypeUtil; + +abstract class BaseDeltaTaskWriter extends BaseTaskWriter { + + private final Schema schema; + private final Schema deleteSchema; + private final InternalRecordWrapper wrapper; + private final InternalRecordWrapper keyWrapper; + private final RecordProjection keyProjection; + private final boolean upsertMode; + + BaseDeltaTaskWriter( + PartitionSpec spec, + FileFormat format, + FileAppenderFactory appenderFactory, + OutputFileFactory fileFactory, + FileIO io, + long targetFileSize, + Schema schema, + Set identifierFieldIds, + boolean upsertMode) { + super(spec, format, appenderFactory, fileFactory, io, targetFileSize); + this.schema = schema; + this.deleteSchema = TypeUtil.select(schema, Sets.newHashSet(identifierFieldIds)); + this.wrapper = new InternalRecordWrapper(schema.asStruct()); + this.keyWrapper = new InternalRecordWrapper(deleteSchema.asStruct()); + this.keyProjection = RecordProjection.create(schema, deleteSchema); + this.upsertMode = upsertMode; + } + + abstract RowDataDeltaWriter route(Record row); + + InternalRecordWrapper wrapper() { + return wrapper; + } + + @Override + public void write(Record row) throws IOException { + Operation op = + row instanceof RecordWrapper + ? ((RecordWrapper) row).op() + : upsertMode ? Operation.UPDATE : Operation.INSERT; + RowDataDeltaWriter writer = route(row); + if (op == Operation.UPDATE || op == Operation.DELETE) { + writer.deleteKey(keyProjection.wrap(row)); + } + if (op == Operation.UPDATE || op == Operation.INSERT) { + writer.write(row); + } + } + + class RowDataDeltaWriter extends BaseEqualityDeltaWriter { + + RowDataDeltaWriter(PartitionKey partition) { + super(partition, schema, deleteSchema); + } + + @Override + protected StructLike asStructLike(Record data) { + return wrapper.wrap(data); + } + + @Override + protected StructLike asStructLikeKey(Record data) { + return keyWrapper.wrap(data); + } + } +} diff --git a/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/IcebergWriter.java b/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/IcebergWriter.java index da88b3b50ffe..d0c61e016889 100644 --- a/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/IcebergWriter.java +++ b/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/IcebergWriter.java @@ -38,8 +38,7 @@ public class IcebergWriter implements RecordWriter { private final IcebergSinkConfig config; private final List writerResults; - // FIXME: update this when the record converter is added - // private RecordConverter recordConverter; + private RecordConverter recordConverter; private TaskWriter writer; public IcebergWriter(Table table, String tableName, IcebergSinkConfig config) { @@ -52,20 +51,23 @@ public IcebergWriter(Table table, String tableName, IcebergSinkConfig config) { private void initNewWriter() { this.writer = Utilities.createTableWriter(table, tableName, config); - // FIXME: update this when the record converter is added - // this.recordConverter = new RecordConverter(table, config); + this.recordConverter = new RecordConverter(table, config); } @Override public void write(SinkRecord record) { try { + // TODO: config to handle tombstones instead of always ignoring? // TODO: config to handle tombstones instead of always ignoring? if (record.value() != null) { Record row = convertToRow(record); - - // FIXME: add CDC operation support - - writer.write(row); + String cdcField = config.tablesCdcField(); + if (cdcField == null) { + writer.write(row); + } else { + Operation op = extractCdcOperation(record.value(), cdcField); + writer.write(new RecordWrapper(row, op)); + } } } catch (Exception e) { throw new DataException( @@ -77,8 +79,49 @@ public void write(SinkRecord record) { } private Record convertToRow(SinkRecord record) { - // FIXME: update this when the record converter is added - return null; + if (!config.evolveSchemaEnabled()) { + return recordConverter.convert(record.value()); + } + + SchemaUpdate.Consumer updates = new SchemaUpdate.Consumer(); + Record row = recordConverter.convert(record.value(), updates); + + if (!updates.empty()) { + // complete the current file + flush(); + // apply the schema updates, this will refresh the table + SchemaUtils.applySchemaUpdates(table, updates); + // initialize a new writer with the new schema + initNewWriter(); + // convert the row again, this time using the new table schema + row = recordConverter.convert(record.value(), null); + } + + return row; + } + + private Operation extractCdcOperation(Object recordValue, String cdcField) { + Object opValue = Utilities.extractFromRecordValue(recordValue, cdcField); + + if (opValue == null) { + return Operation.INSERT; + } + + String opStr = opValue.toString().trim().toUpperCase(); + if (opStr.isEmpty()) { + return Operation.INSERT; + } + + // TODO: define value mapping in config? + + switch (opStr.charAt(0)) { + case 'U': + return Operation.UPDATE; + case 'D': + return Operation.DELETE; + default: + return Operation.INSERT; + } } private void flush() { diff --git a/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/Operation.java b/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/Operation.java new file mode 100644 index 000000000000..7f428a6dc2b6 --- /dev/null +++ b/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/Operation.java @@ -0,0 +1,25 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.connect.data; + +public enum Operation { + INSERT, + UPDATE, + DELETE +} diff --git a/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/PartitionedDeltaWriter.java b/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/PartitionedDeltaWriter.java new file mode 100644 index 000000000000..eae5d116c6c1 --- /dev/null +++ b/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/PartitionedDeltaWriter.java @@ -0,0 +1,93 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.connect.data; + +import java.io.IOException; +import java.io.UncheckedIOException; +import java.util.Map; +import java.util.Set; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.PartitionKey; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.io.FileAppenderFactory; +import org.apache.iceberg.io.FileIO; +import org.apache.iceberg.io.OutputFileFactory; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.util.Tasks; + +public class PartitionedDeltaWriter extends BaseDeltaTaskWriter { + private final PartitionKey partitionKey; + + private final Map writers = Maps.newHashMap(); + + PartitionedDeltaWriter( + PartitionSpec spec, + FileFormat format, + FileAppenderFactory appenderFactory, + OutputFileFactory fileFactory, + FileIO io, + long targetFileSize, + Schema schema, + Set identifierFieldIds, + boolean upsertMode) { + super( + spec, + format, + appenderFactory, + fileFactory, + io, + targetFileSize, + schema, + identifierFieldIds, + upsertMode); + this.partitionKey = new PartitionKey(spec, schema); + } + + @Override + RowDataDeltaWriter route(Record row) { + partitionKey.partition(wrapper().wrap(row)); + + RowDataDeltaWriter writer = writers.get(partitionKey); + if (writer == null) { + // NOTE: pass a copy of the partition key to the writer to prevent it from + // being modified + PartitionKey copiedKey = partitionKey.copy(); + writer = new RowDataDeltaWriter(copiedKey); + writers.put(copiedKey, writer); + } + + return writer; + } + + @Override + public void close() { + try { + Tasks.foreach(writers.values()) + .throwFailureWhenFinished() + .noRetry() + .run(RowDataDeltaWriter::close, IOException.class); + + writers.clear(); + } catch (IOException e) { + throw new UncheckedIOException("Failed to close equality delta writer", e); + } + } +} diff --git a/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/RecordConverter.java b/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/RecordConverter.java new file mode 100644 index 000000000000..7c6ae76e9b49 --- /dev/null +++ b/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/RecordConverter.java @@ -0,0 +1,508 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.connect.data; + +import com.fasterxml.jackson.databind.ObjectMapper; +import java.io.IOException; +import java.io.UncheckedIOException; +import java.math.BigDecimal; +import java.math.RoundingMode; +import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; +import java.time.LocalDate; +import java.time.LocalDateTime; +import java.time.LocalTime; +import java.time.OffsetDateTime; +import java.time.ZoneOffset; +import java.time.format.DateTimeFormatter; +import java.time.format.DateTimeFormatterBuilder; +import java.time.format.DateTimeParseException; +import java.time.temporal.Temporal; +import java.util.Base64; +import java.util.Date; +import java.util.List; +import java.util.Map; +import java.util.UUID; +import java.util.stream.Collectors; +import org.apache.iceberg.Schema; +import org.apache.iceberg.Table; +import org.apache.iceberg.TableProperties; +import org.apache.iceberg.connect.IcebergSinkConfig; +import org.apache.iceberg.data.GenericRecord; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.mapping.MappedField; +import org.apache.iceberg.mapping.NameMapping; +import org.apache.iceberg.mapping.NameMappingParser; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.types.Type; +import org.apache.iceberg.types.Type.PrimitiveType; +import org.apache.iceberg.types.Types.DecimalType; +import org.apache.iceberg.types.Types.ListType; +import org.apache.iceberg.types.Types.MapType; +import org.apache.iceberg.types.Types.NestedField; +import org.apache.iceberg.types.Types.StructType; +import org.apache.iceberg.types.Types.TimestampType; +import org.apache.iceberg.util.DateTimeUtil; +import org.apache.kafka.connect.data.Struct; + +public class RecordConverter { + + private static final ObjectMapper MAPPER = new ObjectMapper(); + + private static final DateTimeFormatter OFFSET_TS_FMT = + new DateTimeFormatterBuilder() + .append(DateTimeFormatter.ISO_LOCAL_DATE_TIME) + .appendOffset("+HHmm", "Z") + .toFormatter(); + + private final Schema tableSchema; + private final NameMapping nameMapping; + private final IcebergSinkConfig config; + private final Map> structNameMap = Maps.newHashMap(); + + public RecordConverter(Table table, IcebergSinkConfig config) { + this.tableSchema = table.schema(); + this.nameMapping = createNameMapping(table); + this.config = config; + } + + public Record convert(Object data) { + return convert(data, null); + } + + public Record convert(Object data, SchemaUpdate.Consumer schemaUpdateConsumer) { + if (data instanceof Struct || data instanceof Map) { + return convertStructValue(data, tableSchema.asStruct(), -1, schemaUpdateConsumer); + } + throw new UnsupportedOperationException("Cannot convert type: " + data.getClass().getName()); + } + + private NameMapping createNameMapping(Table table) { + String nameMappingString = table.properties().get(TableProperties.DEFAULT_NAME_MAPPING); + return nameMappingString != null ? NameMappingParser.fromJson(nameMappingString) : null; + } + + private Object convertValue( + Object value, Type type, int fieldId, SchemaUpdate.Consumer schemaUpdateConsumer) { + if (value == null) { + return null; + } + switch (type.typeId()) { + case STRUCT: + return convertStructValue(value, type.asStructType(), fieldId, schemaUpdateConsumer); + case LIST: + return convertListValue(value, type.asListType(), schemaUpdateConsumer); + case MAP: + return convertMapValue(value, type.asMapType(), schemaUpdateConsumer); + case INTEGER: + return convertInt(value); + case LONG: + return convertLong(value); + case FLOAT: + return convertFloat(value); + case DOUBLE: + return convertDouble(value); + case DECIMAL: + return convertDecimal(value, (DecimalType) type); + case BOOLEAN: + return convertBoolean(value); + case STRING: + return convertString(value); + case UUID: + return convertUUID(value); + case BINARY: + case FIXED: + return convertBase64Binary(value); + case DATE: + return convertDateValue(value); + case TIME: + return convertTimeValue(value); + case TIMESTAMP: + return convertTimestampValue(value, (TimestampType) type); + } + throw new UnsupportedOperationException("Unsupported type: " + type.typeId()); + } + + protected GenericRecord convertStructValue( + Object value, + StructType schema, + int parentFieldId, + SchemaUpdate.Consumer schemaUpdateConsumer) { + if (value instanceof Map) { + return convertToStruct((Map) value, schema, parentFieldId, schemaUpdateConsumer); + } else if (value instanceof Struct) { + return convertToStruct((Struct) value, schema, parentFieldId, schemaUpdateConsumer); + } + throw new IllegalArgumentException("Cannot convert to struct: " + value.getClass().getName()); + } + + private GenericRecord convertToStruct( + Map map, + StructType schema, + int structFieldId, + SchemaUpdate.Consumer schemaUpdateConsumer) { + GenericRecord result = GenericRecord.create(schema); + map.forEach( + (recordFieldNameObj, recordFieldValue) -> { + String recordFieldName = recordFieldNameObj.toString(); + NestedField tableField = lookupStructField(recordFieldName, schema, structFieldId); + if (tableField == null) { + // add the column if schema evolution is on, otherwise skip the value, + // skip the add column if we can't infer the type + if (schemaUpdateConsumer != null) { + Type type = SchemaUtils.inferIcebergType(recordFieldValue, config); + if (type != null) { + String parentFieldName = + structFieldId < 0 ? null : tableSchema.findColumnName(structFieldId); + schemaUpdateConsumer.addColumn(parentFieldName, recordFieldName, type); + } + } + } else { + result.setField( + tableField.name(), + convertValue( + recordFieldValue, + tableField.type(), + tableField.fieldId(), + schemaUpdateConsumer)); + } + }); + return result; + } + + private GenericRecord convertToStruct( + Struct struct, + StructType schema, + int structFieldId, + SchemaUpdate.Consumer schemaUpdateConsumer) { + GenericRecord result = GenericRecord.create(schema); + struct + .schema() + .fields() + .forEach( + recordField -> { + NestedField tableField = lookupStructField(recordField.name(), schema, structFieldId); + if (tableField == null) { + // add the column if schema evolution is on, otherwise skip the value + if (schemaUpdateConsumer != null) { + String parentFieldName = + structFieldId < 0 ? null : tableSchema.findColumnName(structFieldId); + Type type = SchemaUtils.toIcebergType(recordField.schema(), config); + schemaUpdateConsumer.addColumn(parentFieldName, recordField.name(), type); + } + } else { + boolean hasSchemaUpdates = false; + if (schemaUpdateConsumer != null) { + // update the type if needed and schema evolution is on + PrimitiveType evolveDataType = + SchemaUtils.needsDataTypeUpdate(tableField.type(), recordField.schema()); + if (evolveDataType != null) { + String fieldName = tableSchema.findColumnName(tableField.fieldId()); + schemaUpdateConsumer.updateType(fieldName, evolveDataType); + hasSchemaUpdates = true; + } + // make optional if needed and schema evolution is on + if (tableField.isRequired() && recordField.schema().isOptional()) { + String fieldName = tableSchema.findColumnName(tableField.fieldId()); + schemaUpdateConsumer.makeOptional(fieldName); + hasSchemaUpdates = true; + } + } + if (!hasSchemaUpdates) { + result.setField( + tableField.name(), + convertValue( + struct.get(recordField), + tableField.type(), + tableField.fieldId(), + schemaUpdateConsumer)); + } + } + }); + return result; + } + + private NestedField lookupStructField(String fieldName, StructType schema, int structFieldId) { + if (nameMapping == null) { + return config.schemaCaseInsensitive() + ? schema.caseInsensitiveField(fieldName) + : schema.field(fieldName); + } + + return structNameMap + .computeIfAbsent(structFieldId, notUsed -> createStructNameMap(schema)) + .get(fieldName); + } + + private Map createStructNameMap(StructType schema) { + Map map = Maps.newHashMap(); + schema + .fields() + .forEach( + col -> { + MappedField mappedField = nameMapping.find(col.fieldId()); + if (mappedField != null && !mappedField.names().isEmpty()) { + mappedField.names().forEach(name -> map.put(name, col)); + } else { + map.put(col.name(), col); + } + }); + return map; + } + + protected List convertListValue( + Object value, ListType type, SchemaUpdate.Consumer schemaUpdateConsumer) { + Preconditions.checkArgument(value instanceof List); + List list = (List) value; + return list.stream() + .map( + element -> { + int fieldId = type.fields().get(0).fieldId(); + return convertValue(element, type.elementType(), fieldId, schemaUpdateConsumer); + }) + .collect(Collectors.toList()); + } + + protected Map convertMapValue( + Object value, MapType type, SchemaUpdate.Consumer schemaUpdateConsumer) { + Preconditions.checkArgument(value instanceof Map); + Map map = (Map) value; + Map result = Maps.newHashMap(); + map.forEach( + (k, v) -> { + int keyFieldId = type.fields().get(0).fieldId(); + int valueFieldId = type.fields().get(1).fieldId(); + result.put( + convertValue(k, type.keyType(), keyFieldId, schemaUpdateConsumer), + convertValue(v, type.valueType(), valueFieldId, schemaUpdateConsumer)); + }); + return result; + } + + protected int convertInt(Object value) { + if (value instanceof Number) { + return ((Number) value).intValue(); + } else if (value instanceof String) { + return Integer.parseInt((String) value); + } + throw new IllegalArgumentException("Cannot convert to int: " + value.getClass().getName()); + } + + protected long convertLong(Object value) { + if (value instanceof Number) { + return ((Number) value).longValue(); + } else if (value instanceof String) { + return Long.parseLong((String) value); + } + throw new IllegalArgumentException("Cannot convert to long: " + value.getClass().getName()); + } + + protected float convertFloat(Object value) { + if (value instanceof Number) { + return ((Number) value).floatValue(); + } else if (value instanceof String) { + return Float.parseFloat((String) value); + } + throw new IllegalArgumentException("Cannot convert to float: " + value.getClass().getName()); + } + + protected double convertDouble(Object value) { + if (value instanceof Number) { + return ((Number) value).doubleValue(); + } else if (value instanceof String) { + return Double.parseDouble((String) value); + } + throw new IllegalArgumentException("Cannot convert to double: " + value.getClass().getName()); + } + + protected BigDecimal convertDecimal(Object value, DecimalType type) { + BigDecimal bigDecimal; + if (value instanceof BigDecimal) { + bigDecimal = (BigDecimal) value; + } else if (value instanceof Number) { + Number num = (Number) value; + Double dbl = num.doubleValue(); + if (dbl.equals(Math.floor(dbl))) { + bigDecimal = BigDecimal.valueOf(num.longValue()); + } else { + bigDecimal = BigDecimal.valueOf(dbl); + } + } else if (value instanceof String) { + bigDecimal = new BigDecimal((String) value); + } else { + throw new IllegalArgumentException( + "Cannot convert to BigDecimal: " + value.getClass().getName()); + } + return bigDecimal.setScale(type.scale(), RoundingMode.HALF_UP); + } + + protected boolean convertBoolean(Object value) { + if (value instanceof Boolean) { + return (boolean) value; + } else if (value instanceof String) { + return Boolean.parseBoolean((String) value); + } + throw new IllegalArgumentException("Cannot convert to boolean: " + value.getClass().getName()); + } + + protected String convertString(Object value) { + try { + if (value instanceof String) { + return (String) value; + } else if (value instanceof Number || value instanceof Boolean) { + return value.toString(); + } else if (value instanceof Map || value instanceof List) { + return MAPPER.writeValueAsString(value); + } else if (value instanceof Struct) { + Struct struct = (Struct) value; + byte[] data = config.jsonConverter().fromConnectData(null, struct.schema(), struct); + return new String(data, StandardCharsets.UTF_8); + } + } catch (IOException e) { + throw new UncheckedIOException(e); + } + throw new IllegalArgumentException("Cannot convert to string: " + value.getClass().getName()); + } + + protected UUID convertUUID(Object value) { + if (value instanceof String) { + return UUID.fromString((String) value); + } else if (value instanceof UUID) { + return (UUID) value; + } + throw new IllegalArgumentException("Cannot convert to UUID: " + value.getClass().getName()); + } + + protected ByteBuffer convertBase64Binary(Object value) { + if (value instanceof String) { + return ByteBuffer.wrap(Base64.getDecoder().decode((String) value)); + } else if (value instanceof byte[]) { + return ByteBuffer.wrap((byte[]) value); + } else if (value instanceof ByteBuffer) { + return (ByteBuffer) value; + } + throw new IllegalArgumentException("Cannot convert to binary: " + value.getClass().getName()); + } + + @SuppressWarnings("JavaUtilDate") + protected LocalDate convertDateValue(Object value) { + if (value instanceof Number) { + int days = ((Number) value).intValue(); + return DateTimeUtil.dateFromDays(days); + } else if (value instanceof String) { + return LocalDate.parse((String) value); + } else if (value instanceof LocalDate) { + return (LocalDate) value; + } else if (value instanceof Date) { + int days = (int) (((Date) value).getTime() / 1000 / 60 / 60 / 24); + return DateTimeUtil.dateFromDays(days); + } + throw new RuntimeException("Cannot convert date: " + value); + } + + @SuppressWarnings("JavaUtilDate") + protected LocalTime convertTimeValue(Object value) { + if (value instanceof Number) { + long millis = ((Number) value).longValue(); + return DateTimeUtil.timeFromMicros(millis * 1000); + } else if (value instanceof String) { + return LocalTime.parse((String) value); + } else if (value instanceof LocalTime) { + return (LocalTime) value; + } else if (value instanceof Date) { + long millis = ((Date) value).getTime(); + return DateTimeUtil.timeFromMicros(millis * 1000); + } + throw new RuntimeException("Cannot convert time: " + value); + } + + protected Temporal convertTimestampValue(Object value, TimestampType type) { + if (type.shouldAdjustToUTC()) { + return convertOffsetDateTime(value); + } + return convertLocalDateTime(value); + } + + @SuppressWarnings("JavaUtilDate") + private OffsetDateTime convertOffsetDateTime(Object value) { + if (value instanceof Number) { + long millis = ((Number) value).longValue(); + return DateTimeUtil.timestamptzFromMicros(millis * 1000); + } else if (value instanceof String) { + return parseOffsetDateTime((String) value); + } else if (value instanceof OffsetDateTime) { + return (OffsetDateTime) value; + } else if (value instanceof LocalDateTime) { + return ((LocalDateTime) value).atOffset(ZoneOffset.UTC); + } else if (value instanceof Date) { + return DateTimeUtil.timestamptzFromMicros(((Date) value).getTime() * 1000); + } + throw new RuntimeException( + "Cannot convert timestamptz: " + value + ", type: " + value.getClass()); + } + + private OffsetDateTime parseOffsetDateTime(String str) { + String tsStr = ensureTimestampFormat(str); + try { + return OFFSET_TS_FMT.parse(tsStr, OffsetDateTime::from); + } catch (DateTimeParseException e) { + return LocalDateTime.parse(tsStr, DateTimeFormatter.ISO_LOCAL_DATE_TIME) + .atOffset(ZoneOffset.UTC); + } + } + + @SuppressWarnings("JavaUtilDate") + private LocalDateTime convertLocalDateTime(Object value) { + if (value instanceof Number) { + long millis = ((Number) value).longValue(); + return DateTimeUtil.timestampFromMicros(millis * 1000); + } else if (value instanceof String) { + return parseLocalDateTime((String) value); + } else if (value instanceof LocalDateTime) { + return (LocalDateTime) value; + } else if (value instanceof OffsetDateTime) { + return ((OffsetDateTime) value).toLocalDateTime(); + } else if (value instanceof Date) { + return DateTimeUtil.timestampFromMicros(((Date) value).getTime() * 1000); + } + throw new RuntimeException( + "Cannot convert timestamp: " + value + ", type: " + value.getClass()); + } + + private LocalDateTime parseLocalDateTime(String str) { + String tsStr = ensureTimestampFormat(str); + try { + return LocalDateTime.parse(tsStr, DateTimeFormatter.ISO_LOCAL_DATE_TIME); + } catch (DateTimeParseException e) { + return OFFSET_TS_FMT.parse(tsStr, OffsetDateTime::from).toLocalDateTime(); + } + } + + private String ensureTimestampFormat(String str) { + String result = str; + if (result.charAt(10) == ' ') { + result = result.substring(0, 10) + 'T' + result.substring(11); + } + if (result.length() > 22 && result.charAt(19) == '+' && result.charAt(22) == ':') { + result = result.substring(0, 19) + result.substring(19).replace(":", ""); + } + return result; + } +} diff --git a/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/RecordProjection.java b/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/RecordProjection.java new file mode 100644 index 000000000000..79ce2c111a3a --- /dev/null +++ b/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/RecordProjection.java @@ -0,0 +1,199 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.connect.data; + +import java.util.List; +import java.util.Map; +import org.apache.iceberg.Schema; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.types.Types.ListType; +import org.apache.iceberg.types.Types.MapType; +import org.apache.iceberg.types.Types.NestedField; +import org.apache.iceberg.types.Types.StructType; + +/** + * This is modified from {@link org.apache.iceberg.util.StructProjection} to support record types. + */ +public class RecordProjection implements Record { + + /** + * Creates a projecting wrapper for {@link Record} rows. + * + *

This projection does not work with repeated types like lists and maps. + * + * @param dataSchema schema of rows wrapped by this projection + * @param projectedSchema result schema of the projected rows + * @return a wrapper to project rows + */ + public static RecordProjection create(Schema dataSchema, Schema projectedSchema) { + return new RecordProjection(dataSchema.asStruct(), projectedSchema.asStruct()); + } + + private final StructType type; + private final int[] positionMap; + private final RecordProjection[] nestedProjections; + private Record record; + + private RecordProjection(StructType structType, StructType projection) { + this(structType, projection, false); + } + + @SuppressWarnings("checkstyle:CyclomaticComplexity") + private RecordProjection(StructType structType, StructType projection, boolean allowMissing) { + this.type = projection; + this.positionMap = new int[projection.fields().size()]; + this.nestedProjections = new RecordProjection[projection.fields().size()]; + + // set up the projection positions and any nested projections that are needed + List dataFields = structType.fields(); + for (int pos = 0; pos < positionMap.length; pos += 1) { + NestedField projectedField = projection.fields().get(pos); + + boolean found = false; + for (int i = 0; !found && i < dataFields.size(); i += 1) { + NestedField dataField = dataFields.get(i); + if (projectedField.fieldId() == dataField.fieldId()) { + found = true; + positionMap[pos] = i; + switch (projectedField.type().typeId()) { + case STRUCT: + nestedProjections[pos] = + new RecordProjection( + dataField.type().asStructType(), projectedField.type().asStructType()); + break; + case MAP: + MapType projectedMap = projectedField.type().asMapType(); + MapType originalMap = dataField.type().asMapType(); + + boolean keyProjectable = + !projectedMap.keyType().isNestedType() + || projectedMap.keyType().equals(originalMap.keyType()); + boolean valueProjectable = + !projectedMap.valueType().isNestedType() + || projectedMap.valueType().equals(originalMap.valueType()); + Preconditions.checkArgument( + keyProjectable && valueProjectable, + "Cannot project a partial map key or value struct. Trying to project %s out of %s", + projectedField, + dataField); + + nestedProjections[pos] = null; + break; + case LIST: + ListType projectedList = projectedField.type().asListType(); + ListType originalList = dataField.type().asListType(); + + boolean elementProjectable = + !projectedList.elementType().isNestedType() + || projectedList.elementType().equals(originalList.elementType()); + Preconditions.checkArgument( + elementProjectable, + "Cannot project a partial list element struct. Trying to project %s out of %s", + projectedField, + dataField); + + nestedProjections[pos] = null; + break; + default: + nestedProjections[pos] = null; + } + } + } + + if (!found && projectedField.isOptional() && allowMissing) { + positionMap[pos] = -1; + nestedProjections[pos] = null; + } else if (!found) { + throw new IllegalArgumentException( + String.format("Cannot find field %s in %s", projectedField, structType)); + } + } + } + + public RecordProjection wrap(Record newRecord) { + this.record = newRecord; + return this; + } + + @Override + public int size() { + return type.fields().size(); + } + + @Override + public T get(int pos, Class javaClass) { + // struct can be null if wrap is not called first before the get call + // or if a null struct is wrapped. + if (record == null) { + return null; + } + + int recordPos = positionMap[pos]; + if (nestedProjections[pos] != null) { + Record nestedStruct = record.get(recordPos, Record.class); + if (nestedStruct == null) { + return null; + } + + return javaClass.cast(nestedProjections[pos].wrap(nestedStruct)); + } + + if (recordPos != -1) { + return record.get(recordPos, javaClass); + } else { + return null; + } + } + + @Override + public void set(int pos, T value) { + throw new UnsupportedOperationException(); + } + + @Override + public StructType struct() { + return type; + } + + @Override + public Object getField(String name) { + throw new UnsupportedOperationException(); + } + + @Override + public void setField(String name, Object value) { + throw new UnsupportedOperationException(); + } + + @Override + public Object get(int pos) { + return get(pos, Object.class); + } + + @Override + public Record copy() { + throw new UnsupportedOperationException(); + } + + @Override + public Record copy(Map overwriteValues) { + throw new UnsupportedOperationException(); + } +} diff --git a/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/RecordWrapper.java b/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/RecordWrapper.java new file mode 100644 index 000000000000..915608562034 --- /dev/null +++ b/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/RecordWrapper.java @@ -0,0 +1,83 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.connect.data; + +import java.util.Map; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.types.Types.StructType; + +public class RecordWrapper implements Record { + + private final Record delegate; + private final Operation op; + + public RecordWrapper(Record delegate, Operation op) { + this.delegate = delegate; + this.op = op; + } + + public Operation op() { + return op; + } + + @Override + public StructType struct() { + return delegate.struct(); + } + + @Override + public Object getField(String name) { + return delegate.getField(name); + } + + @Override + public void setField(String name, Object value) { + delegate.setField(name, value); + } + + @Override + public Object get(int pos) { + return delegate.get(pos); + } + + @Override + public Record copy() { + return new RecordWrapper(delegate.copy(), op); + } + + @Override + public Record copy(Map overwriteValues) { + return new RecordWrapper(delegate.copy(overwriteValues), op); + } + + @Override + public int size() { + return delegate.size(); + } + + @Override + public T get(int pos, Class javaClass) { + return delegate.get(pos, javaClass); + } + + @Override + public void set(int pos, T value) { + delegate.set(pos, value); + } +} diff --git a/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/UnpartitionedDeltaWriter.java b/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/UnpartitionedDeltaWriter.java new file mode 100644 index 000000000000..46b0a45d532d --- /dev/null +++ b/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/UnpartitionedDeltaWriter.java @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.connect.data; + +import java.io.IOException; +import java.util.Set; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.io.FileAppenderFactory; +import org.apache.iceberg.io.FileIO; +import org.apache.iceberg.io.OutputFileFactory; + +public class UnpartitionedDeltaWriter extends BaseDeltaTaskWriter { + private final RowDataDeltaWriter writer; + + UnpartitionedDeltaWriter( + PartitionSpec spec, + FileFormat format, + FileAppenderFactory appenderFactory, + OutputFileFactory fileFactory, + FileIO io, + long targetFileSize, + Schema schema, + Set identifierFieldIds, + boolean upsertMode) { + super( + spec, + format, + appenderFactory, + fileFactory, + io, + targetFileSize, + schema, + identifierFieldIds, + upsertMode); + this.writer = new RowDataDeltaWriter(null); + } + + @Override + RowDataDeltaWriter route(Record row) { + return writer; + } + + @Override + public void close() throws IOException { + writer.close(); + } +} diff --git a/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/Utilities.java b/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/Utilities.java index ec13b003a21a..2de58d0d7486 100644 --- a/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/Utilities.java +++ b/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/Utilities.java @@ -18,12 +18,6 @@ */ package org.apache.iceberg.connect.data; -import static java.util.stream.Collectors.toSet; -import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; -import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT_DEFAULT; -import static org.apache.iceberg.TableProperties.WRITE_TARGET_FILE_SIZE_BYTES; -import static org.apache.iceberg.TableProperties.WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT; - import java.io.IOException; import java.net.URL; import java.nio.file.Files; @@ -33,9 +27,11 @@ import java.util.Map; import java.util.Set; import java.util.UUID; +import java.util.stream.Collectors; import org.apache.iceberg.CatalogUtil; import org.apache.iceberg.FileFormat; import org.apache.iceberg.Table; +import org.apache.iceberg.TableProperties; import org.apache.iceberg.catalog.Catalog; import org.apache.iceberg.common.DynClasses; import org.apache.iceberg.common.DynConstructors; @@ -175,12 +171,16 @@ public static TaskWriter createTableWriter( Map tableProps = Maps.newHashMap(table.properties()); tableProps.putAll(config.writeProps()); - String formatStr = tableProps.getOrDefault(DEFAULT_FILE_FORMAT, DEFAULT_FILE_FORMAT_DEFAULT); + String formatStr = + tableProps.getOrDefault( + TableProperties.DEFAULT_FILE_FORMAT, TableProperties.DEFAULT_FILE_FORMAT_DEFAULT); FileFormat format = FileFormat.fromString(formatStr); long targetFileSize = PropertyUtil.propertyAsLong( - tableProps, WRITE_TARGET_FILE_SIZE_BYTES, WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT); + tableProps, + TableProperties.WRITE_TARGET_FILE_SIZE_BYTES, + TableProperties.WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT); Set identifierFieldIds = table.schema().identifierFieldIds(); @@ -197,7 +197,7 @@ public static TaskWriter createTableWriter( } return field.fieldId(); }) - .collect(toSet()); + .collect(Collectors.toSet()); } FileAppenderFactory appenderFactory; @@ -224,23 +224,49 @@ public static TaskWriter createTableWriter( .format(format) .build(); - // FIXME: add delta writers - TaskWriter writer; if (table.spec().isUnpartitioned()) { - writer = - new UnpartitionedWriter<>( - table.spec(), format, appenderFactory, fileFactory, table.io(), targetFileSize); + if (config.tablesCdcField() == null && !config.upsertModeEnabled()) { + writer = + new UnpartitionedWriter<>( + table.spec(), format, appenderFactory, fileFactory, table.io(), targetFileSize); + } else { + writer = + new UnpartitionedDeltaWriter( + table.spec(), + format, + appenderFactory, + fileFactory, + table.io(), + targetFileSize, + table.schema(), + identifierFieldIds, + config.upsertModeEnabled()); + } } else { - writer = - new PartitionedAppendWriter( - table.spec(), - format, - appenderFactory, - fileFactory, - table.io(), - targetFileSize, - table.schema()); + if (config.tablesCdcField() == null && !config.upsertModeEnabled()) { + writer = + new PartitionedAppendWriter( + table.spec(), + format, + appenderFactory, + fileFactory, + table.io(), + targetFileSize, + table.schema()); + } else { + writer = + new PartitionedDeltaWriter( + table.spec(), + format, + appenderFactory, + fileFactory, + table.io(), + targetFileSize, + table.schema(), + identifierFieldIds, + config.upsertModeEnabled()); + } } return writer; } diff --git a/kafka-connect/kafka-connect/src/test/java/org/apache/iceberg/connect/IcebergSinkConnectorTest.java b/kafka-connect/kafka-connect/src/test/java/org/apache/iceberg/connect/IcebergSinkConnectorTest.java index 86502794b224..c8f563a13911 100644 --- a/kafka-connect/kafka-connect/src/test/java/org/apache/iceberg/connect/IcebergSinkConnectorTest.java +++ b/kafka-connect/kafka-connect/src/test/java/org/apache/iceberg/connect/IcebergSinkConnectorTest.java @@ -18,7 +18,6 @@ */ package org.apache.iceberg.connect; -import static org.apache.iceberg.connect.IcebergSinkConfig.INTERNAL_TRANSACTIONAL_SUFFIX_PROP; import static org.assertj.core.api.Assertions.assertThat; import java.util.List; @@ -35,6 +34,7 @@ public void testTaskConfigs() { connector.start(ImmutableMap.of()); List> configs = connector.taskConfigs(3); assertThat(configs).hasSize(3); - configs.forEach(map -> assertThat(map).containsKey(INTERNAL_TRANSACTIONAL_SUFFIX_PROP)); + configs.forEach( + map -> assertThat(map).containsKey(IcebergSinkConfig.INTERNAL_TRANSACTIONAL_SUFFIX_PROP)); } } diff --git a/kafka-connect/kafka-connect/src/test/java/org/apache/iceberg/connect/data/PartitionedDeltaWriterTest.java b/kafka-connect/kafka-connect/src/test/java/org/apache/iceberg/connect/data/PartitionedDeltaWriterTest.java new file mode 100644 index 000000000000..b8b7f0b66f95 --- /dev/null +++ b/kafka-connect/kafka-connect/src/test/java/org/apache/iceberg/connect/data/PartitionedDeltaWriterTest.java @@ -0,0 +1,70 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.connect.data; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.connect.IcebergSinkConfig; +import org.apache.iceberg.connect.TableSinkConfig; +import org.apache.iceberg.data.GenericRecord; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.io.WriteResult; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; + +public class PartitionedDeltaWriterTest extends BaseWriterTest { + + @ParameterizedTest + @ValueSource(strings = {"parquet", "orc"}) + public void testPartitionedDeltaWriter(String format) { + IcebergSinkConfig config = mock(IcebergSinkConfig.class); + when(config.upsertModeEnabled()).thenReturn(true); + when(config.tableConfig(any())).thenReturn(mock(TableSinkConfig.class)); + when(config.writeProps()).thenReturn(ImmutableMap.of("write.format.default", format)); + + when(table.spec()).thenReturn(SPEC); + + Record row1 = GenericRecord.create(SCHEMA); + row1.setField("id", 123L); + row1.setField("data", "hello world!"); + row1.setField("id2", 123L); + + Record row2 = GenericRecord.create(SCHEMA); + row2.setField("id", 234L); + row2.setField("data", "foobar"); + row2.setField("id2", 234L); + + WriteResult result = + writeTest(ImmutableList.of(row1, row2), config, PartitionedDeltaWriter.class); + + // in upsert mode, each write is a delete + append, so we'll have 1 data file + // and 1 delete file for each partition (2 total) + assertThat(result.dataFiles()).hasSize(2); + assertThat(result.dataFiles()).allMatch(file -> file.format() == FileFormat.fromString(format)); + assertThat(result.deleteFiles()).hasSize(2); + assertThat(result.deleteFiles()) + .allMatch(file -> file.format() == FileFormat.fromString(format)); + } +} diff --git a/kafka-connect/kafka-connect/src/test/java/org/apache/iceberg/connect/data/RecordConverterTest.java b/kafka-connect/kafka-connect/src/test/java/org/apache/iceberg/connect/data/RecordConverterTest.java new file mode 100644 index 000000000000..a61e7324a778 --- /dev/null +++ b/kafka-connect/kafka-connect/src/test/java/org/apache/iceberg/connect/data/RecordConverterTest.java @@ -0,0 +1,828 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.connect.data; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +import com.fasterxml.jackson.databind.ObjectMapper; +import java.math.BigDecimal; +import java.nio.ByteBuffer; +import java.time.Duration; +import java.time.Instant; +import java.time.LocalDate; +import java.time.LocalDateTime; +import java.time.LocalTime; +import java.time.OffsetDateTime; +import java.time.ZoneOffset; +import java.time.temporal.Temporal; +import java.util.Base64; +import java.util.Collection; +import java.util.Date; +import java.util.List; +import java.util.Map; +import java.util.UUID; +import java.util.function.Function; +import org.apache.iceberg.Table; +import org.apache.iceberg.TableProperties; +import org.apache.iceberg.connect.IcebergSinkConfig; +import org.apache.iceberg.connect.data.SchemaUpdate.AddColumn; +import org.apache.iceberg.connect.data.SchemaUpdate.UpdateType; +import org.apache.iceberg.data.GenericRecord; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.mapping.MappedField; +import org.apache.iceberg.mapping.NameMapping; +import org.apache.iceberg.mapping.NameMappingParser; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.types.Type; +import org.apache.iceberg.types.Types; +import org.apache.iceberg.types.Types.BinaryType; +import org.apache.iceberg.types.Types.DateType; +import org.apache.iceberg.types.Types.DecimalType; +import org.apache.iceberg.types.Types.DoubleType; +import org.apache.iceberg.types.Types.FloatType; +import org.apache.iceberg.types.Types.IntegerType; +import org.apache.iceberg.types.Types.ListType; +import org.apache.iceberg.types.Types.LongType; +import org.apache.iceberg.types.Types.MapType; +import org.apache.iceberg.types.Types.StringType; +import org.apache.iceberg.types.Types.StructType; +import org.apache.iceberg.types.Types.TimeType; +import org.apache.iceberg.types.Types.TimestampType; +import org.apache.kafka.connect.data.Decimal; +import org.apache.kafka.connect.data.Schema; +import org.apache.kafka.connect.data.SchemaBuilder; +import org.apache.kafka.connect.data.Struct; +import org.apache.kafka.connect.data.Time; +import org.apache.kafka.connect.data.Timestamp; +import org.apache.kafka.connect.json.JsonConverter; +import org.apache.kafka.connect.json.JsonConverterConfig; +import org.apache.kafka.connect.storage.ConverterConfig; +import org.apache.kafka.connect.storage.ConverterType; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; + +public class RecordConverterTest { + + private static final ObjectMapper MAPPER = new ObjectMapper(); + + private static final org.apache.iceberg.Schema SCHEMA = + new org.apache.iceberg.Schema( + Types.NestedField.required(21, "i", IntegerType.get()), + Types.NestedField.required(22, "l", LongType.get()), + Types.NestedField.required(23, "d", DateType.get()), + Types.NestedField.required(24, "t", TimeType.get()), + Types.NestedField.required(25, "ts", TimestampType.withoutZone()), + Types.NestedField.required(26, "tsz", TimestampType.withZone()), + Types.NestedField.required(27, "fl", FloatType.get()), + Types.NestedField.required(28, "do", DoubleType.get()), + Types.NestedField.required(29, "dec", DecimalType.of(9, 2)), + Types.NestedField.required(30, "s", StringType.get()), + Types.NestedField.required(31, "u", Types.UUIDType.get()), + Types.NestedField.required(32, "f", Types.FixedType.ofLength(3)), + Types.NestedField.required(33, "b", BinaryType.get()), + Types.NestedField.required(34, "li", ListType.ofRequired(35, StringType.get())), + Types.NestedField.required( + 36, "ma", MapType.ofRequired(37, 38, StringType.get(), StringType.get())), + Types.NestedField.optional(39, "extra", StringType.get())); + + // we have 1 unmapped column so exclude that from the count + private static final int MAPPED_CNT = SCHEMA.columns().size() - 1; + + private static final org.apache.iceberg.Schema NESTED_SCHEMA = + new org.apache.iceberg.Schema( + Types.NestedField.required(1, "ii", IntegerType.get()), + Types.NestedField.required(2, "st", SCHEMA.asStruct())); + + private static final org.apache.iceberg.Schema SIMPLE_SCHEMA = + new org.apache.iceberg.Schema( + Types.NestedField.required(1, "ii", IntegerType.get()), + Types.NestedField.required(2, "st", StringType.get())); + + private static final org.apache.iceberg.Schema ID_SCHEMA = + new org.apache.iceberg.Schema(Types.NestedField.required(1, "ii", IntegerType.get())); + + private static final org.apache.iceberg.Schema STRUCT_IN_LIST_SCHEMA = + new org.apache.iceberg.Schema( + Types.NestedField.required( + 100, "stli", ListType.ofRequired(101, NESTED_SCHEMA.asStruct()))); + + private static final org.apache.iceberg.Schema STRUCT_IN_LIST_BASIC_SCHEMA = + new org.apache.iceberg.Schema( + Types.NestedField.required(100, "stli", ListType.ofRequired(101, ID_SCHEMA.asStruct()))); + + private static final org.apache.iceberg.Schema STRUCT_IN_MAP_SCHEMA = + new org.apache.iceberg.Schema( + Types.NestedField.required( + 100, + "stma", + MapType.ofRequired(101, 102, StringType.get(), NESTED_SCHEMA.asStruct()))); + + private static final org.apache.iceberg.Schema STRUCT_IN_MAP_BASIC_SCHEMA = + new org.apache.iceberg.Schema( + Types.NestedField.required( + 100, "stma", MapType.ofRequired(101, 102, StringType.get(), ID_SCHEMA.asStruct()))); + + private static final Schema CONNECT_SCHEMA = + SchemaBuilder.struct() + .field("i", Schema.INT32_SCHEMA) + .field("l", Schema.INT64_SCHEMA) + .field("d", org.apache.kafka.connect.data.Date.SCHEMA) + .field("t", Time.SCHEMA) + .field("ts", Timestamp.SCHEMA) + .field("tsz", Timestamp.SCHEMA) + .field("fl", Schema.FLOAT32_SCHEMA) + .field("do", Schema.FLOAT64_SCHEMA) + .field("dec", Decimal.schema(2)) + .field("s", Schema.STRING_SCHEMA) + .field("u", Schema.STRING_SCHEMA) + .field("f", Schema.BYTES_SCHEMA) + .field("b", Schema.BYTES_SCHEMA) + .field("li", SchemaBuilder.array(Schema.STRING_SCHEMA)) + .field("ma", SchemaBuilder.map(Schema.STRING_SCHEMA, Schema.STRING_SCHEMA)); + + private static final Schema CONNECT_NESTED_SCHEMA = + SchemaBuilder.struct().field("ii", Schema.INT32_SCHEMA).field("st", CONNECT_SCHEMA); + + private static final Schema CONNECT_STRUCT_IN_LIST_SCHEMA = + SchemaBuilder.struct().field("stli", SchemaBuilder.array(CONNECT_NESTED_SCHEMA)).build(); + + private static final Schema CONNECT_STRUCT_IN_MAP_SCHEMA = + SchemaBuilder.struct() + .field("stma", SchemaBuilder.map(Schema.STRING_SCHEMA, CONNECT_NESTED_SCHEMA)) + .build(); + + private static final LocalDate DATE_VAL = LocalDate.parse("2023-05-18"); + private static final LocalTime TIME_VAL = LocalTime.parse("07:14:21"); + private static final LocalDateTime TS_VAL = LocalDateTime.parse("2023-05-18T07:14:21"); + private static final OffsetDateTime TSZ_VAL = OffsetDateTime.parse("2023-05-18T07:14:21Z"); + private static final BigDecimal DEC_VAL = new BigDecimal("12.34"); + private static final String STR_VAL = "foobar"; + private static final UUID UUID_VAL = UUID.randomUUID(); + private static final ByteBuffer BYTES_VAL = ByteBuffer.wrap(new byte[] {1, 2, 3}); + private static final List LIST_VAL = ImmutableList.of("hello", "world"); + private static final Map MAP_VAL = ImmutableMap.of("one", "1", "two", "2"); + + private static final JsonConverter JSON_CONVERTER = new JsonConverter(); + + static { + JSON_CONVERTER.configure( + ImmutableMap.of( + JsonConverterConfig.SCHEMAS_ENABLE_CONFIG, + false, + ConverterConfig.TYPE_CONFIG, + ConverterType.VALUE.getName())); + } + + private IcebergSinkConfig config; + + @BeforeEach + public void before() { + this.config = mock(IcebergSinkConfig.class); + when(config.jsonConverter()).thenReturn(JSON_CONVERTER); + } + + @Test + public void testMapConvert() { + Table table = mock(Table.class); + when(table.schema()).thenReturn(SCHEMA); + RecordConverter converter = new RecordConverter(table, config); + + Map data = createMapData(); + Record record = converter.convert(data); + assertRecordValues(record); + } + + @Test + public void testNestedMapConvert() { + Table table = mock(Table.class); + when(table.schema()).thenReturn(NESTED_SCHEMA); + RecordConverter converter = new RecordConverter(table, config); + + Map nestedData = createNestedMapData(); + Record record = converter.convert(nestedData); + assertNestedRecordValues(record); + } + + @Test + @SuppressWarnings("unchecked") + public void testMapToString() throws Exception { + Table table = mock(Table.class); + when(table.schema()).thenReturn(SIMPLE_SCHEMA); + RecordConverter converter = new RecordConverter(table, config); + + Map nestedData = createNestedMapData(); + Record record = converter.convert(nestedData); + + String str = (String) record.getField("st"); + Map map = (Map) MAPPER.readValue(str, Map.class); + assertThat(map).hasSize(MAPPED_CNT); + } + + @Test + public void testMapValueInListConvert() { + Table table = mock(Table.class); + when(table.schema()).thenReturn(STRUCT_IN_LIST_SCHEMA); + RecordConverter converter = new RecordConverter(table, config); + + Map data = createNestedMapData(); + Record record = converter.convert(ImmutableMap.of("stli", ImmutableList.of(data, data))); + List fieldVal = (List) record.getField("stli"); + + Record elementVal = (Record) fieldVal.get(0); + assertNestedRecordValues(elementVal); + } + + @Test + public void testMapValueInMapConvert() { + Table table = mock(Table.class); + when(table.schema()).thenReturn(STRUCT_IN_MAP_SCHEMA); + RecordConverter converter = new RecordConverter(table, config); + + Map data = createNestedMapData(); + Record record = + converter.convert(ImmutableMap.of("stma", ImmutableMap.of("key1", data, "key2", data))); + + Map fieldVal = (Map) record.getField("stma"); + Record mapVal = (Record) fieldVal.get("key1"); + assertNestedRecordValues(mapVal); + } + + @Test + public void testStructConvert() { + Table table = mock(Table.class); + when(table.schema()).thenReturn(SCHEMA); + RecordConverter converter = new RecordConverter(table, config); + + Struct data = createStructData(); + Record record = converter.convert(data); + assertRecordValues(record); + } + + @Test + public void testNestedStructConvert() { + Table table = mock(Table.class); + when(table.schema()).thenReturn(NESTED_SCHEMA); + RecordConverter converter = new RecordConverter(table, config); + + Struct nestedData = createNestedStructData(); + Record record = converter.convert(nestedData); + assertNestedRecordValues(record); + } + + @Test + @SuppressWarnings("unchecked") + public void testStructToString() throws Exception { + Table table = mock(Table.class); + when(table.schema()).thenReturn(SIMPLE_SCHEMA); + RecordConverter converter = new RecordConverter(table, config); + + Struct nestedData = createNestedStructData(); + Record record = converter.convert(nestedData); + + String str = (String) record.getField("st"); + Map map = (Map) MAPPER.readValue(str, Map.class); + assertThat(map).hasSize(MAPPED_CNT); + } + + @Test + public void testStructValueInListConvert() { + Table table = mock(Table.class); + when(table.schema()).thenReturn(STRUCT_IN_LIST_SCHEMA); + RecordConverter converter = new RecordConverter(table, config); + + Struct data = createNestedStructData(); + Struct struct = + new Struct(CONNECT_STRUCT_IN_LIST_SCHEMA).put("stli", ImmutableList.of(data, data)); + Record record = converter.convert(struct); + + List fieldVal = (List) record.getField("stli"); + Record elementVal = (Record) fieldVal.get(0); + assertNestedRecordValues(elementVal); + } + + @Test + public void testStructValueInMapConvert() { + Table table = mock(Table.class); + when(table.schema()).thenReturn(STRUCT_IN_MAP_SCHEMA); + RecordConverter converter = new RecordConverter(table, config); + + Struct data = createNestedStructData(); + Struct struct = + new Struct(CONNECT_STRUCT_IN_MAP_SCHEMA) + .put("stma", ImmutableMap.of("key1", data, "key2", data)); + Record record = converter.convert(struct); + + Map fieldVal = (Map) record.getField("stma"); + Record mapVal = (Record) fieldVal.get("key1"); + assertNestedRecordValues(mapVal); + } + + @Test + public void testNameMapping() { + Table table = mock(Table.class); + when(table.schema()).thenReturn(SIMPLE_SCHEMA); + + NameMapping nameMapping = NameMapping.of(MappedField.of(1, ImmutableList.of("renamed_ii"))); + when(table.properties()) + .thenReturn( + ImmutableMap.of( + TableProperties.DEFAULT_NAME_MAPPING, NameMappingParser.toJson(nameMapping))); + + RecordConverter converter = new RecordConverter(table, config); + + Map data = ImmutableMap.of("renamed_ii", 123); + Record record = converter.convert(data); + assertThat(record.getField("ii")).isEqualTo(123); + } + + @ParameterizedTest + @ValueSource(booleans = {false, true}) + public void testCaseSensitivity(boolean caseInsensitive) { + Table table = mock(Table.class); + when(table.schema()).thenReturn(SIMPLE_SCHEMA); + + when(config.schemaCaseInsensitive()).thenReturn(caseInsensitive); + + RecordConverter converter = new RecordConverter(table, config); + + Map mapData = ImmutableMap.of("II", 123); + Record record1 = converter.convert(mapData); + + Struct structData = + new Struct(SchemaBuilder.struct().field("II", Schema.INT32_SCHEMA).build()).put("II", 123); + Record record2 = converter.convert(structData); + + if (caseInsensitive) { + assertThat(record1.getField("ii")).isEqualTo(123); + assertThat(record2.getField("ii")).isEqualTo(123); + } else { + assertThat(record1.getField("ii")).isEqualTo(null); + assertThat(record2.getField("ii")).isEqualTo(null); + } + } + + @Test + public void testDecimalConversion() { + Table table = mock(Table.class); + when(table.schema()).thenReturn(SIMPLE_SCHEMA); + + RecordConverter converter = new RecordConverter(table, config); + + BigDecimal expected = new BigDecimal("123.45"); + + ImmutableList.of("123.45", 123.45d, expected) + .forEach( + input -> { + BigDecimal decimal = converter.convertDecimal(input, DecimalType.of(10, 2)); + assertThat(decimal).isEqualTo(expected); + }); + + BigDecimal expected2 = new BigDecimal(123); + + ImmutableList.of("123", 123, expected2) + .forEach( + input -> { + BigDecimal decimal = converter.convertDecimal(input, DecimalType.of(10, 0)); + assertThat(decimal).isEqualTo(expected2); + }); + } + + @Test + public void testDateConversion() { + Table table = mock(Table.class); + when(table.schema()).thenReturn(SIMPLE_SCHEMA); + RecordConverter converter = new RecordConverter(table, config); + + LocalDate expected = LocalDate.of(2023, 11, 15); + + List inputList = + ImmutableList.of( + "2023-11-15", + expected.toEpochDay(), + expected, + new Date(Duration.ofDays(expected.toEpochDay()).toMillis())); + + inputList.forEach( + input -> { + Temporal ts = converter.convertDateValue(input); + assertThat(ts).isEqualTo(expected); + }); + } + + @Test + public void testTimeConversion() { + Table table = mock(Table.class); + when(table.schema()).thenReturn(SIMPLE_SCHEMA); + RecordConverter converter = new RecordConverter(table, config); + + LocalTime expected = LocalTime.of(7, 51, 30, 888_000_000); + + List inputList = + ImmutableList.of( + "07:51:30.888", + expected.toNanoOfDay() / 1000 / 1000, + expected, + new Date(expected.toNanoOfDay() / 1000 / 1000)); + + inputList.forEach( + input -> { + Temporal ts = converter.convertTimeValue(input); + assertThat(ts).isEqualTo(expected); + }); + } + + @Test + public void testTimestampWithZoneConversion() { + OffsetDateTime expected = OffsetDateTime.parse("2023-05-18T11:22:33Z"); + long expectedMillis = expected.toInstant().toEpochMilli(); + convertToTimestamps(expected, expectedMillis, TimestampType.withZone()); + } + + @Test + public void testTimestampWithoutZoneConversion() { + LocalDateTime expected = LocalDateTime.parse("2023-05-18T11:22:33"); + long expectedMillis = expected.atZone(ZoneOffset.UTC).toInstant().toEpochMilli(); + convertToTimestamps(expected, expectedMillis, TimestampType.withoutZone()); + } + + private void convertToTimestamps(Temporal expected, long expectedMillis, TimestampType type) { + Table table = mock(Table.class); + when(table.schema()).thenReturn(SIMPLE_SCHEMA); + RecordConverter converter = new RecordConverter(table, config); + + List inputList = + ImmutableList.of( + "2023-05-18T11:22:33Z", + "2023-05-18 11:22:33Z", + "2023-05-18T11:22:33+00", + "2023-05-18 11:22:33+00", + "2023-05-18T11:22:33+00:00", + "2023-05-18 11:22:33+00:00", + "2023-05-18T11:22:33+0000", + "2023-05-18 11:22:33+0000", + "2023-05-18T11:22:33", + "2023-05-18 11:22:33", + expectedMillis, + new Date(expectedMillis), + OffsetDateTime.ofInstant(Instant.ofEpochMilli(expectedMillis), ZoneOffset.UTC), + LocalDateTime.ofInstant(Instant.ofEpochMilli(expectedMillis), ZoneOffset.UTC)); + + inputList.forEach( + input -> { + Temporal ts = converter.convertTimestampValue(input, type); + assertThat(ts).isEqualTo(expected); + }); + } + + @Test + public void testMissingColumnDetectionMap() { + Table table = mock(Table.class); + when(table.schema()).thenReturn(ID_SCHEMA); + RecordConverter converter = new RecordConverter(table, config); + + Map data = Maps.newHashMap(createMapData()); + data.put("null", null); + + SchemaUpdate.Consumer consumer = new SchemaUpdate.Consumer(); + converter.convert(data, consumer); + Collection addCols = consumer.addColumns(); + + assertThat(addCols).hasSize(15); + + Map newColMap = Maps.newHashMap(); + addCols.forEach(addCol -> newColMap.put(addCol.name(), addCol)); + + assertTypesAddedFromMap(col -> newColMap.get(col).type()); + + // null values should be ignored + assertThat(newColMap).doesNotContainKey("null"); + } + + @Test + public void testMissingColumnDetectionMapNested() { + Table table = mock(Table.class); + when(table.schema()).thenReturn(ID_SCHEMA); + RecordConverter converter = new RecordConverter(table, config); + + Map nestedData = createNestedMapData(); + SchemaUpdate.Consumer consumer = new SchemaUpdate.Consumer(); + converter.convert(nestedData, consumer); + Collection addCols = consumer.addColumns(); + + assertThat(addCols).hasSize(1); + + AddColumn addCol = addCols.iterator().next(); + assertThat(addCol.name()).isEqualTo("st"); + + StructType addedType = addCol.type().asStructType(); + assertThat(addedType.fields()).hasSize(15); + assertTypesAddedFromMap(col -> addedType.field(col).type()); + } + + @Test + public void testMissingColumnDetectionMapListValue() { + Table table = mock(Table.class); + when(table.schema()).thenReturn(STRUCT_IN_LIST_BASIC_SCHEMA); + RecordConverter converter = new RecordConverter(table, config); + + Map nestedData = createNestedMapData(); + Map map = ImmutableMap.of("stli", ImmutableList.of(nestedData, nestedData)); + SchemaUpdate.Consumer consumer = new SchemaUpdate.Consumer(); + converter.convert(map, consumer); + Collection addCols = consumer.addColumns(); + + assertThat(addCols).hasSize(1); + + AddColumn addCol = addCols.iterator().next(); + assertThat(addCol.parentName()).isEqualTo("stli.element"); + assertThat(addCol.name()).isEqualTo("st"); + + StructType nestedElementType = addCol.type().asStructType(); + assertThat(nestedElementType.fields()).hasSize(15); + assertTypesAddedFromMap(col -> nestedElementType.field(col).type()); + } + + private void assertTypesAddedFromMap(Function fn) { + assertThat(fn.apply("i")).isInstanceOf(LongType.class); + assertThat(fn.apply("l")).isInstanceOf(LongType.class); + assertThat(fn.apply("d")).isInstanceOf(StringType.class); + assertThat(fn.apply("t")).isInstanceOf(StringType.class); + assertThat(fn.apply("ts")).isInstanceOf(StringType.class); + assertThat(fn.apply("tsz")).isInstanceOf(StringType.class); + assertThat(fn.apply("fl")).isInstanceOf(DoubleType.class); + assertThat(fn.apply("do")).isInstanceOf(DoubleType.class); + assertThat(fn.apply("dec")).isInstanceOf(StringType.class); + assertThat(fn.apply("s")).isInstanceOf(StringType.class); + assertThat(fn.apply("u")).isInstanceOf(StringType.class); + assertThat(fn.apply("f")).isInstanceOf(StringType.class); + assertThat(fn.apply("b")).isInstanceOf(StringType.class); + assertThat(fn.apply("li")).isInstanceOf(ListType.class); + assertThat(fn.apply("ma")).isInstanceOf(StructType.class); + } + + @Test + public void testMissingColumnDetectionStruct() { + Table table = mock(Table.class); + when(table.schema()).thenReturn(ID_SCHEMA); + RecordConverter converter = new RecordConverter(table, config); + + Struct data = createStructData(); + SchemaUpdate.Consumer consumer = new SchemaUpdate.Consumer(); + converter.convert(data, consumer); + Collection addCols = consumer.addColumns(); + + assertThat(addCols).hasSize(15); + + Map newColMap = Maps.newHashMap(); + addCols.forEach(addCol -> newColMap.put(addCol.name(), addCol)); + + assertTypesAddedFromStruct(col -> newColMap.get(col).type()); + } + + @Test + public void testMissingColumnDetectionStructNested() { + Table table = mock(Table.class); + when(table.schema()).thenReturn(ID_SCHEMA); + RecordConverter converter = new RecordConverter(table, config); + + Struct nestedData = createNestedStructData(); + SchemaUpdate.Consumer consumer = new SchemaUpdate.Consumer(); + converter.convert(nestedData, consumer); + Collection addCols = consumer.addColumns(); + + assertThat(addCols).hasSize(1); + + AddColumn addCol = addCols.iterator().next(); + assertThat(addCol.name()).isEqualTo("st"); + + StructType addedType = addCol.type().asStructType(); + assertThat(addedType.fields()).hasSize(15); + assertTypesAddedFromStruct(col -> addedType.field(col).type()); + } + + @Test + public void testMissingColumnDetectionStructListValue() { + Table table = mock(Table.class); + when(table.schema()).thenReturn(STRUCT_IN_LIST_BASIC_SCHEMA); + RecordConverter converter = new RecordConverter(table, config); + + Struct nestedData = createNestedStructData(); + Struct struct = + new Struct(CONNECT_STRUCT_IN_LIST_SCHEMA) + .put("stli", ImmutableList.of(nestedData, nestedData)); + SchemaUpdate.Consumer consumer = new SchemaUpdate.Consumer(); + converter.convert(struct, consumer); + Collection addCols = consumer.addColumns(); + + assertThat(addCols).hasSize(1); + + AddColumn addCol = addCols.iterator().next(); + assertThat(addCol.parentName()).isEqualTo("stli.element"); + assertThat(addCol.name()).isEqualTo("st"); + + StructType nestedElementType = addCol.type().asStructType(); + assertThat(nestedElementType.fields()).hasSize(15); + assertTypesAddedFromStruct(col -> nestedElementType.field(col).type()); + } + + @Test + public void testMissingColumnDetectionStructMapValue() { + Table table = mock(Table.class); + when(table.schema()).thenReturn(STRUCT_IN_MAP_BASIC_SCHEMA); + RecordConverter converter = new RecordConverter(table, config); + + Struct nestedData = createNestedStructData(); + Struct struct = + new Struct(CONNECT_STRUCT_IN_MAP_SCHEMA) + .put("stma", ImmutableMap.of("key1", nestedData, "key2", nestedData)); + SchemaUpdate.Consumer consumer = new SchemaUpdate.Consumer(); + converter.convert(struct, consumer); + Collection addCols = consumer.addColumns(); + + assertThat(addCols).hasSize(1); + + AddColumn addCol = addCols.iterator().next(); + assertThat(addCol.parentName()).isEqualTo("stma.value"); + assertThat(addCol.name()).isEqualTo("st"); + + StructType nestedValueType = addCol.type().asStructType(); + assertThat(nestedValueType.fields()).hasSize(15); + assertTypesAddedFromStruct(col -> nestedValueType.field(col).type()); + } + + private void assertTypesAddedFromStruct(Function fn) { + assertThat(fn.apply("i")).isInstanceOf(IntegerType.class); + assertThat(fn.apply("l")).isInstanceOf(LongType.class); + assertThat(fn.apply("d")).isInstanceOf(DateType.class); + assertThat(fn.apply("t")).isInstanceOf(TimeType.class); + assertThat(fn.apply("ts")).isInstanceOf(TimestampType.class); + assertThat(fn.apply("tsz")).isInstanceOf(TimestampType.class); + assertThat(fn.apply("fl")).isInstanceOf(FloatType.class); + assertThat(fn.apply("do")).isInstanceOf(DoubleType.class); + assertThat(fn.apply("dec")).isInstanceOf(DecimalType.class); + assertThat(fn.apply("s")).isInstanceOf(StringType.class); + assertThat(fn.apply("u")).isInstanceOf(StringType.class); + assertThat(fn.apply("f")).isInstanceOf(BinaryType.class); + assertThat(fn.apply("b")).isInstanceOf(BinaryType.class); + assertThat(fn.apply("li")).isInstanceOf(ListType.class); + assertThat(fn.apply("ma")).isInstanceOf(MapType.class); + } + + @Test + public void testEvolveTypeDetectionStruct() { + org.apache.iceberg.Schema tableSchema = + new org.apache.iceberg.Schema( + Types.NestedField.required(1, "ii", IntegerType.get()), + Types.NestedField.required(2, "ff", FloatType.get())); + + Table table = mock(Table.class); + when(table.schema()).thenReturn(tableSchema); + RecordConverter converter = new RecordConverter(table, config); + + Schema valueSchema = + SchemaBuilder.struct().field("ii", Schema.INT64_SCHEMA).field("ff", Schema.FLOAT64_SCHEMA); + Struct data = new Struct(valueSchema).put("ii", 11L).put("ff", 22d); + + SchemaUpdate.Consumer consumer = new SchemaUpdate.Consumer(); + converter.convert(data, consumer); + Collection updates = consumer.updateTypes(); + + assertThat(updates).hasSize(2); + + Map updateMap = Maps.newHashMap(); + updates.forEach(update -> updateMap.put(update.name(), update)); + + assertThat(updateMap.get("ii").type()).isInstanceOf(LongType.class); + assertThat(updateMap.get("ff").type()).isInstanceOf(DoubleType.class); + } + + @Test + public void testEvolveTypeDetectionStructNested() { + org.apache.iceberg.Schema structColSchema = + new org.apache.iceberg.Schema( + Types.NestedField.required(1, "ii", IntegerType.get()), + Types.NestedField.required(2, "ff", FloatType.get())); + + org.apache.iceberg.Schema tableSchema = + new org.apache.iceberg.Schema( + Types.NestedField.required(3, "i", IntegerType.get()), + Types.NestedField.required(4, "st", structColSchema.asStruct())); + + Table table = mock(Table.class); + when(table.schema()).thenReturn(tableSchema); + RecordConverter converter = new RecordConverter(table, config); + + Schema structSchema = + SchemaBuilder.struct().field("ii", Schema.INT64_SCHEMA).field("ff", Schema.FLOAT64_SCHEMA); + Schema schema = + SchemaBuilder.struct().field("i", Schema.INT32_SCHEMA).field("st", structSchema); + Struct structValue = new Struct(structSchema).put("ii", 11L).put("ff", 22d); + Struct data = new Struct(schema).put("i", 1).put("st", structValue); + + SchemaUpdate.Consumer consumer = new SchemaUpdate.Consumer(); + converter.convert(data, consumer); + Collection updates = consumer.updateTypes(); + + assertThat(updates).hasSize(2); + + Map updateMap = Maps.newHashMap(); + updates.forEach(update -> updateMap.put(update.name(), update)); + + assertThat(updateMap.get("st.ii").type()).isInstanceOf(LongType.class); + assertThat(updateMap.get("st.ff").type()).isInstanceOf(DoubleType.class); + } + + private Map createMapData() { + return ImmutableMap.builder() + .put("i", 1) + .put("l", 2L) + .put("d", DATE_VAL.toString()) + .put("t", TIME_VAL.toString()) + .put("ts", TS_VAL.toString()) + .put("tsz", TSZ_VAL.toString()) + .put("fl", 1.1f) + .put("do", 2.2d) + .put("dec", DEC_VAL.toString()) + .put("s", STR_VAL) + .put("u", UUID_VAL.toString()) + .put("f", Base64.getEncoder().encodeToString(BYTES_VAL.array())) + .put("b", Base64.getEncoder().encodeToString(BYTES_VAL.array())) + .put("li", LIST_VAL) + .put("ma", MAP_VAL) + .build(); + } + + private Map createNestedMapData() { + return ImmutableMap.builder().put("ii", 11).put("st", createMapData()).build(); + } + + private Struct createStructData() { + return new Struct(CONNECT_SCHEMA) + .put("i", 1) + .put("l", 2L) + .put("d", new Date(DATE_VAL.toEpochDay() * 24 * 60 * 60 * 1000L)) + .put("t", new Date(TIME_VAL.toNanoOfDay() / 1_000_000)) + .put("ts", Date.from(TS_VAL.atZone(ZoneOffset.UTC).toInstant())) + .put("tsz", Date.from(TSZ_VAL.toInstant())) + .put("fl", 1.1f) + .put("do", 2.2d) + .put("dec", DEC_VAL) + .put("s", STR_VAL) + .put("u", UUID_VAL.toString()) + .put("f", BYTES_VAL.array()) + .put("b", BYTES_VAL.array()) + .put("li", LIST_VAL) + .put("ma", MAP_VAL); + } + + private Struct createNestedStructData() { + return new Struct(CONNECT_NESTED_SCHEMA).put("ii", 11).put("st", createStructData()); + } + + private void assertRecordValues(Record record) { + GenericRecord rec = (GenericRecord) record; + assertThat(rec.getField("i")).isEqualTo(1); + assertThat(rec.getField("l")).isEqualTo(2L); + assertThat(rec.getField("d")).isEqualTo(DATE_VAL); + assertThat(rec.getField("t")).isEqualTo(TIME_VAL); + assertThat(rec.getField("ts")).isEqualTo(TS_VAL); + assertThat(rec.getField("tsz")).isEqualTo(TSZ_VAL); + assertThat(rec.getField("fl")).isEqualTo(1.1f); + assertThat(rec.getField("do")).isEqualTo(2.2d); + assertThat(rec.getField("dec")).isEqualTo(DEC_VAL); + assertThat(rec.getField("s")).isEqualTo(STR_VAL); + assertThat(rec.getField("u")).isEqualTo(UUID_VAL); + assertThat(rec.getField("f")).isEqualTo(BYTES_VAL); + assertThat(rec.getField("b")).isEqualTo(BYTES_VAL); + assertThat(rec.getField("li")).isEqualTo(LIST_VAL); + assertThat(rec.getField("ma")).isEqualTo(MAP_VAL); + } + + private void assertNestedRecordValues(Record record) { + GenericRecord rec = (GenericRecord) record; + assertThat(rec.getField("ii")).isEqualTo(11); + assertRecordValues((GenericRecord) rec.getField("st")); + } +} diff --git a/kafka-connect/kafka-connect/src/test/java/org/apache/iceberg/connect/data/UnpartitionedDeltaWriterTest.java b/kafka-connect/kafka-connect/src/test/java/org/apache/iceberg/connect/data/UnpartitionedDeltaWriterTest.java new file mode 100644 index 000000000000..947d965bffcd --- /dev/null +++ b/kafka-connect/kafka-connect/src/test/java/org/apache/iceberg/connect/data/UnpartitionedDeltaWriterTest.java @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.connect.data; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.connect.IcebergSinkConfig; +import org.apache.iceberg.connect.TableSinkConfig; +import org.apache.iceberg.data.GenericRecord; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.io.WriteResult; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; + +public class UnpartitionedDeltaWriterTest extends BaseWriterTest { + + @ParameterizedTest + @ValueSource(strings = {"parquet", "orc"}) + public void testUnpartitionedDeltaWriter(String format) { + IcebergSinkConfig config = mock(IcebergSinkConfig.class); + when(config.upsertModeEnabled()).thenReturn(true); + when(config.tableConfig(any())).thenReturn(mock(TableSinkConfig.class)); + when(config.writeProps()).thenReturn(ImmutableMap.of("write.format.default", format)); + + Record row = GenericRecord.create(SCHEMA); + row.setField("id", 123L); + row.setField("data", "hello world!"); + row.setField("id2", 123L); + + WriteResult result = writeTest(ImmutableList.of(row), config, UnpartitionedDeltaWriter.class); + + // in upsert mode, each write is a delete + append, so we'll have 1 data file + // and 1 delete file + assertThat(result.dataFiles()).hasSize(1); + assertThat(result.dataFiles()).allMatch(file -> file.format() == FileFormat.fromString(format)); + assertThat(result.deleteFiles()).hasSize(1); + assertThat(result.deleteFiles()) + .allMatch(file -> file.format() == FileFormat.fromString(format)); + } +} From 610a731fe635b26a56ddef1acddada49544d4d3b Mon Sep 17 00:00:00 2001 From: Bryan Keller Date: Mon, 5 Feb 2024 07:04:38 -0800 Subject: [PATCH 02/15] duplicate comment --- .../main/java/org/apache/iceberg/connect/data/IcebergWriter.java | 1 - 1 file changed, 1 deletion(-) diff --git a/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/IcebergWriter.java b/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/IcebergWriter.java index d0c61e016889..591c1cb01058 100644 --- a/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/IcebergWriter.java +++ b/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/IcebergWriter.java @@ -57,7 +57,6 @@ private void initNewWriter() { @Override public void write(SinkRecord record) { try { - // TODO: config to handle tombstones instead of always ignoring? // TODO: config to handle tombstones instead of always ignoring? if (record.value() != null) { Record row = convertToRow(record); From 51f5f3c2a5333728ba2cb6fc916886195696b02e Mon Sep 17 00:00:00 2001 From: Bryan Keller Date: Tue, 6 Feb 2024 09:48:58 -0800 Subject: [PATCH 03/15] add exception messages --- .../apache/iceberg/connect/data/RecordProjection.java | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/RecordProjection.java b/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/RecordProjection.java index 79ce2c111a3a..41972f2ccede 100644 --- a/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/RecordProjection.java +++ b/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/RecordProjection.java @@ -164,7 +164,7 @@ public T get(int pos, Class javaClass) { @Override public void set(int pos, T value) { - throw new UnsupportedOperationException(); + throw new UnsupportedOperationException("RecordProjection.set(int, Object) is not supported"); } @Override @@ -174,12 +174,13 @@ public StructType struct() { @Override public Object getField(String name) { - throw new UnsupportedOperationException(); + throw new UnsupportedOperationException("RecordProjection.getField(String) is not supported"); } @Override public void setField(String name, Object value) { - throw new UnsupportedOperationException(); + throw new UnsupportedOperationException( + "RecordProjection.setField(String, Object) is not supported"); } @Override @@ -189,11 +190,11 @@ public Object get(int pos) { @Override public Record copy() { - throw new UnsupportedOperationException(); + throw new UnsupportedOperationException("RecordProjection.copy() is not supported"); } @Override public Record copy(Map overwriteValues) { - throw new UnsupportedOperationException(); + throw new UnsupportedOperationException("RecordProjection.copy(Map) is not supported"); } } From 109ae1d372f9771c6a06d04f0b0aa5f89cab3281 Mon Sep 17 00:00:00 2001 From: Bryan Keller Date: Tue, 6 Feb 2024 10:01:54 -0800 Subject: [PATCH 04/15] add boolean convert test assertions --- .../connect/data/RecordConverterTest.java | 101 ++++++++++-------- 1 file changed, 55 insertions(+), 46 deletions(-) diff --git a/kafka-connect/kafka-connect/src/test/java/org/apache/iceberg/connect/data/RecordConverterTest.java b/kafka-connect/kafka-connect/src/test/java/org/apache/iceberg/connect/data/RecordConverterTest.java index a61e7324a778..90404205de0c 100644 --- a/kafka-connect/kafka-connect/src/test/java/org/apache/iceberg/connect/data/RecordConverterTest.java +++ b/kafka-connect/kafka-connect/src/test/java/org/apache/iceberg/connect/data/RecordConverterTest.java @@ -54,20 +54,23 @@ import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.types.Type; -import org.apache.iceberg.types.Types; import org.apache.iceberg.types.Types.BinaryType; +import org.apache.iceberg.types.Types.BooleanType; import org.apache.iceberg.types.Types.DateType; import org.apache.iceberg.types.Types.DecimalType; import org.apache.iceberg.types.Types.DoubleType; +import org.apache.iceberg.types.Types.FixedType; import org.apache.iceberg.types.Types.FloatType; import org.apache.iceberg.types.Types.IntegerType; import org.apache.iceberg.types.Types.ListType; import org.apache.iceberg.types.Types.LongType; import org.apache.iceberg.types.Types.MapType; +import org.apache.iceberg.types.Types.NestedField; import org.apache.iceberg.types.Types.StringType; import org.apache.iceberg.types.Types.StructType; import org.apache.iceberg.types.Types.TimeType; import org.apache.iceberg.types.Types.TimestampType; +import org.apache.iceberg.types.Types.UUIDType; import org.apache.kafka.connect.data.Decimal; import org.apache.kafka.connect.data.Schema; import org.apache.kafka.connect.data.SchemaBuilder; @@ -89,59 +92,59 @@ public class RecordConverterTest { private static final org.apache.iceberg.Schema SCHEMA = new org.apache.iceberg.Schema( - Types.NestedField.required(21, "i", IntegerType.get()), - Types.NestedField.required(22, "l", LongType.get()), - Types.NestedField.required(23, "d", DateType.get()), - Types.NestedField.required(24, "t", TimeType.get()), - Types.NestedField.required(25, "ts", TimestampType.withoutZone()), - Types.NestedField.required(26, "tsz", TimestampType.withZone()), - Types.NestedField.required(27, "fl", FloatType.get()), - Types.NestedField.required(28, "do", DoubleType.get()), - Types.NestedField.required(29, "dec", DecimalType.of(9, 2)), - Types.NestedField.required(30, "s", StringType.get()), - Types.NestedField.required(31, "u", Types.UUIDType.get()), - Types.NestedField.required(32, "f", Types.FixedType.ofLength(3)), - Types.NestedField.required(33, "b", BinaryType.get()), - Types.NestedField.required(34, "li", ListType.ofRequired(35, StringType.get())), - Types.NestedField.required( + NestedField.required(20, "i", IntegerType.get()), + NestedField.required(21, "l", LongType.get()), + NestedField.required(22, "d", DateType.get()), + NestedField.required(23, "t", TimeType.get()), + NestedField.required(24, "ts", TimestampType.withoutZone()), + NestedField.required(25, "tsz", TimestampType.withZone()), + NestedField.required(26, "fl", FloatType.get()), + NestedField.required(27, "do", DoubleType.get()), + NestedField.required(28, "dec", DecimalType.of(9, 2)), + NestedField.required(29, "s", StringType.get()), + NestedField.required(30, "b", BooleanType.get()), + NestedField.required(31, "u", UUIDType.get()), + NestedField.required(32, "f", FixedType.ofLength(3)), + NestedField.required(33, "bi", BinaryType.get()), + NestedField.required(34, "li", ListType.ofRequired(35, StringType.get())), + NestedField.required( 36, "ma", MapType.ofRequired(37, 38, StringType.get(), StringType.get())), - Types.NestedField.optional(39, "extra", StringType.get())); + NestedField.optional(39, "extra", StringType.get())); // we have 1 unmapped column so exclude that from the count private static final int MAPPED_CNT = SCHEMA.columns().size() - 1; private static final org.apache.iceberg.Schema NESTED_SCHEMA = new org.apache.iceberg.Schema( - Types.NestedField.required(1, "ii", IntegerType.get()), - Types.NestedField.required(2, "st", SCHEMA.asStruct())); + NestedField.required(1, "ii", IntegerType.get()), + NestedField.required(2, "st", SCHEMA.asStruct())); private static final org.apache.iceberg.Schema SIMPLE_SCHEMA = new org.apache.iceberg.Schema( - Types.NestedField.required(1, "ii", IntegerType.get()), - Types.NestedField.required(2, "st", StringType.get())); + NestedField.required(1, "ii", IntegerType.get()), + NestedField.required(2, "st", StringType.get())); private static final org.apache.iceberg.Schema ID_SCHEMA = - new org.apache.iceberg.Schema(Types.NestedField.required(1, "ii", IntegerType.get())); + new org.apache.iceberg.Schema(NestedField.required(1, "ii", IntegerType.get())); private static final org.apache.iceberg.Schema STRUCT_IN_LIST_SCHEMA = new org.apache.iceberg.Schema( - Types.NestedField.required( - 100, "stli", ListType.ofRequired(101, NESTED_SCHEMA.asStruct()))); + NestedField.required(100, "stli", ListType.ofRequired(101, NESTED_SCHEMA.asStruct()))); private static final org.apache.iceberg.Schema STRUCT_IN_LIST_BASIC_SCHEMA = new org.apache.iceberg.Schema( - Types.NestedField.required(100, "stli", ListType.ofRequired(101, ID_SCHEMA.asStruct()))); + NestedField.required(100, "stli", ListType.ofRequired(101, ID_SCHEMA.asStruct()))); private static final org.apache.iceberg.Schema STRUCT_IN_MAP_SCHEMA = new org.apache.iceberg.Schema( - Types.NestedField.required( + NestedField.required( 100, "stma", MapType.ofRequired(101, 102, StringType.get(), NESTED_SCHEMA.asStruct()))); private static final org.apache.iceberg.Schema STRUCT_IN_MAP_BASIC_SCHEMA = new org.apache.iceberg.Schema( - Types.NestedField.required( + NestedField.required( 100, "stma", MapType.ofRequired(101, 102, StringType.get(), ID_SCHEMA.asStruct()))); private static final Schema CONNECT_SCHEMA = @@ -156,9 +159,10 @@ public class RecordConverterTest { .field("do", Schema.FLOAT64_SCHEMA) .field("dec", Decimal.schema(2)) .field("s", Schema.STRING_SCHEMA) + .field("b", Schema.BOOLEAN_SCHEMA) .field("u", Schema.STRING_SCHEMA) .field("f", Schema.BYTES_SCHEMA) - .field("b", Schema.BYTES_SCHEMA) + .field("bi", Schema.BYTES_SCHEMA) .field("li", SchemaBuilder.array(Schema.STRING_SCHEMA)) .field("ma", SchemaBuilder.map(Schema.STRING_SCHEMA, Schema.STRING_SCHEMA)); @@ -509,7 +513,7 @@ public void testMissingColumnDetectionMap() { converter.convert(data, consumer); Collection addCols = consumer.addColumns(); - assertThat(addCols).hasSize(15); + assertThat(addCols).hasSize(MAPPED_CNT); Map newColMap = Maps.newHashMap(); addCols.forEach(addCol -> newColMap.put(addCol.name(), addCol)); @@ -537,7 +541,7 @@ public void testMissingColumnDetectionMapNested() { assertThat(addCol.name()).isEqualTo("st"); StructType addedType = addCol.type().asStructType(); - assertThat(addedType.fields()).hasSize(15); + assertThat(addedType.fields()).hasSize(MAPPED_CNT); assertTypesAddedFromMap(col -> addedType.field(col).type()); } @@ -560,7 +564,7 @@ public void testMissingColumnDetectionMapListValue() { assertThat(addCol.name()).isEqualTo("st"); StructType nestedElementType = addCol.type().asStructType(); - assertThat(nestedElementType.fields()).hasSize(15); + assertThat(nestedElementType.fields()).hasSize(MAPPED_CNT); assertTypesAddedFromMap(col -> nestedElementType.field(col).type()); } @@ -575,9 +579,10 @@ private void assertTypesAddedFromMap(Function fn) { assertThat(fn.apply("do")).isInstanceOf(DoubleType.class); assertThat(fn.apply("dec")).isInstanceOf(StringType.class); assertThat(fn.apply("s")).isInstanceOf(StringType.class); + assertThat(fn.apply("b")).isInstanceOf(BooleanType.class); assertThat(fn.apply("u")).isInstanceOf(StringType.class); assertThat(fn.apply("f")).isInstanceOf(StringType.class); - assertThat(fn.apply("b")).isInstanceOf(StringType.class); + assertThat(fn.apply("bi")).isInstanceOf(StringType.class); assertThat(fn.apply("li")).isInstanceOf(ListType.class); assertThat(fn.apply("ma")).isInstanceOf(StructType.class); } @@ -593,7 +598,7 @@ public void testMissingColumnDetectionStruct() { converter.convert(data, consumer); Collection addCols = consumer.addColumns(); - assertThat(addCols).hasSize(15); + assertThat(addCols).hasSize(MAPPED_CNT); Map newColMap = Maps.newHashMap(); addCols.forEach(addCol -> newColMap.put(addCol.name(), addCol)); @@ -618,7 +623,7 @@ public void testMissingColumnDetectionStructNested() { assertThat(addCol.name()).isEqualTo("st"); StructType addedType = addCol.type().asStructType(); - assertThat(addedType.fields()).hasSize(15); + assertThat(addedType.fields()).hasSize(MAPPED_CNT); assertTypesAddedFromStruct(col -> addedType.field(col).type()); } @@ -643,7 +648,7 @@ public void testMissingColumnDetectionStructListValue() { assertThat(addCol.name()).isEqualTo("st"); StructType nestedElementType = addCol.type().asStructType(); - assertThat(nestedElementType.fields()).hasSize(15); + assertThat(nestedElementType.fields()).hasSize(MAPPED_CNT); assertTypesAddedFromStruct(col -> nestedElementType.field(col).type()); } @@ -668,7 +673,7 @@ public void testMissingColumnDetectionStructMapValue() { assertThat(addCol.name()).isEqualTo("st"); StructType nestedValueType = addCol.type().asStructType(); - assertThat(nestedValueType.fields()).hasSize(15); + assertThat(nestedValueType.fields()).hasSize(MAPPED_CNT); assertTypesAddedFromStruct(col -> nestedValueType.field(col).type()); } @@ -683,9 +688,10 @@ private void assertTypesAddedFromStruct(Function fn) { assertThat(fn.apply("do")).isInstanceOf(DoubleType.class); assertThat(fn.apply("dec")).isInstanceOf(DecimalType.class); assertThat(fn.apply("s")).isInstanceOf(StringType.class); + assertThat(fn.apply("b")).isInstanceOf(BooleanType.class); assertThat(fn.apply("u")).isInstanceOf(StringType.class); assertThat(fn.apply("f")).isInstanceOf(BinaryType.class); - assertThat(fn.apply("b")).isInstanceOf(BinaryType.class); + assertThat(fn.apply("bi")).isInstanceOf(BinaryType.class); assertThat(fn.apply("li")).isInstanceOf(ListType.class); assertThat(fn.apply("ma")).isInstanceOf(MapType.class); } @@ -694,8 +700,8 @@ private void assertTypesAddedFromStruct(Function fn) { public void testEvolveTypeDetectionStruct() { org.apache.iceberg.Schema tableSchema = new org.apache.iceberg.Schema( - Types.NestedField.required(1, "ii", IntegerType.get()), - Types.NestedField.required(2, "ff", FloatType.get())); + NestedField.required(1, "ii", IntegerType.get()), + NestedField.required(2, "ff", FloatType.get())); Table table = mock(Table.class); when(table.schema()).thenReturn(tableSchema); @@ -722,13 +728,13 @@ public void testEvolveTypeDetectionStruct() { public void testEvolveTypeDetectionStructNested() { org.apache.iceberg.Schema structColSchema = new org.apache.iceberg.Schema( - Types.NestedField.required(1, "ii", IntegerType.get()), - Types.NestedField.required(2, "ff", FloatType.get())); + NestedField.required(1, "ii", IntegerType.get()), + NestedField.required(2, "ff", FloatType.get())); org.apache.iceberg.Schema tableSchema = new org.apache.iceberg.Schema( - Types.NestedField.required(3, "i", IntegerType.get()), - Types.NestedField.required(4, "st", structColSchema.asStruct())); + NestedField.required(3, "i", IntegerType.get()), + NestedField.required(4, "st", structColSchema.asStruct())); Table table = mock(Table.class); when(table.schema()).thenReturn(tableSchema); @@ -766,9 +772,10 @@ private Map createMapData() { .put("do", 2.2d) .put("dec", DEC_VAL.toString()) .put("s", STR_VAL) + .put("b", true) .put("u", UUID_VAL.toString()) .put("f", Base64.getEncoder().encodeToString(BYTES_VAL.array())) - .put("b", Base64.getEncoder().encodeToString(BYTES_VAL.array())) + .put("bi", Base64.getEncoder().encodeToString(BYTES_VAL.array())) .put("li", LIST_VAL) .put("ma", MAP_VAL) .build(); @@ -790,9 +797,10 @@ private Struct createStructData() { .put("do", 2.2d) .put("dec", DEC_VAL) .put("s", STR_VAL) + .put("b", true) .put("u", UUID_VAL.toString()) .put("f", BYTES_VAL.array()) - .put("b", BYTES_VAL.array()) + .put("bi", BYTES_VAL.array()) .put("li", LIST_VAL) .put("ma", MAP_VAL); } @@ -813,9 +821,10 @@ private void assertRecordValues(Record record) { assertThat(rec.getField("do")).isEqualTo(2.2d); assertThat(rec.getField("dec")).isEqualTo(DEC_VAL); assertThat(rec.getField("s")).isEqualTo(STR_VAL); + assertThat(rec.getField("b")).isEqualTo(true); assertThat(rec.getField("u")).isEqualTo(UUID_VAL); assertThat(rec.getField("f")).isEqualTo(BYTES_VAL); - assertThat(rec.getField("b")).isEqualTo(BYTES_VAL); + assertThat(rec.getField("bi")).isEqualTo(BYTES_VAL); assertThat(rec.getField("li")).isEqualTo(LIST_VAL); assertThat(rec.getField("ma")).isEqualTo(MAP_VAL); } From 4b052b93106715417d0b49cd5ba4287ec699398e Mon Sep 17 00:00:00 2001 From: Bryan Keller Date: Tue, 6 Feb 2024 10:04:06 -0800 Subject: [PATCH 05/15] remove static block --- .../apache/iceberg/connect/data/RecordConverterTest.java | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/kafka-connect/kafka-connect/src/test/java/org/apache/iceberg/connect/data/RecordConverterTest.java b/kafka-connect/kafka-connect/src/test/java/org/apache/iceberg/connect/data/RecordConverterTest.java index 90404205de0c..bc2e4df67ec6 100644 --- a/kafka-connect/kafka-connect/src/test/java/org/apache/iceberg/connect/data/RecordConverterTest.java +++ b/kafka-connect/kafka-connect/src/test/java/org/apache/iceberg/connect/data/RecordConverterTest.java @@ -81,6 +81,7 @@ import org.apache.kafka.connect.json.JsonConverterConfig; import org.apache.kafka.connect.storage.ConverterConfig; import org.apache.kafka.connect.storage.ConverterType; +import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.junit.jupiter.params.ParameterizedTest; @@ -190,7 +191,10 @@ public class RecordConverterTest { private static final JsonConverter JSON_CONVERTER = new JsonConverter(); - static { + private IcebergSinkConfig config; + + @BeforeAll + public static void beforeAll() { JSON_CONVERTER.configure( ImmutableMap.of( JsonConverterConfig.SCHEMAS_ENABLE_CONFIG, @@ -199,8 +203,6 @@ public class RecordConverterTest { ConverterType.VALUE.getName())); } - private IcebergSinkConfig config; - @BeforeEach public void before() { this.config = mock(IcebergSinkConfig.class); From 4e8f03799ddf02511598ff65c12ea035b2cc961f Mon Sep 17 00:00:00 2001 From: Bryan Keller Date: Wed, 7 Feb 2024 08:38:02 -0800 Subject: [PATCH 06/15] remove unneeded TODO comments --- .../java/org/apache/iceberg/connect/data/IcebergWriter.java | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/IcebergWriter.java b/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/IcebergWriter.java index 591c1cb01058..8c8ef9e67332 100644 --- a/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/IcebergWriter.java +++ b/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/IcebergWriter.java @@ -57,7 +57,7 @@ private void initNewWriter() { @Override public void write(SinkRecord record) { try { - // TODO: config to handle tombstones instead of always ignoring? + // ignore tombstones... if (record.value() != null) { Record row = convertToRow(record); String cdcField = config.tablesCdcField(); @@ -111,8 +111,6 @@ private Operation extractCdcOperation(Object recordValue, String cdcField) { return Operation.INSERT; } - // TODO: define value mapping in config? - switch (opStr.charAt(0)) { case 'U': return Operation.UPDATE; From 9d61eff6ea0c3a1a0b56cf7cf4b3bd2a7f482e61 Mon Sep 17 00:00:00 2001 From: Bryan Keller Date: Mon, 12 Feb 2024 09:45:08 -0800 Subject: [PATCH 07/15] add more tests for number conversions --- .../connect/data/RecordConverterTest.java | 68 +++++++++++++++++++ 1 file changed, 68 insertions(+) diff --git a/kafka-connect/kafka-connect/src/test/java/org/apache/iceberg/connect/data/RecordConverterTest.java b/kafka-connect/kafka-connect/src/test/java/org/apache/iceberg/connect/data/RecordConverterTest.java index bc2e4df67ec6..180b02166958 100644 --- a/kafka-connect/kafka-connect/src/test/java/org/apache/iceberg/connect/data/RecordConverterTest.java +++ b/kafka-connect/kafka-connect/src/test/java/org/apache/iceberg/connect/data/RecordConverterTest.java @@ -389,6 +389,74 @@ public void testCaseSensitivity(boolean caseInsensitive) { } } + @Test + public void testIntConversion() { + Table table = mock(Table.class); + when(table.schema()).thenReturn(SIMPLE_SCHEMA); + + RecordConverter converter = new RecordConverter(table, config); + + int expectedInt = 123; + + ImmutableList.of("123", 123.0f, 123.0d, 123L, expectedInt) + .forEach( + input -> { + int i = converter.convertInt(input); + assertThat(i).isEqualTo(expectedInt); + }); + } + + @Test + public void testLongConversion() { + Table table = mock(Table.class); + when(table.schema()).thenReturn(SIMPLE_SCHEMA); + + RecordConverter converter = new RecordConverter(table, config); + + long expectedLong = 123L; + + ImmutableList.of("123", 123.0f, 123.0d, 123, expectedLong) + .forEach( + input -> { + long l = converter.convertLong(input); + assertThat(l).isEqualTo(expectedLong); + }); + } + + @Test + public void testFloatConversion() { + Table table = mock(Table.class); + when(table.schema()).thenReturn(SIMPLE_SCHEMA); + + RecordConverter converter = new RecordConverter(table, config); + + float expectedFloat = 123f; + + ImmutableList.of("123", 123, 123L, 123d, expectedFloat) + .forEach( + input -> { + float f = converter.convertFloat(input); + assertThat(f).isEqualTo(expectedFloat); + }); + } + + @Test + public void testDoubleConversion() { + Table table = mock(Table.class); + when(table.schema()).thenReturn(SIMPLE_SCHEMA); + + RecordConverter converter = new RecordConverter(table, config); + + double expectedDouble = 123d; + + ImmutableList.of("123", 123, 123L, 123f, expectedDouble) + .forEach( + input -> { + double d = converter.convertDouble(input); + assertThat(d).isEqualTo(expectedDouble); + }); + } + @Test public void testDecimalConversion() { Table table = mock(Table.class); From fa0340298f46a095d5a31fae82922b69531984c7 Mon Sep 17 00:00:00 2001 From: Bryan Keller Date: Mon, 12 Feb 2024 09:47:59 -0800 Subject: [PATCH 08/15] checkstyle --- .../connect/data/RecordConverterTest.java | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/kafka-connect/kafka-connect/src/test/java/org/apache/iceberg/connect/data/RecordConverterTest.java b/kafka-connect/kafka-connect/src/test/java/org/apache/iceberg/connect/data/RecordConverterTest.java index 180b02166958..fc27b542dc68 100644 --- a/kafka-connect/kafka-connect/src/test/java/org/apache/iceberg/connect/data/RecordConverterTest.java +++ b/kafka-connect/kafka-connect/src/test/java/org/apache/iceberg/connect/data/RecordConverterTest.java @@ -401,8 +401,8 @@ public void testIntConversion() { ImmutableList.of("123", 123.0f, 123.0d, 123L, expectedInt) .forEach( input -> { - int i = converter.convertInt(input); - assertThat(i).isEqualTo(expectedInt); + int val = converter.convertInt(input); + assertThat(val).isEqualTo(expectedInt); }); } @@ -418,8 +418,8 @@ public void testLongConversion() { ImmutableList.of("123", 123.0f, 123.0d, 123, expectedLong) .forEach( input -> { - long l = converter.convertLong(input); - assertThat(l).isEqualTo(expectedLong); + long val = converter.convertLong(input); + assertThat(val).isEqualTo(expectedLong); }); } @@ -435,8 +435,8 @@ public void testFloatConversion() { ImmutableList.of("123", 123, 123L, 123d, expectedFloat) .forEach( input -> { - float f = converter.convertFloat(input); - assertThat(f).isEqualTo(expectedFloat); + float val = converter.convertFloat(input); + assertThat(val).isEqualTo(expectedFloat); }); } @@ -452,8 +452,8 @@ public void testDoubleConversion() { ImmutableList.of("123", 123, 123L, 123f, expectedDouble) .forEach( input -> { - double d = converter.convertDouble(input); - assertThat(d).isEqualTo(expectedDouble); + double val = converter.convertDouble(input); + assertThat(val).isEqualTo(expectedDouble); }); } From 48e11023dca4da666f1f4747b28247d692060f47 Mon Sep 17 00:00:00 2001 From: Bryan Keller Date: Wed, 28 Feb 2024 16:30:30 -0800 Subject: [PATCH 09/15] remove delta writers --- .../iceberg/connect/IcebergSinkConfig.java | 23 -- .../connect/data/BaseDeltaTaskWriter.java | 102 --------- .../iceberg/connect/data/IcebergWriter.java | 30 +-- .../iceberg/connect/data/Operation.java | 25 --- .../connect/data/PartitionedDeltaWriter.java | 93 -------- .../connect/data/RecordProjection.java | 200 ------------------ .../iceberg/connect/data/RecordWrapper.java | 83 -------- .../data/UnpartitionedDeltaWriter.java | 66 ------ .../iceberg/connect/data/Utilities.java | 52 ++--- .../data/PartitionedDeltaWriterTest.java | 70 ------ .../data/UnpartitionedDeltaWriterTest.java | 62 ------ 11 files changed, 13 insertions(+), 793 deletions(-) delete mode 100644 kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/BaseDeltaTaskWriter.java delete mode 100644 kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/Operation.java delete mode 100644 kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/PartitionedDeltaWriter.java delete mode 100644 kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/RecordProjection.java delete mode 100644 kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/RecordWrapper.java delete mode 100644 kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/UnpartitionedDeltaWriter.java delete mode 100644 kafka-connect/kafka-connect/src/test/java/org/apache/iceberg/connect/data/PartitionedDeltaWriterTest.java delete mode 100644 kafka-connect/kafka-connect/src/test/java/org/apache/iceberg/connect/data/UnpartitionedDeltaWriterTest.java diff --git a/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/IcebergSinkConfig.java b/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/IcebergSinkConfig.java index dd9f9e401446..d1572fbff37b 100644 --- a/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/IcebergSinkConfig.java +++ b/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/IcebergSinkConfig.java @@ -71,9 +71,6 @@ public class IcebergSinkConfig extends AbstractConfig { private static final String TABLES_DEFAULT_COMMIT_BRANCH = "iceberg.tables.default-commit-branch"; private static final String TABLES_DEFAULT_ID_COLUMNS = "iceberg.tables.default-id-columns"; private static final String TABLES_DEFAULT_PARTITION_BY = "iceberg.tables.default-partition-by"; - private static final String TABLES_CDC_FIELD_PROP = "iceberg.tables.cdc-field"; - private static final String TABLES_UPSERT_MODE_ENABLED_PROP = - "iceberg.tables.upsert-mode-enabled"; private static final String TABLES_AUTO_CREATE_ENABLED_PROP = "iceberg.tables.auto-create-enabled"; private static final String TABLES_EVOLVE_SCHEMA_ENABLED_PROP = @@ -152,18 +149,6 @@ private static ConfigDef newConfigDef() { null, Importance.MEDIUM, "Default partition spec to use when creating tables, comma-separated"); - configDef.define( - TABLES_CDC_FIELD_PROP, - ConfigDef.Type.STRING, - null, - Importance.MEDIUM, - "Source record field that identifies the type of operation (insert, update, or delete)"); - configDef.define( - TABLES_UPSERT_MODE_ENABLED_PROP, - ConfigDef.Type.BOOLEAN, - false, - Importance.MEDIUM, - "Set to true to treat all appends as upserts, false otherwise"); configDef.define( TABLES_AUTO_CREATE_ENABLED_PROP, ConfigDef.Type.BOOLEAN, @@ -422,14 +407,6 @@ public String hadoopConfDir() { return getString(HADDOP_CONF_DIR_PROP); } - public String tablesCdcField() { - return getString(TABLES_CDC_FIELD_PROP); - } - - public boolean upsertModeEnabled() { - return getBoolean(TABLES_UPSERT_MODE_ENABLED_PROP); - } - public boolean autoCreateEnabled() { return getBoolean(TABLES_AUTO_CREATE_ENABLED_PROP); } diff --git a/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/BaseDeltaTaskWriter.java b/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/BaseDeltaTaskWriter.java deleted file mode 100644 index 09cedea693f1..000000000000 --- a/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/BaseDeltaTaskWriter.java +++ /dev/null @@ -1,102 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.connect.data; - -import java.io.IOException; -import java.util.Set; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.PartitionKey; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.StructLike; -import org.apache.iceberg.data.InternalRecordWrapper; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.io.BaseTaskWriter; -import org.apache.iceberg.io.FileAppenderFactory; -import org.apache.iceberg.io.FileIO; -import org.apache.iceberg.io.OutputFileFactory; -import org.apache.iceberg.relocated.com.google.common.collect.Sets; -import org.apache.iceberg.types.TypeUtil; - -abstract class BaseDeltaTaskWriter extends BaseTaskWriter { - - private final Schema schema; - private final Schema deleteSchema; - private final InternalRecordWrapper wrapper; - private final InternalRecordWrapper keyWrapper; - private final RecordProjection keyProjection; - private final boolean upsertMode; - - BaseDeltaTaskWriter( - PartitionSpec spec, - FileFormat format, - FileAppenderFactory appenderFactory, - OutputFileFactory fileFactory, - FileIO io, - long targetFileSize, - Schema schema, - Set identifierFieldIds, - boolean upsertMode) { - super(spec, format, appenderFactory, fileFactory, io, targetFileSize); - this.schema = schema; - this.deleteSchema = TypeUtil.select(schema, Sets.newHashSet(identifierFieldIds)); - this.wrapper = new InternalRecordWrapper(schema.asStruct()); - this.keyWrapper = new InternalRecordWrapper(deleteSchema.asStruct()); - this.keyProjection = RecordProjection.create(schema, deleteSchema); - this.upsertMode = upsertMode; - } - - abstract RowDataDeltaWriter route(Record row); - - InternalRecordWrapper wrapper() { - return wrapper; - } - - @Override - public void write(Record row) throws IOException { - Operation op = - row instanceof RecordWrapper - ? ((RecordWrapper) row).op() - : upsertMode ? Operation.UPDATE : Operation.INSERT; - RowDataDeltaWriter writer = route(row); - if (op == Operation.UPDATE || op == Operation.DELETE) { - writer.deleteKey(keyProjection.wrap(row)); - } - if (op == Operation.UPDATE || op == Operation.INSERT) { - writer.write(row); - } - } - - class RowDataDeltaWriter extends BaseEqualityDeltaWriter { - - RowDataDeltaWriter(PartitionKey partition) { - super(partition, schema, deleteSchema); - } - - @Override - protected StructLike asStructLike(Record data) { - return wrapper.wrap(data); - } - - @Override - protected StructLike asStructLikeKey(Record data) { - return keyWrapper.wrap(data); - } - } -} diff --git a/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/IcebergWriter.java b/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/IcebergWriter.java index 8c8ef9e67332..27ffc4de9973 100644 --- a/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/IcebergWriter.java +++ b/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/IcebergWriter.java @@ -60,13 +60,7 @@ public void write(SinkRecord record) { // ignore tombstones... if (record.value() != null) { Record row = convertToRow(record); - String cdcField = config.tablesCdcField(); - if (cdcField == null) { - writer.write(row); - } else { - Operation op = extractCdcOperation(record.value(), cdcField); - writer.write(new RecordWrapper(row, op)); - } + writer.write(row); } } catch (Exception e) { throw new DataException( @@ -99,28 +93,6 @@ private Record convertToRow(SinkRecord record) { return row; } - private Operation extractCdcOperation(Object recordValue, String cdcField) { - Object opValue = Utilities.extractFromRecordValue(recordValue, cdcField); - - if (opValue == null) { - return Operation.INSERT; - } - - String opStr = opValue.toString().trim().toUpperCase(); - if (opStr.isEmpty()) { - return Operation.INSERT; - } - - switch (opStr.charAt(0)) { - case 'U': - return Operation.UPDATE; - case 'D': - return Operation.DELETE; - default: - return Operation.INSERT; - } - } - private void flush() { WriteResult writeResult; try { diff --git a/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/Operation.java b/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/Operation.java deleted file mode 100644 index 7f428a6dc2b6..000000000000 --- a/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/Operation.java +++ /dev/null @@ -1,25 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.connect.data; - -public enum Operation { - INSERT, - UPDATE, - DELETE -} diff --git a/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/PartitionedDeltaWriter.java b/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/PartitionedDeltaWriter.java deleted file mode 100644 index eae5d116c6c1..000000000000 --- a/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/PartitionedDeltaWriter.java +++ /dev/null @@ -1,93 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.connect.data; - -import java.io.IOException; -import java.io.UncheckedIOException; -import java.util.Map; -import java.util.Set; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.PartitionKey; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.io.FileAppenderFactory; -import org.apache.iceberg.io.FileIO; -import org.apache.iceberg.io.OutputFileFactory; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.util.Tasks; - -public class PartitionedDeltaWriter extends BaseDeltaTaskWriter { - private final PartitionKey partitionKey; - - private final Map writers = Maps.newHashMap(); - - PartitionedDeltaWriter( - PartitionSpec spec, - FileFormat format, - FileAppenderFactory appenderFactory, - OutputFileFactory fileFactory, - FileIO io, - long targetFileSize, - Schema schema, - Set identifierFieldIds, - boolean upsertMode) { - super( - spec, - format, - appenderFactory, - fileFactory, - io, - targetFileSize, - schema, - identifierFieldIds, - upsertMode); - this.partitionKey = new PartitionKey(spec, schema); - } - - @Override - RowDataDeltaWriter route(Record row) { - partitionKey.partition(wrapper().wrap(row)); - - RowDataDeltaWriter writer = writers.get(partitionKey); - if (writer == null) { - // NOTE: pass a copy of the partition key to the writer to prevent it from - // being modified - PartitionKey copiedKey = partitionKey.copy(); - writer = new RowDataDeltaWriter(copiedKey); - writers.put(copiedKey, writer); - } - - return writer; - } - - @Override - public void close() { - try { - Tasks.foreach(writers.values()) - .throwFailureWhenFinished() - .noRetry() - .run(RowDataDeltaWriter::close, IOException.class); - - writers.clear(); - } catch (IOException e) { - throw new UncheckedIOException("Failed to close equality delta writer", e); - } - } -} diff --git a/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/RecordProjection.java b/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/RecordProjection.java deleted file mode 100644 index 41972f2ccede..000000000000 --- a/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/RecordProjection.java +++ /dev/null @@ -1,200 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.connect.data; - -import java.util.List; -import java.util.Map; -import org.apache.iceberg.Schema; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.types.Types.ListType; -import org.apache.iceberg.types.Types.MapType; -import org.apache.iceberg.types.Types.NestedField; -import org.apache.iceberg.types.Types.StructType; - -/** - * This is modified from {@link org.apache.iceberg.util.StructProjection} to support record types. - */ -public class RecordProjection implements Record { - - /** - * Creates a projecting wrapper for {@link Record} rows. - * - *

This projection does not work with repeated types like lists and maps. - * - * @param dataSchema schema of rows wrapped by this projection - * @param projectedSchema result schema of the projected rows - * @return a wrapper to project rows - */ - public static RecordProjection create(Schema dataSchema, Schema projectedSchema) { - return new RecordProjection(dataSchema.asStruct(), projectedSchema.asStruct()); - } - - private final StructType type; - private final int[] positionMap; - private final RecordProjection[] nestedProjections; - private Record record; - - private RecordProjection(StructType structType, StructType projection) { - this(structType, projection, false); - } - - @SuppressWarnings("checkstyle:CyclomaticComplexity") - private RecordProjection(StructType structType, StructType projection, boolean allowMissing) { - this.type = projection; - this.positionMap = new int[projection.fields().size()]; - this.nestedProjections = new RecordProjection[projection.fields().size()]; - - // set up the projection positions and any nested projections that are needed - List dataFields = structType.fields(); - for (int pos = 0; pos < positionMap.length; pos += 1) { - NestedField projectedField = projection.fields().get(pos); - - boolean found = false; - for (int i = 0; !found && i < dataFields.size(); i += 1) { - NestedField dataField = dataFields.get(i); - if (projectedField.fieldId() == dataField.fieldId()) { - found = true; - positionMap[pos] = i; - switch (projectedField.type().typeId()) { - case STRUCT: - nestedProjections[pos] = - new RecordProjection( - dataField.type().asStructType(), projectedField.type().asStructType()); - break; - case MAP: - MapType projectedMap = projectedField.type().asMapType(); - MapType originalMap = dataField.type().asMapType(); - - boolean keyProjectable = - !projectedMap.keyType().isNestedType() - || projectedMap.keyType().equals(originalMap.keyType()); - boolean valueProjectable = - !projectedMap.valueType().isNestedType() - || projectedMap.valueType().equals(originalMap.valueType()); - Preconditions.checkArgument( - keyProjectable && valueProjectable, - "Cannot project a partial map key or value struct. Trying to project %s out of %s", - projectedField, - dataField); - - nestedProjections[pos] = null; - break; - case LIST: - ListType projectedList = projectedField.type().asListType(); - ListType originalList = dataField.type().asListType(); - - boolean elementProjectable = - !projectedList.elementType().isNestedType() - || projectedList.elementType().equals(originalList.elementType()); - Preconditions.checkArgument( - elementProjectable, - "Cannot project a partial list element struct. Trying to project %s out of %s", - projectedField, - dataField); - - nestedProjections[pos] = null; - break; - default: - nestedProjections[pos] = null; - } - } - } - - if (!found && projectedField.isOptional() && allowMissing) { - positionMap[pos] = -1; - nestedProjections[pos] = null; - } else if (!found) { - throw new IllegalArgumentException( - String.format("Cannot find field %s in %s", projectedField, structType)); - } - } - } - - public RecordProjection wrap(Record newRecord) { - this.record = newRecord; - return this; - } - - @Override - public int size() { - return type.fields().size(); - } - - @Override - public T get(int pos, Class javaClass) { - // struct can be null if wrap is not called first before the get call - // or if a null struct is wrapped. - if (record == null) { - return null; - } - - int recordPos = positionMap[pos]; - if (nestedProjections[pos] != null) { - Record nestedStruct = record.get(recordPos, Record.class); - if (nestedStruct == null) { - return null; - } - - return javaClass.cast(nestedProjections[pos].wrap(nestedStruct)); - } - - if (recordPos != -1) { - return record.get(recordPos, javaClass); - } else { - return null; - } - } - - @Override - public void set(int pos, T value) { - throw new UnsupportedOperationException("RecordProjection.set(int, Object) is not supported"); - } - - @Override - public StructType struct() { - return type; - } - - @Override - public Object getField(String name) { - throw new UnsupportedOperationException("RecordProjection.getField(String) is not supported"); - } - - @Override - public void setField(String name, Object value) { - throw new UnsupportedOperationException( - "RecordProjection.setField(String, Object) is not supported"); - } - - @Override - public Object get(int pos) { - return get(pos, Object.class); - } - - @Override - public Record copy() { - throw new UnsupportedOperationException("RecordProjection.copy() is not supported"); - } - - @Override - public Record copy(Map overwriteValues) { - throw new UnsupportedOperationException("RecordProjection.copy(Map) is not supported"); - } -} diff --git a/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/RecordWrapper.java b/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/RecordWrapper.java deleted file mode 100644 index 915608562034..000000000000 --- a/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/RecordWrapper.java +++ /dev/null @@ -1,83 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.connect.data; - -import java.util.Map; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.types.Types.StructType; - -public class RecordWrapper implements Record { - - private final Record delegate; - private final Operation op; - - public RecordWrapper(Record delegate, Operation op) { - this.delegate = delegate; - this.op = op; - } - - public Operation op() { - return op; - } - - @Override - public StructType struct() { - return delegate.struct(); - } - - @Override - public Object getField(String name) { - return delegate.getField(name); - } - - @Override - public void setField(String name, Object value) { - delegate.setField(name, value); - } - - @Override - public Object get(int pos) { - return delegate.get(pos); - } - - @Override - public Record copy() { - return new RecordWrapper(delegate.copy(), op); - } - - @Override - public Record copy(Map overwriteValues) { - return new RecordWrapper(delegate.copy(overwriteValues), op); - } - - @Override - public int size() { - return delegate.size(); - } - - @Override - public T get(int pos, Class javaClass) { - return delegate.get(pos, javaClass); - } - - @Override - public void set(int pos, T value) { - delegate.set(pos, value); - } -} diff --git a/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/UnpartitionedDeltaWriter.java b/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/UnpartitionedDeltaWriter.java deleted file mode 100644 index 46b0a45d532d..000000000000 --- a/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/UnpartitionedDeltaWriter.java +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.connect.data; - -import java.io.IOException; -import java.util.Set; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.io.FileAppenderFactory; -import org.apache.iceberg.io.FileIO; -import org.apache.iceberg.io.OutputFileFactory; - -public class UnpartitionedDeltaWriter extends BaseDeltaTaskWriter { - private final RowDataDeltaWriter writer; - - UnpartitionedDeltaWriter( - PartitionSpec spec, - FileFormat format, - FileAppenderFactory appenderFactory, - OutputFileFactory fileFactory, - FileIO io, - long targetFileSize, - Schema schema, - Set identifierFieldIds, - boolean upsertMode) { - super( - spec, - format, - appenderFactory, - fileFactory, - io, - targetFileSize, - schema, - identifierFieldIds, - upsertMode); - this.writer = new RowDataDeltaWriter(null); - } - - @Override - RowDataDeltaWriter route(Record row) { - return writer; - } - - @Override - public void close() throws IOException { - writer.close(); - } -} diff --git a/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/Utilities.java b/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/Utilities.java index 2de58d0d7486..4ff83f777527 100644 --- a/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/Utilities.java +++ b/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/Utilities.java @@ -226,47 +226,19 @@ public static TaskWriter createTableWriter( TaskWriter writer; if (table.spec().isUnpartitioned()) { - if (config.tablesCdcField() == null && !config.upsertModeEnabled()) { - writer = - new UnpartitionedWriter<>( - table.spec(), format, appenderFactory, fileFactory, table.io(), targetFileSize); - } else { - writer = - new UnpartitionedDeltaWriter( - table.spec(), - format, - appenderFactory, - fileFactory, - table.io(), - targetFileSize, - table.schema(), - identifierFieldIds, - config.upsertModeEnabled()); - } + writer = + new UnpartitionedWriter<>( + table.spec(), format, appenderFactory, fileFactory, table.io(), targetFileSize); } else { - if (config.tablesCdcField() == null && !config.upsertModeEnabled()) { - writer = - new PartitionedAppendWriter( - table.spec(), - format, - appenderFactory, - fileFactory, - table.io(), - targetFileSize, - table.schema()); - } else { - writer = - new PartitionedDeltaWriter( - table.spec(), - format, - appenderFactory, - fileFactory, - table.io(), - targetFileSize, - table.schema(), - identifierFieldIds, - config.upsertModeEnabled()); - } + writer = + new PartitionedAppendWriter( + table.spec(), + format, + appenderFactory, + fileFactory, + table.io(), + targetFileSize, + table.schema()); } return writer; } diff --git a/kafka-connect/kafka-connect/src/test/java/org/apache/iceberg/connect/data/PartitionedDeltaWriterTest.java b/kafka-connect/kafka-connect/src/test/java/org/apache/iceberg/connect/data/PartitionedDeltaWriterTest.java deleted file mode 100644 index b8b7f0b66f95..000000000000 --- a/kafka-connect/kafka-connect/src/test/java/org/apache/iceberg/connect/data/PartitionedDeltaWriterTest.java +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.connect.data; - -import static org.assertj.core.api.Assertions.assertThat; -import static org.mockito.ArgumentMatchers.any; -import static org.mockito.Mockito.mock; -import static org.mockito.Mockito.when; - -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.connect.IcebergSinkConfig; -import org.apache.iceberg.connect.TableSinkConfig; -import org.apache.iceberg.data.GenericRecord; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.io.WriteResult; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.junit.jupiter.params.ParameterizedTest; -import org.junit.jupiter.params.provider.ValueSource; - -public class PartitionedDeltaWriterTest extends BaseWriterTest { - - @ParameterizedTest - @ValueSource(strings = {"parquet", "orc"}) - public void testPartitionedDeltaWriter(String format) { - IcebergSinkConfig config = mock(IcebergSinkConfig.class); - when(config.upsertModeEnabled()).thenReturn(true); - when(config.tableConfig(any())).thenReturn(mock(TableSinkConfig.class)); - when(config.writeProps()).thenReturn(ImmutableMap.of("write.format.default", format)); - - when(table.spec()).thenReturn(SPEC); - - Record row1 = GenericRecord.create(SCHEMA); - row1.setField("id", 123L); - row1.setField("data", "hello world!"); - row1.setField("id2", 123L); - - Record row2 = GenericRecord.create(SCHEMA); - row2.setField("id", 234L); - row2.setField("data", "foobar"); - row2.setField("id2", 234L); - - WriteResult result = - writeTest(ImmutableList.of(row1, row2), config, PartitionedDeltaWriter.class); - - // in upsert mode, each write is a delete + append, so we'll have 1 data file - // and 1 delete file for each partition (2 total) - assertThat(result.dataFiles()).hasSize(2); - assertThat(result.dataFiles()).allMatch(file -> file.format() == FileFormat.fromString(format)); - assertThat(result.deleteFiles()).hasSize(2); - assertThat(result.deleteFiles()) - .allMatch(file -> file.format() == FileFormat.fromString(format)); - } -} diff --git a/kafka-connect/kafka-connect/src/test/java/org/apache/iceberg/connect/data/UnpartitionedDeltaWriterTest.java b/kafka-connect/kafka-connect/src/test/java/org/apache/iceberg/connect/data/UnpartitionedDeltaWriterTest.java deleted file mode 100644 index 947d965bffcd..000000000000 --- a/kafka-connect/kafka-connect/src/test/java/org/apache/iceberg/connect/data/UnpartitionedDeltaWriterTest.java +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.connect.data; - -import static org.assertj.core.api.Assertions.assertThat; -import static org.mockito.ArgumentMatchers.any; -import static org.mockito.Mockito.mock; -import static org.mockito.Mockito.when; - -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.connect.IcebergSinkConfig; -import org.apache.iceberg.connect.TableSinkConfig; -import org.apache.iceberg.data.GenericRecord; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.io.WriteResult; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.junit.jupiter.params.ParameterizedTest; -import org.junit.jupiter.params.provider.ValueSource; - -public class UnpartitionedDeltaWriterTest extends BaseWriterTest { - - @ParameterizedTest - @ValueSource(strings = {"parquet", "orc"}) - public void testUnpartitionedDeltaWriter(String format) { - IcebergSinkConfig config = mock(IcebergSinkConfig.class); - when(config.upsertModeEnabled()).thenReturn(true); - when(config.tableConfig(any())).thenReturn(mock(TableSinkConfig.class)); - when(config.writeProps()).thenReturn(ImmutableMap.of("write.format.default", format)); - - Record row = GenericRecord.create(SCHEMA); - row.setField("id", 123L); - row.setField("data", "hello world!"); - row.setField("id2", 123L); - - WriteResult result = writeTest(ImmutableList.of(row), config, UnpartitionedDeltaWriter.class); - - // in upsert mode, each write is a delete + append, so we'll have 1 data file - // and 1 delete file - assertThat(result.dataFiles()).hasSize(1); - assertThat(result.dataFiles()).allMatch(file -> file.format() == FileFormat.fromString(format)); - assertThat(result.deleteFiles()).hasSize(1); - assertThat(result.deleteFiles()) - .allMatch(file -> file.format() == FileFormat.fromString(format)); - } -} From 86f861784eb49ebff603332772fb8b74f29d62d0 Mon Sep 17 00:00:00 2001 From: Bryan Keller Date: Wed, 28 Feb 2024 17:06:50 -0800 Subject: [PATCH 10/15] class visibility --- .../main/java/org/apache/iceberg/connect/data/NoOpWriter.java | 2 +- .../apache/iceberg/connect/data/PartitionedAppendWriter.java | 2 +- .../java/org/apache/iceberg/connect/data/RecordConverter.java | 2 +- .../main/java/org/apache/iceberg/connect/data/SchemaUpdate.java | 2 +- .../main/java/org/apache/iceberg/connect/data/SchemaUtils.java | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/NoOpWriter.java b/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/NoOpWriter.java index 31abe09cf1a4..64ca44f03209 100644 --- a/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/NoOpWriter.java +++ b/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/NoOpWriter.java @@ -21,7 +21,7 @@ import java.util.List; import org.apache.kafka.connect.sink.SinkRecord; -public class NoOpWriter implements RecordWriter { +class NoOpWriter implements RecordWriter { @Override public void write(SinkRecord record) { // NO-OP diff --git a/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/PartitionedAppendWriter.java b/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/PartitionedAppendWriter.java index 1d429e44e675..30dc347e3015 100644 --- a/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/PartitionedAppendWriter.java +++ b/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/PartitionedAppendWriter.java @@ -29,7 +29,7 @@ import org.apache.iceberg.io.OutputFileFactory; import org.apache.iceberg.io.PartitionedFanoutWriter; -public class PartitionedAppendWriter extends PartitionedFanoutWriter { +class PartitionedAppendWriter extends PartitionedFanoutWriter { private final PartitionKey partitionKey; private final InternalRecordWrapper wrapper; diff --git a/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/RecordConverter.java b/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/RecordConverter.java index 7c6ae76e9b49..eee44b09c8c2 100644 --- a/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/RecordConverter.java +++ b/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/RecordConverter.java @@ -62,7 +62,7 @@ import org.apache.iceberg.util.DateTimeUtil; import org.apache.kafka.connect.data.Struct; -public class RecordConverter { +class RecordConverter { private static final ObjectMapper MAPPER = new ObjectMapper(); diff --git a/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/SchemaUpdate.java b/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/SchemaUpdate.java index 2bb0e65f204b..1ed57b58b89a 100644 --- a/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/SchemaUpdate.java +++ b/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/SchemaUpdate.java @@ -24,7 +24,7 @@ import org.apache.iceberg.types.Type; import org.apache.iceberg.types.Type.PrimitiveType; -public class SchemaUpdate { +class SchemaUpdate { public static class Consumer { private final Map addColumns = Maps.newHashMap(); diff --git a/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/SchemaUtils.java b/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/SchemaUtils.java index 64fa89041c29..bf5021e85252 100644 --- a/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/SchemaUtils.java +++ b/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/SchemaUtils.java @@ -65,7 +65,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -public class SchemaUtils { +class SchemaUtils { private static final Logger LOG = LoggerFactory.getLogger(SchemaUtils.class); From d1c436017acc32dd57ed8365639fc9e0061b2857 Mon Sep 17 00:00:00 2001 From: Bryan Keller Date: Wed, 28 Feb 2024 17:26:47 -0800 Subject: [PATCH 11/15] variable name change --- .../org/apache/iceberg/connect/data/RecordConverter.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/RecordConverter.java b/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/RecordConverter.java index eee44b09c8c2..8b24dc993609 100644 --- a/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/RecordConverter.java +++ b/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/RecordConverter.java @@ -66,7 +66,7 @@ class RecordConverter { private static final ObjectMapper MAPPER = new ObjectMapper(); - private static final DateTimeFormatter OFFSET_TS_FMT = + private static final DateTimeFormatter OFFSET_TIMESTAMP_FORMAT = new DateTimeFormatterBuilder() .append(DateTimeFormatter.ISO_LOCAL_DATE_TIME) .appendOffset("+HHmm", "Z") @@ -461,7 +461,7 @@ private OffsetDateTime convertOffsetDateTime(Object value) { private OffsetDateTime parseOffsetDateTime(String str) { String tsStr = ensureTimestampFormat(str); try { - return OFFSET_TS_FMT.parse(tsStr, OffsetDateTime::from); + return OFFSET_TIMESTAMP_FORMAT.parse(tsStr, OffsetDateTime::from); } catch (DateTimeParseException e) { return LocalDateTime.parse(tsStr, DateTimeFormatter.ISO_LOCAL_DATE_TIME) .atOffset(ZoneOffset.UTC); @@ -491,7 +491,7 @@ private LocalDateTime parseLocalDateTime(String str) { try { return LocalDateTime.parse(tsStr, DateTimeFormatter.ISO_LOCAL_DATE_TIME); } catch (DateTimeParseException e) { - return OFFSET_TS_FMT.parse(tsStr, OffsetDateTime::from).toLocalDateTime(); + return OFFSET_TIMESTAMP_FORMAT.parse(tsStr, OffsetDateTime::from).toLocalDateTime(); } } From dd93b040972dbff24bdd79b0919de8143414952d Mon Sep 17 00:00:00 2001 From: Bryan Keller Date: Wed, 28 Feb 2024 17:37:01 -0800 Subject: [PATCH 12/15] change method visibilities --- .../connect/data/PartitionedAppendWriter.java | 2 +- .../iceberg/connect/data/RecordConverter.java | 6 +-- .../iceberg/connect/data/SchemaUpdate.java | 42 +++++++++---------- .../iceberg/connect/data/SchemaUtils.java | 12 +++--- 4 files changed, 31 insertions(+), 31 deletions(-) diff --git a/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/PartitionedAppendWriter.java b/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/PartitionedAppendWriter.java index 30dc347e3015..ad8b5715a99b 100644 --- a/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/PartitionedAppendWriter.java +++ b/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/PartitionedAppendWriter.java @@ -34,7 +34,7 @@ class PartitionedAppendWriter extends PartitionedFanoutWriter { private final PartitionKey partitionKey; private final InternalRecordWrapper wrapper; - public PartitionedAppendWriter( + PartitionedAppendWriter( PartitionSpec spec, FileFormat format, FileAppenderFactory appenderFactory, diff --git a/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/RecordConverter.java b/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/RecordConverter.java index 8b24dc993609..7e5d9cea0ea0 100644 --- a/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/RecordConverter.java +++ b/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/RecordConverter.java @@ -77,17 +77,17 @@ class RecordConverter { private final IcebergSinkConfig config; private final Map> structNameMap = Maps.newHashMap(); - public RecordConverter(Table table, IcebergSinkConfig config) { + RecordConverter(Table table, IcebergSinkConfig config) { this.tableSchema = table.schema(); this.nameMapping = createNameMapping(table); this.config = config; } - public Record convert(Object data) { + Record convert(Object data) { return convert(data, null); } - public Record convert(Object data, SchemaUpdate.Consumer schemaUpdateConsumer) { + Record convert(Object data, SchemaUpdate.Consumer schemaUpdateConsumer) { if (data instanceof Struct || data instanceof Map) { return convertStructValue(data, tableSchema.asStruct(), -1, schemaUpdateConsumer); } diff --git a/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/SchemaUpdate.java b/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/SchemaUpdate.java index 1ed57b58b89a..809bea84dcc2 100644 --- a/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/SchemaUpdate.java +++ b/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/SchemaUpdate.java @@ -26,95 +26,95 @@ class SchemaUpdate { - public static class Consumer { + static class Consumer { private final Map addColumns = Maps.newHashMap(); private final Map updateTypes = Maps.newHashMap(); private final Map makeOptionals = Maps.newHashMap(); - public Collection addColumns() { + Collection addColumns() { return addColumns.values(); } - public Collection updateTypes() { + Collection updateTypes() { return updateTypes.values(); } - public Collection makeOptionals() { + Collection makeOptionals() { return makeOptionals.values(); } - public boolean empty() { + boolean empty() { return addColumns.isEmpty() && updateTypes.isEmpty() && makeOptionals.isEmpty(); } - public void addColumn(String parentName, String name, Type type) { + void addColumn(String parentName, String name, Type type) { AddColumn addCol = new AddColumn(parentName, name, type); addColumns.put(addCol.key(), addCol); } - public void updateType(String name, PrimitiveType type) { + void updateType(String name, PrimitiveType type) { updateTypes.put(name, new UpdateType(name, type)); } - public void makeOptional(String name) { + void makeOptional(String name) { makeOptionals.put(name, new MakeOptional(name)); } } - public static class AddColumn extends SchemaUpdate { + static class AddColumn extends SchemaUpdate { private final String parentName; private final String name; private final Type type; - public AddColumn(String parentName, String name, Type type) { + AddColumn(String parentName, String name, Type type) { this.parentName = parentName; this.name = name; this.type = type; } - public String parentName() { + String parentName() { return parentName; } - public String name() { + String name() { return name; } - public String key() { + String key() { return parentName == null ? name : parentName + "." + name; } - public Type type() { + Type type() { return type; } } - public static class UpdateType extends SchemaUpdate { + static class UpdateType extends SchemaUpdate { private final String name; private final PrimitiveType type; - public UpdateType(String name, PrimitiveType type) { + UpdateType(String name, PrimitiveType type) { this.name = name; this.type = type; } - public String name() { + String name() { return name; } - public PrimitiveType type() { + PrimitiveType type() { return type; } } - public static class MakeOptional extends SchemaUpdate { + static class MakeOptional extends SchemaUpdate { private final String name; - public MakeOptional(String name) { + MakeOptional(String name) { this.name = name; } - public String name() { + String name() { return name; } } diff --git a/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/SchemaUtils.java b/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/SchemaUtils.java index bf5021e85252..a2e0729fd506 100644 --- a/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/SchemaUtils.java +++ b/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/SchemaUtils.java @@ -71,7 +71,7 @@ class SchemaUtils { private static final Pattern TRANSFORM_REGEX = Pattern.compile("(\\w+)\\((.+)\\)"); - public static PrimitiveType needsDataTypeUpdate(Type currentIcebergType, Schema valueSchema) { + static PrimitiveType needsDataTypeUpdate(Type currentIcebergType, Schema valueSchema) { if (currentIcebergType.typeId() == TypeID.FLOAT && valueSchema.type() == Schema.Type.FLOAT64) { return DoubleType.get(); } @@ -81,7 +81,7 @@ public static PrimitiveType needsDataTypeUpdate(Type currentIcebergType, Schema return null; } - public static void applySchemaUpdates(Table table, SchemaUpdate.Consumer updates) { + static void applySchemaUpdates(Table table, SchemaUpdate.Consumer updates) { if (updates == null || updates.empty()) { // no updates to apply return; @@ -150,7 +150,7 @@ private static boolean isOptional(org.apache.iceberg.Schema schema, MakeOptional return field.isOptional(); } - public static PartitionSpec createPartitionSpec( + static PartitionSpec createPartitionSpec( org.apache.iceberg.Schema schema, List partitionBy) { if (partitionBy.isEmpty()) { return PartitionSpec.unpartitioned(); @@ -209,11 +209,11 @@ private static Pair transformArgPair(String argsStr) { return Pair.of(parts.get(0).trim(), Integer.parseInt(parts.get(1).trim())); } - public static Type toIcebergType(Schema valueSchema, IcebergSinkConfig config) { + static Type toIcebergType(Schema valueSchema, IcebergSinkConfig config) { return new SchemaGenerator(config).toIcebergType(valueSchema); } - public static Type inferIcebergType(Object value, IcebergSinkConfig config) { + static Type inferIcebergType(Object value, IcebergSinkConfig config) { return new SchemaGenerator(config).inferIcebergType(value); } @@ -290,7 +290,7 @@ Type toIcebergType(Schema valueSchema) { } @SuppressWarnings("checkstyle:CyclomaticComplexity") - public Type inferIcebergType(Object value) { + Type inferIcebergType(Object value) { if (value == null) { return null; } else if (value instanceof String) { From 8a6da2aad5c4e8538f5e1cf751089cad76585c48 Mon Sep 17 00:00:00 2001 From: Bryan Keller Date: Sat, 9 Mar 2024 16:11:37 -0800 Subject: [PATCH 13/15] added comment --- .../org/apache/iceberg/connect/data/RecordConverter.java | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/RecordConverter.java b/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/RecordConverter.java index 7e5d9cea0ea0..167d0cbf011c 100644 --- a/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/RecordConverter.java +++ b/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/RecordConverter.java @@ -153,6 +153,12 @@ protected GenericRecord convertStructValue( throw new IllegalArgumentException("Cannot convert to struct: " + value.getClass().getName()); } + /** + * This method will be called for records when there is no record schema. Also, when there is no + * schema, we infer that map values are struct types. This method might also be called if the + * field value is a map but the Iceberg type is a struct. This can happen if the Iceberg table + * schema is not managed by the sink, i.e. created manually. + */ private GenericRecord convertToStruct( Map map, StructType schema, @@ -187,6 +193,7 @@ private GenericRecord convertToStruct( return result; } + /** This method will be called for records and struct values when there is a record schema. */ private GenericRecord convertToStruct( Struct struct, StructType schema, From 29d4ee1cb3b33e0fba21db40ab1378dd251b845e Mon Sep 17 00:00:00 2001 From: Bryan Keller Date: Sat, 9 Mar 2024 16:47:57 -0800 Subject: [PATCH 14/15] add more timestamp convert tests --- .../iceberg/connect/data/RecordConverter.java | 4 +- .../connect/data/RecordConverterTest.java | 45 +++++++++++++++---- 2 files changed, 40 insertions(+), 9 deletions(-) diff --git a/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/RecordConverter.java b/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/RecordConverter.java index 167d0cbf011c..406a2cba4526 100644 --- a/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/RecordConverter.java +++ b/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/RecordConverter.java @@ -507,7 +507,9 @@ private String ensureTimestampFormat(String str) { if (result.charAt(10) == ' ') { result = result.substring(0, 10) + 'T' + result.substring(11); } - if (result.length() > 22 && result.charAt(19) == '+' && result.charAt(22) == ':') { + if (result.length() > 22 + && (result.charAt(19) == '+' || result.charAt(19) == '-') + && result.charAt(22) == ':') { result = result.substring(0, 19) + result.substring(19).replace(":", ""); } return result; diff --git a/kafka-connect/kafka-connect/src/test/java/org/apache/iceberg/connect/data/RecordConverterTest.java b/kafka-connect/kafka-connect/src/test/java/org/apache/iceberg/connect/data/RecordConverterTest.java index fc27b542dc68..57add85a1445 100644 --- a/kafka-connect/kafka-connect/src/test/java/org/apache/iceberg/connect/data/RecordConverterTest.java +++ b/kafka-connect/kafka-connect/src/test/java/org/apache/iceberg/connect/data/RecordConverterTest.java @@ -23,6 +23,7 @@ import static org.mockito.Mockito.when; import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.common.collect.Lists; import java.math.BigDecimal; import java.nio.ByteBuffer; import java.time.Duration; @@ -531,23 +532,42 @@ public void testTimeConversion() { public void testTimestampWithZoneConversion() { OffsetDateTime expected = OffsetDateTime.parse("2023-05-18T11:22:33Z"); long expectedMillis = expected.toInstant().toEpochMilli(); - convertToTimestamps(expected, expectedMillis, TimestampType.withZone()); + assertTimestampConvert(expected, expectedMillis, TimestampType.withZone()); + + // zone should be respected + expected = OffsetDateTime.parse("2023-05-18T03:22:33-08:00"); + List additionalInput = + ImmutableList.of( + "2023-05-18T03:22:33-08", + "2023-05-18 03:22:33-08", + "2023-05-18T03:22:33-08:00", + "2023-05-18 03:22:33-08:00", + "2023-05-18T03:22:33-0800", + "2023-05-18 03:22:33-0800"); + assertTimestampConvert(expected, additionalInput, TimestampType.withZone()); } @Test public void testTimestampWithoutZoneConversion() { LocalDateTime expected = LocalDateTime.parse("2023-05-18T11:22:33"); long expectedMillis = expected.atZone(ZoneOffset.UTC).toInstant().toEpochMilli(); - convertToTimestamps(expected, expectedMillis, TimestampType.withoutZone()); - } + assertTimestampConvert(expected, expectedMillis, TimestampType.withoutZone()); - private void convertToTimestamps(Temporal expected, long expectedMillis, TimestampType type) { - Table table = mock(Table.class); - when(table.schema()).thenReturn(SIMPLE_SCHEMA); - RecordConverter converter = new RecordConverter(table, config); + // zone should be ignored + List additionalInput = + ImmutableList.of( + "2023-05-18T11:22:33-08", + "2023-05-18 11:22:33-08", + "2023-05-18T11:22:33-08:00", + "2023-05-18 11:22:33-08:00", + "2023-05-18T11:22:33-0800", + "2023-05-18 11:22:33-0800"); + assertTimestampConvert(expected, additionalInput, TimestampType.withoutZone()); + } + private void assertTimestampConvert(Temporal expected, long expectedMillis, TimestampType type) { List inputList = - ImmutableList.of( + Lists.newArrayList( "2023-05-18T11:22:33Z", "2023-05-18 11:22:33Z", "2023-05-18T11:22:33+00", @@ -563,6 +583,15 @@ private void convertToTimestamps(Temporal expected, long expectedMillis, Timesta OffsetDateTime.ofInstant(Instant.ofEpochMilli(expectedMillis), ZoneOffset.UTC), LocalDateTime.ofInstant(Instant.ofEpochMilli(expectedMillis), ZoneOffset.UTC)); + assertTimestampConvert(expected, inputList, type); + } + + private void assertTimestampConvert( + Temporal expected, List inputList, TimestampType type) { + Table table = mock(Table.class); + when(table.schema()).thenReturn(SIMPLE_SCHEMA); + RecordConverter converter = new RecordConverter(table, config); + inputList.forEach( input -> { Temporal ts = converter.convertTimestampValue(input, type); From 1207d9cc191b97a5efd1a42c632e40c34191002d Mon Sep 17 00:00:00 2001 From: Bryan Keller Date: Sat, 9 Mar 2024 17:06:20 -0800 Subject: [PATCH 15/15] checkstyle --- .../org/apache/iceberg/connect/data/RecordConverterTest.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kafka-connect/kafka-connect/src/test/java/org/apache/iceberg/connect/data/RecordConverterTest.java b/kafka-connect/kafka-connect/src/test/java/org/apache/iceberg/connect/data/RecordConverterTest.java index 57add85a1445..b494a9da85d3 100644 --- a/kafka-connect/kafka-connect/src/test/java/org/apache/iceberg/connect/data/RecordConverterTest.java +++ b/kafka-connect/kafka-connect/src/test/java/org/apache/iceberg/connect/data/RecordConverterTest.java @@ -23,7 +23,6 @@ import static org.mockito.Mockito.when; import com.fasterxml.jackson.databind.ObjectMapper; -import com.google.common.collect.Lists; import java.math.BigDecimal; import java.nio.ByteBuffer; import java.time.Duration; @@ -53,6 +52,7 @@ import org.apache.iceberg.mapping.NameMappingParser; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.types.Type; import org.apache.iceberg.types.Types.BinaryType;