From 47ca107bf680d27dcc1f947f9d7f37e1005a48a8 Mon Sep 17 00:00:00 2001 From: manuzhang Date: Sat, 14 Dec 2024 00:28:49 +0800 Subject: [PATCH] Remove Hive 2 --- gradle.properties | 2 - gradle/libs.versions.toml | 6 - mr/build.gradle | 27 +- .../ql/exec/vector/VectorizedSupport.java | 46 - .../apache/iceberg/mr/InputFormatConfig.java | 10 - .../apache/iceberg/mr/hive/Deserializer.java | 302 ---- .../mr/hive/HiveIcebergFilterFactory.java | 210 --- .../mr/hive/HiveIcebergInputFormat.java | 143 -- .../iceberg/mr/hive/HiveIcebergMetaHook.java | 311 ---- .../mr/hive/HiveIcebergOutputCommitter.java | 523 ------- .../mr/hive/HiveIcebergOutputFormat.java | 113 -- .../mr/hive/HiveIcebergRecordWriter.java | 127 -- .../iceberg/mr/hive/HiveIcebergSerDe.java | 213 --- .../iceberg/mr/hive/HiveIcebergSplit.java | 95 -- .../mr/hive/HiveIcebergStorageHandler.java | 323 ---- .../org/apache/iceberg/mr/hive/TezUtil.java | 131 -- .../IcebergBinaryObjectInspector.java | 74 - .../IcebergDateObjectInspector.java | 72 - .../IcebergDecimalObjectInspector.java | 90 -- .../IcebergFixedObjectInspector.java | 67 - .../IcebergObjectInspector.java | 151 -- .../IcebergRecordObjectInspector.java | 183 --- .../IcebergTimeObjectInspector.java | 68 - .../IcebergTimestampObjectInspector.java | 72 - ...ebergTimestampWithZoneObjectInspector.java | 73 - .../IcebergUUIDObjectInspector.java | 68 - .../objectinspector/WriteObjectInspector.java | 28 - .../mr/mapreduce/IcebergInputFormat.java | 234 ++- .../iceberg/mr/TestIcebergInputFormats.java | 27 - .../HiveIcebergStorageHandlerTestUtils.java | 111 -- .../iceberg/mr/hive/HiveIcebergTestUtils.java | 297 ---- .../iceberg/mr/hive/TestDeserializer.java | 205 --- .../mr/hive/TestHiveIcebergFilterFactory.java | 370 ----- .../hive/TestHiveIcebergOutputCommitter.java | 344 ----- .../iceberg/mr/hive/TestHiveIcebergSerDe.java | 79 - ...estHiveIcebergStorageHandlerLocalScan.java | 789 ---------- .../TestHiveIcebergStorageHandlerNoScan.java | 959 ------------ ...TestHiveIcebergStorageHandlerTimezone.java | 184 --- ...stHiveIcebergStorageHandlerWithEngine.java | 1364 ----------------- ...ergStorageHandlerWithMultipleCatalogs.java | 189 --- ...stHiveIcebergWithHiveAutogatherEnable.java | 185 --- .../apache/iceberg/mr/hive/TestHiveShell.java | 210 --- .../apache/iceberg/mr/hive/TestTables.java | 598 -------- .../TestIcebergBinaryObjectInspector.java | 73 - .../TestIcebergDateObjectInspector.java | 65 - .../TestIcebergDecimalObjectInspector.java | 79 - .../TestIcebergFixedObjectInspector.java | 64 - .../TestIcebergObjectInspector.java | 249 --- .../TestIcebergRecordObjectInspector.java | 83 - .../TestIcebergTimeObjectInspector.java | 67 - .../TestIcebergTimestampObjectInspector.java | 67 - ...ebergTimestampWithZoneObjectInspector.java | 75 - .../TestIcebergUUIDObjectInspector.java | 66 - settings.gradle | 16 +- 54 files changed, 110 insertions(+), 10467 deletions(-) delete mode 100644 mr/src/main/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedSupport.java delete mode 100644 mr/src/main/java/org/apache/iceberg/mr/hive/Deserializer.java delete mode 100644 mr/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergFilterFactory.java delete mode 100644 mr/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergInputFormat.java delete mode 100644 mr/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergMetaHook.java delete mode 100644 mr/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergOutputCommitter.java delete mode 100644 mr/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergOutputFormat.java delete mode 100644 mr/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergRecordWriter.java delete mode 100644 mr/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergSerDe.java delete mode 100644 mr/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergSplit.java delete mode 100644 mr/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java delete mode 100644 mr/src/main/java/org/apache/iceberg/mr/hive/TezUtil.java delete mode 100644 mr/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergBinaryObjectInspector.java delete mode 100644 mr/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergDateObjectInspector.java delete mode 100644 mr/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergDecimalObjectInspector.java delete mode 100644 mr/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergFixedObjectInspector.java delete mode 100644 mr/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergObjectInspector.java delete mode 100644 mr/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergRecordObjectInspector.java delete mode 100644 mr/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergTimeObjectInspector.java delete mode 100644 mr/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergTimestampObjectInspector.java delete mode 100644 mr/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergTimestampWithZoneObjectInspector.java delete mode 100644 mr/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergUUIDObjectInspector.java delete mode 100644 mr/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/WriteObjectInspector.java delete mode 100644 mr/src/test/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandlerTestUtils.java delete mode 100644 mr/src/test/java/org/apache/iceberg/mr/hive/HiveIcebergTestUtils.java delete mode 100644 mr/src/test/java/org/apache/iceberg/mr/hive/TestDeserializer.java delete mode 100644 mr/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergFilterFactory.java delete mode 100644 mr/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergOutputCommitter.java delete mode 100644 mr/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergSerDe.java delete mode 100644 mr/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergStorageHandlerLocalScan.java delete mode 100644 mr/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergStorageHandlerNoScan.java delete mode 100644 mr/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergStorageHandlerTimezone.java delete mode 100644 mr/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergStorageHandlerWithEngine.java delete mode 100644 mr/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergStorageHandlerWithMultipleCatalogs.java delete mode 100644 mr/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergWithHiveAutogatherEnable.java delete mode 100644 mr/src/test/java/org/apache/iceberg/mr/hive/TestHiveShell.java delete mode 100644 mr/src/test/java/org/apache/iceberg/mr/hive/TestTables.java delete mode 100644 mr/src/test/java/org/apache/iceberg/mr/hive/serde/objectinspector/TestIcebergBinaryObjectInspector.java delete mode 100644 mr/src/test/java/org/apache/iceberg/mr/hive/serde/objectinspector/TestIcebergDateObjectInspector.java delete mode 100644 mr/src/test/java/org/apache/iceberg/mr/hive/serde/objectinspector/TestIcebergDecimalObjectInspector.java delete mode 100644 mr/src/test/java/org/apache/iceberg/mr/hive/serde/objectinspector/TestIcebergFixedObjectInspector.java delete mode 100644 mr/src/test/java/org/apache/iceberg/mr/hive/serde/objectinspector/TestIcebergObjectInspector.java delete mode 100644 mr/src/test/java/org/apache/iceberg/mr/hive/serde/objectinspector/TestIcebergRecordObjectInspector.java delete mode 100644 mr/src/test/java/org/apache/iceberg/mr/hive/serde/objectinspector/TestIcebergTimeObjectInspector.java delete mode 100644 mr/src/test/java/org/apache/iceberg/mr/hive/serde/objectinspector/TestIcebergTimestampObjectInspector.java delete mode 100644 mr/src/test/java/org/apache/iceberg/mr/hive/serde/objectinspector/TestIcebergTimestampWithZoneObjectInspector.java delete mode 100644 mr/src/test/java/org/apache/iceberg/mr/hive/serde/objectinspector/TestIcebergUUIDObjectInspector.java diff --git a/gradle.properties b/gradle.properties index f860f958b579..f6371942e83b 100644 --- a/gradle.properties +++ b/gradle.properties @@ -18,8 +18,6 @@ jmhJsonOutputPath=build/reports/jmh/results.json jmhIncludeRegex=.* systemProp.defaultFlinkVersions=1.20 systemProp.knownFlinkVersions=1.18,1.19,1.20 -systemProp.defaultHiveVersions=2 -systemProp.knownHiveVersions=2,4 systemProp.defaultSparkVersions=3.5 systemProp.knownSparkVersions=3.3,3.4,3.5 systemProp.defaultKafkaVersions=3 diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index e389627a6484..037fb1270f16 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -49,7 +49,6 @@ guava = "33.3.1-jre" hadoop2 = "2.7.3" hadoop3 = "3.4.1" httpcomponents-httpclient5 = "5.4.1" -hive2 = { strictly = "2.3.9"} # see rich version usage explanation above hive4 = "4.0.1" immutables-value = "2.10.1" jackson-bom = "2.18.2" @@ -135,14 +134,9 @@ hadoop2-mapreduce-client-core = { module = "org.apache.hadoop:hadoop-mapreduce-c hadoop2-minicluster = { module = "org.apache.hadoop:hadoop-minicluster", version.ref = "hadoop2" } hadoop3-client = { module = "org.apache.hadoop:hadoop-client", version.ref = "hadoop3" } hadoop3-common = { module = "org.apache.hadoop:hadoop-common", version.ref = "hadoop3" } -hive2-exec = { module = "org.apache.hive:hive-exec", version.ref = "hive2" } -hive2-metastore = { module = "org.apache.hive:hive-metastore", version.ref = "hive2" } -hive2-serde = { module = "org.apache.hive:hive-serde", version.ref = "hive2" } -hive2-service = { module = "org.apache.hive:hive-service", version.ref = "hive2" } hive4-exec = { module = "org.apache.hive:hive-exec", version.ref = "hive4" } hive4-metastore = { module = "org.apache.hive:hive-metastore", version.ref = "hive4" } hive4-standalone-metastore-server = { module = "org.apache.hive:hive-standalone-metastore-server", version.ref = "hive4" } -hive4-serde = { module = "org.apache.hive:hive-serde", version.ref = "hive4" } hive4-service = { module = "org.apache.hive:hive-service", version.ref = "hive4" } httpcomponents-httpclient5 = { module = "org.apache.httpcomponents.client5:httpclient5", version.ref = "httpcomponents-httpclient5" } immutables-value = { module = "org.immutables:value", version.ref = "immutables-value" } diff --git a/mr/build.gradle b/mr/build.gradle index bf8f9ee943f7..6732b117a1c7 100644 --- a/mr/build.gradle +++ b/mr/build.gradle @@ -33,7 +33,6 @@ project(':iceberg-mr') { implementation project(':iceberg-common') implementation project(':iceberg-core') api project(':iceberg-data') - implementation project(':iceberg-hive-metastore') implementation project(':iceberg-orc') implementation project(':iceberg-parquet') @@ -41,21 +40,15 @@ project(':iceberg-mr') { exclude group: 'org.apache.avro', module: 'avro' } - compileOnly("${libs.hive2.exec.get().module}:${libs.hive2.exec.get().getVersion()}:core") { - exclude group: 'com.google.code.findbugs', module: 'jsr305' - exclude group: 'com.google.guava' + implementation libs.caffeine + implementation libs.parquet.column + implementation("${libs.orc.core.get().module}:${libs.versions.orc.get()}:nohive") { + exclude group: 'org.apache.hadoop' + exclude group: 'commons-lang' + // These artifacts are shaded and included in the orc-core fat jar exclude group: 'com.google.protobuf', module: 'protobuf-java' - exclude group: 'org.apache.avro', module: 'avro' - exclude group: 'org.apache.calcite.avatica' - exclude group: 'org.apache.hive', module: 'hive-llap-tez' - exclude group: 'org.apache.logging.log4j' - exclude group: 'org.pentaho' // missing dependency - exclude group: 'org.slf4j', module: 'slf4j-log4j12' + exclude group: 'org.apache.hive', module: 'hive-storage-api' } - compileOnly libs.hive2.metastore - compileOnly libs.hive2.serde - - implementation libs.caffeine testImplementation libs.calcite.core testImplementation libs.calcite.druid @@ -63,18 +56,20 @@ project(':iceberg-mr') { testImplementation project(path: ':iceberg-data', configuration: 'testArtifacts') testImplementation project(path: ':iceberg-api', configuration: 'testArtifacts') testImplementation project(path: ':iceberg-core', configuration: 'testArtifacts') - testImplementation project(path: ':iceberg-hive-metastore', configuration: 'testArtifacts') + testImplementation project(path: ':iceberg-hive-metastore') testImplementation libs.avro.avro testImplementation libs.calcite.core testImplementation libs.kryo.shaded testImplementation platform(libs.jackson.bom) testImplementation libs.jackson.annotations - testImplementation(libs.hive2.service) { + testImplementation(libs.hive4.service) { exclude group: 'org.apache.hive', module: 'hive-exec' + exclude group: 'org.pac4j', module: 'pac4j-saml-opensamlv3' } testImplementation libs.tez08.dag testImplementation libs.tez08.mapreduce + testImplementation libs.junit.vintage.engine } test { diff --git a/mr/src/main/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedSupport.java b/mr/src/main/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedSupport.java deleted file mode 100644 index b6dd984a5843..000000000000 --- a/mr/src/main/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedSupport.java +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.hadoop.hive.ql.exec.vector; - -import java.util.Locale; -import java.util.Map; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; - -/** Copied here from Hive for compatibility */ -@SuppressWarnings("VisibilityModifier") -public class VectorizedSupport { - public enum Support { - DECIMAL_64; - - final String lowerCaseName; - - Support() { - this.lowerCaseName = name().toLowerCase(Locale.ROOT); - } - - @SuppressWarnings("checkstyle:ConstantName") - public static final Map nameToSupportMap = Maps.newHashMap(); - - static { - for (Support support : values()) { - nameToSupportMap.put(support.lowerCaseName, support); - } - } - } -} diff --git a/mr/src/main/java/org/apache/iceberg/mr/InputFormatConfig.java b/mr/src/main/java/org/apache/iceberg/mr/InputFormatConfig.java index 415eb8c9b858..0b0bd03f92d7 100644 --- a/mr/src/main/java/org/apache/iceberg/mr/InputFormatConfig.java +++ b/mr/src/main/java/org/apache/iceberg/mr/InputFormatConfig.java @@ -76,11 +76,6 @@ private InputFormatConfig() {} public static final String CATALOG_CONFIG_PREFIX = "iceberg.catalog."; - public enum InMemoryDataModel { - HIVE, - GENERIC // Default data model is of Iceberg Generics - } - public static class ConfigBuilder { private final Configuration conf; @@ -163,11 +158,6 @@ public ConfigBuilder preferLocality() { return this; } - public ConfigBuilder useHiveRows() { - conf.set(IN_MEMORY_DATA_MODEL, InMemoryDataModel.HIVE.name()); - return this; - } - /** * Compute platforms pass down filters to data sources. If the data source cannot apply some * filters, or only partially applies the filter, it will return the residual filter back. If diff --git a/mr/src/main/java/org/apache/iceberg/mr/hive/Deserializer.java b/mr/src/main/java/org/apache/iceberg/mr/hive/Deserializer.java deleted file mode 100644 index 585242cd1119..000000000000 --- a/mr/src/main/java/org/apache/iceberg/mr/hive/Deserializer.java +++ /dev/null @@ -1,302 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.mr.hive; - -import java.util.List; -import java.util.Map; -import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.StructField; -import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; -import org.apache.iceberg.Schema; -import org.apache.iceberg.data.GenericRecord; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.mr.hive.serde.objectinspector.WriteObjectInspector; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.schema.SchemaWithPartnerVisitor; -import org.apache.iceberg.types.Type.PrimitiveType; -import org.apache.iceberg.types.Types.ListType; -import org.apache.iceberg.types.Types.MapType; -import org.apache.iceberg.types.Types.NestedField; -import org.apache.iceberg.types.Types.StructType; - -class Deserializer { - private final FieldDeserializer fieldDeserializer; - - /** - * Builder to create a Deserializer instance. Requires an Iceberg Schema and the Hive - * ObjectInspector for converting the data. - */ - static class Builder { - private Schema schema; - private StructObjectInspector writerInspector; - private StructObjectInspector sourceInspector; - - Builder schema(Schema mainSchema) { - this.schema = mainSchema; - return this; - } - - Builder writerInspector(StructObjectInspector inspector) { - this.writerInspector = inspector; - return this; - } - - Builder sourceInspector(StructObjectInspector inspector) { - this.sourceInspector = inspector; - return this; - } - - Deserializer build() { - return new Deserializer(schema, new ObjectInspectorPair(writerInspector, sourceInspector)); - } - } - - /** - * Deserializes the Hive result object to an Iceberg record using the provided ObjectInspectors. - * - * @param data The Hive data to deserialize - * @return The resulting Iceberg Record - */ - Record deserialize(Object data) { - return (Record) fieldDeserializer.value(data); - } - - private Deserializer(Schema schema, ObjectInspectorPair pair) { - this.fieldDeserializer = DeserializerVisitor.visit(schema, pair); - } - - private static class DeserializerVisitor - extends SchemaWithPartnerVisitor { - - public static FieldDeserializer visit(Schema schema, ObjectInspectorPair pair) { - return visit( - schema, - new FixNameMappingObjectInspectorPair(schema, pair), - new DeserializerVisitor(), - new PartnerObjectInspectorByNameAccessors()); - } - - @Override - public FieldDeserializer schema( - Schema schema, ObjectInspectorPair pair, FieldDeserializer deserializer) { - return deserializer; - } - - @Override - public FieldDeserializer field( - NestedField field, ObjectInspectorPair pair, FieldDeserializer deserializer) { - return deserializer; - } - - @Override - public FieldDeserializer primitive(PrimitiveType type, ObjectInspectorPair pair) { - return o -> { - if (o == null) { - return null; - } - - ObjectInspector writerFieldInspector = pair.writerInspector(); - ObjectInspector sourceFieldInspector = pair.sourceInspector(); - - Object result = ((PrimitiveObjectInspector) sourceFieldInspector).getPrimitiveJavaObject(o); - if (writerFieldInspector instanceof WriteObjectInspector) { - // If we have a conversion method defined for the ObjectInspector then convert - result = ((WriteObjectInspector) writerFieldInspector).convert(result); - } - - return result; - }; - } - - @Override - public FieldDeserializer struct( - StructType type, ObjectInspectorPair pair, List deserializers) { - Preconditions.checkNotNull(type, "Can not create reader for null type"); - GenericRecord template = GenericRecord.create(type); - return o -> { - if (o == null) { - return null; - } - - List data = - ((StructObjectInspector) pair.sourceInspector()).getStructFieldsDataAsList(o); - // GenericRecord.copy() is more performant then GenericRecord.create(StructType) since - // NAME_MAP_CACHE access - // is eliminated. Using copy here to gain performance. - Record result = template.copy(); - - for (int i = 0; i < deserializers.size(); i++) { - Object fieldValue = data.get(i); - if (fieldValue != null) { - result.set(i, deserializers.get(i).value(fieldValue)); - } else { - result.set(i, null); - } - } - - return result; - }; - } - - @Override - public FieldDeserializer list( - ListType listTypeInfo, ObjectInspectorPair pair, FieldDeserializer deserializer) { - return o -> { - if (o == null) { - return null; - } - - List result = Lists.newArrayList(); - ListObjectInspector listInspector = (ListObjectInspector) pair.sourceInspector(); - - for (Object val : listInspector.getList(o)) { - result.add(deserializer.value(val)); - } - - return result; - }; - } - - @Override - public FieldDeserializer map( - MapType mapType, - ObjectInspectorPair pair, - FieldDeserializer keyDeserializer, - FieldDeserializer valueDeserializer) { - return o -> { - if (o == null) { - return null; - } - - Map result = Maps.newHashMap(); - MapObjectInspector mapObjectInspector = (MapObjectInspector) pair.sourceInspector(); - - for (Map.Entry entry : mapObjectInspector.getMap(o).entrySet()) { - result.put( - keyDeserializer.value(entry.getKey()), valueDeserializer.value(entry.getValue())); - } - return result; - }; - } - } - - private static class PartnerObjectInspectorByNameAccessors - implements SchemaWithPartnerVisitor.PartnerAccessors { - - @Override - public ObjectInspectorPair fieldPartner(ObjectInspectorPair pair, int fieldId, String name) { - String sourceName = pair.sourceName(name); - return new ObjectInspectorPair( - ((StructObjectInspector) pair.writerInspector()) - .getStructFieldRef(name) - .getFieldObjectInspector(), - ((StructObjectInspector) pair.sourceInspector()) - .getStructFieldRef(sourceName) - .getFieldObjectInspector()); - } - - @Override - public ObjectInspectorPair mapKeyPartner(ObjectInspectorPair pair) { - return new ObjectInspectorPair( - ((MapObjectInspector) pair.writerInspector()).getMapKeyObjectInspector(), - ((MapObjectInspector) pair.sourceInspector()).getMapKeyObjectInspector()); - } - - @Override - public ObjectInspectorPair mapValuePartner(ObjectInspectorPair pair) { - return new ObjectInspectorPair( - ((MapObjectInspector) pair.writerInspector()).getMapValueObjectInspector(), - ((MapObjectInspector) pair.sourceInspector()).getMapValueObjectInspector()); - } - - @Override - public ObjectInspectorPair listElementPartner(ObjectInspectorPair pair) { - return new ObjectInspectorPair( - ((ListObjectInspector) pair.writerInspector()).getListElementObjectInspector(), - ((ListObjectInspector) pair.sourceInspector()).getListElementObjectInspector()); - } - } - - private interface FieldDeserializer { - Object value(Object object); - } - - /** - * Hive query results schema column names do not match the target Iceberg column names. Instead we - * have to rely on the column order. To keep the other parts of the code generic we fix this with - * a wrapper around the ObjectInspectorPair. This wrapper maps the Iceberg schema column names - * instead of the Hive column names. - */ - private static class FixNameMappingObjectInspectorPair extends ObjectInspectorPair { - private final Map sourceNameMap; - - FixNameMappingObjectInspectorPair(Schema schema, ObjectInspectorPair pair) { - super(pair.writerInspector(), pair.sourceInspector()); - - this.sourceNameMap = Maps.newHashMapWithExpectedSize(schema.columns().size()); - - List fields = - ((StructObjectInspector) sourceInspector()).getAllStructFieldRefs(); - for (int i = 0; i < schema.columns().size(); ++i) { - sourceNameMap.put(schema.columns().get(i).name(), fields.get(i).getFieldName()); - } - } - - @Override - String sourceName(String originalName) { - return sourceNameMap.get(originalName); - } - } - - /** - * To get the data for Iceberg {@link Record}s we have to use both ObjectInspectors. - * - *

We use the Hive ObjectInspectors (sourceInspector) to get the Hive primitive types. - * - *

We use the Iceberg ObjectInspectors (writerInspector) only if conversion is needed for - * generating the correct type for Iceberg Records. See: {@link WriteObjectInspector} interface on - * the provided writerInspector. - */ - private static class ObjectInspectorPair { - private final ObjectInspector writerInspector; - private final ObjectInspector sourceInspector; - - ObjectInspectorPair(ObjectInspector writerInspector, ObjectInspector sourceInspector) { - this.writerInspector = writerInspector; - this.sourceInspector = sourceInspector; - } - - ObjectInspector writerInspector() { - return writerInspector; - } - - ObjectInspector sourceInspector() { - return sourceInspector; - } - - String sourceName(String originalName) { - return originalName; - } - } -} diff --git a/mr/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergFilterFactory.java b/mr/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergFilterFactory.java deleted file mode 100644 index d3d53ed31917..000000000000 --- a/mr/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergFilterFactory.java +++ /dev/null @@ -1,210 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.mr.hive; - -import static org.apache.iceberg.expressions.Expressions.and; -import static org.apache.iceberg.expressions.Expressions.equal; -import static org.apache.iceberg.expressions.Expressions.greaterThanOrEqual; -import static org.apache.iceberg.expressions.Expressions.in; -import static org.apache.iceberg.expressions.Expressions.isNaN; -import static org.apache.iceberg.expressions.Expressions.isNull; -import static org.apache.iceberg.expressions.Expressions.lessThan; -import static org.apache.iceberg.expressions.Expressions.lessThanOrEqual; -import static org.apache.iceberg.expressions.Expressions.not; -import static org.apache.iceberg.expressions.Expressions.or; - -import java.math.BigDecimal; -import java.sql.Date; -import java.sql.Timestamp; -import java.util.List; -import java.util.stream.Collectors; -import org.apache.hadoop.hive.ql.io.sarg.ExpressionTree; -import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf; -import org.apache.hadoop.hive.ql.io.sarg.SearchArgument; -import org.apache.hadoop.hive.ql.io.sarg.SearchArgumentImpl; -import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable; -import org.apache.iceberg.common.DynFields; -import org.apache.iceberg.expressions.Expression; -import org.apache.iceberg.expressions.Expressions; -import org.apache.iceberg.util.DateTimeUtil; -import org.apache.iceberg.util.NaNUtil; - -public class HiveIcebergFilterFactory { - - private HiveIcebergFilterFactory() {} - - public static Expression generateFilterExpression(SearchArgument sarg) { - return translate(sarg.getExpression(), sarg.getLeaves()); - } - - /** - * Recursive method to traverse down the ExpressionTree to evaluate each expression and its leaf - * nodes. - * - * @param tree Current ExpressionTree where the 'top' node is being evaluated. - * @param leaves List of all leaf nodes within the tree. - * @return Expression that is translated from the Hive SearchArgument. - */ - private static Expression translate(ExpressionTree tree, List leaves) { - List childNodes = tree.getChildren(); - switch (tree.getOperator()) { - case OR: - Expression orResult = Expressions.alwaysFalse(); - for (ExpressionTree child : childNodes) { - orResult = or(orResult, translate(child, leaves)); - } - return orResult; - case AND: - Expression result = Expressions.alwaysTrue(); - for (ExpressionTree child : childNodes) { - result = and(result, translate(child, leaves)); - } - return result; - case NOT: - return not(translate(childNodes.get(0), leaves)); - case LEAF: - if (tree.getLeaf() >= leaves.size()) { - throw new UnsupportedOperationException("No more leaves are available"); - } - return translateLeaf(leaves.get(tree.getLeaf())); - case CONSTANT: - throw new UnsupportedOperationException("CONSTANT operator is not supported"); - default: - throw new UnsupportedOperationException("Unknown operator: " + tree.getOperator()); - } - } - - /** - * Translate leaf nodes from Hive operator to Iceberg operator. - * - * @param leaf Leaf node - * @return Expression fully translated from Hive PredicateLeaf - */ - private static Expression translateLeaf(PredicateLeaf leaf) { - String column = leaf.getColumnName(); - switch (leaf.getOperator()) { - case EQUALS: - Object literal = leafToLiteral(leaf); - return NaNUtil.isNaN(literal) ? isNaN(column) : equal(column, literal); - case LESS_THAN: - return lessThan(column, leafToLiteral(leaf)); - case LESS_THAN_EQUALS: - return lessThanOrEqual(column, leafToLiteral(leaf)); - case IN: - return in(column, leafToLiteralList(leaf)); - case BETWEEN: - List icebergLiterals = leafToLiteralList(leaf); - if (icebergLiterals.size() < 2) { - throw new UnsupportedOperationException("Missing leaf literals: " + leaf); - } - return and( - greaterThanOrEqual(column, icebergLiterals.get(0)), - lessThanOrEqual(column, icebergLiterals.get(1))); - case IS_NULL: - return isNull(column); - default: - throw new UnsupportedOperationException("Unknown operator: " + leaf.getOperator()); - } - } - - // PredicateLeafImpl has a work-around for Kryo serialization with java.util.Date objects where it - // converts values to - // Timestamp using Date#getTime. This conversion discards microseconds, so this is a necessary to - // avoid it. - private static final DynFields.UnboundField LITERAL_FIELD = - DynFields.builder().hiddenImpl(SearchArgumentImpl.PredicateLeafImpl.class, "literal").build(); - - private static Object leafToLiteral(PredicateLeaf leaf) { - switch (leaf.getType()) { - case LONG: - case BOOLEAN: - case STRING: - case FLOAT: - return leaf.getLiteral(); - case DATE: - if (leaf.getLiteral() instanceof Date) { - return daysFromDate((Date) leaf.getLiteral()); - } - return daysFromTimestamp((Timestamp) leaf.getLiteral()); - case TIMESTAMP: - return microsFromTimestamp((Timestamp) LITERAL_FIELD.get(leaf)); - case DECIMAL: - return hiveDecimalToBigDecimal((HiveDecimalWritable) leaf.getLiteral()); - - default: - throw new UnsupportedOperationException("Unknown type: " + leaf.getType()); - } - } - - private static List leafToLiteralList(PredicateLeaf leaf) { - switch (leaf.getType()) { - case LONG: - case BOOLEAN: - case FLOAT: - case STRING: - return leaf.getLiteralList(); - case DATE: - return leaf.getLiteralList().stream() - .map(value -> daysFromDate((Date) value)) - .collect(Collectors.toList()); - case DECIMAL: - return leaf.getLiteralList().stream() - .map(value -> hiveDecimalToBigDecimal((HiveDecimalWritable) value)) - .collect(Collectors.toList()); - case TIMESTAMP: - return leaf.getLiteralList().stream() - .map(value -> microsFromTimestamp((Timestamp) value)) - .collect(Collectors.toList()); - default: - throw new UnsupportedOperationException("Unknown type: " + leaf.getType()); - } - } - - private static BigDecimal hiveDecimalToBigDecimal(HiveDecimalWritable hiveDecimalWritable) { - return hiveDecimalWritable - .getHiveDecimal() - .bigDecimalValue() - .setScale(hiveDecimalWritable.scale()); - } - - // Hive uses `java.sql.Date.valueOf(lit.toString());` to convert a literal to Date - // Which uses `java.util.Date()` internally to create the object and that uses the - // TimeZone.getDefaultRef() - // To get back the expected date we have to use the LocalDate which gets rid of the TimeZone - // misery as it uses - // the year/month/day to generate the object - private static int daysFromDate(Date date) { - return DateTimeUtil.daysFromDate(date.toLocalDate()); - } - - // Hive uses `java.sql.Timestamp.valueOf(lit.toString());` to convert a literal to Timestamp - // Which again uses `java.util.Date()` internally to create the object which uses the - // TimeZone.getDefaultRef() - // To get back the expected timestamp we have to use the LocalDateTime which gets rid of the - // TimeZone misery - // as it uses the year/month/day/hour/min/sec/nanos to generate the object - private static int daysFromTimestamp(Timestamp timestamp) { - return DateTimeUtil.daysFromDate(timestamp.toLocalDateTime().toLocalDate()); - } - - // We have to use the LocalDateTime to get the micros. See the comment above. - private static long microsFromTimestamp(Timestamp timestamp) { - return DateTimeUtil.microsFromTimestamp(timestamp.toLocalDateTime()); - } -} diff --git a/mr/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergInputFormat.java b/mr/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergInputFormat.java deleted file mode 100644 index 5f2eb9834b63..000000000000 --- a/mr/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergInputFormat.java +++ /dev/null @@ -1,143 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.mr.hive; - -import java.io.IOException; -import java.util.Arrays; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hive.conf.HiveConf; -import org.apache.hadoop.hive.ql.exec.SerializationUtilities; -import org.apache.hadoop.hive.ql.exec.Utilities; -import org.apache.hadoop.hive.ql.exec.vector.VectorizedInputFormatInterface; -import org.apache.hadoop.hive.ql.exec.vector.VectorizedSupport; -import org.apache.hadoop.hive.ql.io.CombineHiveInputFormat; -import org.apache.hadoop.hive.ql.io.sarg.ConvertAstToSearchArg; -import org.apache.hadoop.hive.ql.io.sarg.SearchArgument; -import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc; -import org.apache.hadoop.hive.ql.plan.TableScanDesc; -import org.apache.hadoop.hive.serde2.ColumnProjectionUtils; -import org.apache.hadoop.mapred.InputSplit; -import org.apache.hadoop.mapred.JobConf; -import org.apache.hadoop.mapred.RecordReader; -import org.apache.hadoop.mapred.Reporter; -import org.apache.iceberg.common.DynConstructors; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.expressions.Expression; -import org.apache.iceberg.hive.HiveVersion; -import org.apache.iceberg.mr.InputFormatConfig; -import org.apache.iceberg.mr.mapred.AbstractMapredIcebergRecordReader; -import org.apache.iceberg.mr.mapred.Container; -import org.apache.iceberg.mr.mapred.MapredIcebergInputFormat; -import org.apache.iceberg.mr.mapreduce.IcebergInputFormat; -import org.apache.iceberg.mr.mapreduce.IcebergSplit; -import org.apache.iceberg.mr.mapreduce.IcebergSplitContainer; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.util.SerializationUtil; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -public class HiveIcebergInputFormat extends MapredIcebergInputFormat - implements CombineHiveInputFormat.AvoidSplitCombination, VectorizedInputFormatInterface { - - private static final Logger LOG = LoggerFactory.getLogger(HiveIcebergInputFormat.class); - private static final String HIVE_VECTORIZED_RECORDREADER_CLASS = - "org.apache.iceberg.mr.hive.vector.HiveIcebergVectorizedRecordReader"; - private static final DynConstructors.Ctor - HIVE_VECTORIZED_RECORDREADER_CTOR; - - static { - if (HiveVersion.min(HiveVersion.HIVE_3)) { - HIVE_VECTORIZED_RECORDREADER_CTOR = - DynConstructors.builder(AbstractMapredIcebergRecordReader.class) - .impl( - HIVE_VECTORIZED_RECORDREADER_CLASS, - IcebergInputFormat.class, - IcebergSplit.class, - JobConf.class, - Reporter.class) - .build(); - } else { - HIVE_VECTORIZED_RECORDREADER_CTOR = null; - } - } - - @Override - public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException { - // Convert Hive filter to Iceberg filter - String hiveFilter = job.get(TableScanDesc.FILTER_EXPR_CONF_STR); - if (hiveFilter != null) { - ExprNodeGenericFuncDesc exprNodeDesc = - SerializationUtilities.deserializeObject(hiveFilter, ExprNodeGenericFuncDesc.class); - SearchArgument sarg = ConvertAstToSearchArg.create(job, exprNodeDesc); - try { - Expression filter = HiveIcebergFilterFactory.generateFilterExpression(sarg); - job.set(InputFormatConfig.FILTER_EXPRESSION, SerializationUtil.serializeToBase64(filter)); - } catch (UnsupportedOperationException e) { - LOG.warn( - "Unable to create Iceberg filter, continuing without filter (will be applied by Hive later): ", - e); - } - } - - String[] selectedColumns = ColumnProjectionUtils.getReadColumnNames(job); - job.setStrings(InputFormatConfig.SELECTED_COLUMNS, selectedColumns); - - String location = job.get(InputFormatConfig.TABLE_LOCATION); - return Arrays.stream(super.getSplits(job, numSplits)) - .map(split -> new HiveIcebergSplit((IcebergSplit) split, location)) - .toArray(InputSplit[]::new); - } - - @Override - public RecordReader> getRecordReader( - InputSplit split, JobConf job, Reporter reporter) throws IOException { - String[] selectedColumns = ColumnProjectionUtils.getReadColumnNames(job); - job.setStrings(InputFormatConfig.SELECTED_COLUMNS, selectedColumns); - - if (HiveConf.getBoolVar(job, HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED) - && Utilities.getVectorizedRowBatchCtx(job) != null) { - Preconditions.checkArgument( - HiveVersion.min(HiveVersion.HIVE_3), "Vectorization only supported for Hive 3+"); - - job.setEnum(InputFormatConfig.IN_MEMORY_DATA_MODEL, InputFormatConfig.InMemoryDataModel.HIVE); - job.setBoolean(InputFormatConfig.SKIP_RESIDUAL_FILTERING, true); - - IcebergSplit icebergSplit = ((IcebergSplitContainer) split).icebergSplit(); - // bogus cast for favouring code reuse over syntax - return (RecordReader) - HIVE_VECTORIZED_RECORDREADER_CTOR.newInstance( - new IcebergInputFormat<>(), icebergSplit, job, reporter); - } else { - return super.getRecordReader(split, job, reporter); - } - } - - @Override - public boolean shouldSkipCombine(Path path, Configuration conf) { - return true; - } - - // Override annotation commented out, since this interface method has been introduced only in Hive - // 3 - // @Override - public VectorizedSupport.Support[] getSupportedFeatures() { - return new VectorizedSupport.Support[0]; - } -} diff --git a/mr/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergMetaHook.java b/mr/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergMetaHook.java deleted file mode 100644 index 637dc6d15df1..000000000000 --- a/mr/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergMetaHook.java +++ /dev/null @@ -1,311 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.mr.hive; - -import java.util.List; -import java.util.Locale; -import java.util.Properties; -import java.util.Set; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.hive.metastore.HiveMetaHook; -import org.apache.hadoop.hive.metastore.api.FieldSchema; -import org.apache.iceberg.BaseMetastoreTableOperations; -import org.apache.iceberg.CatalogUtil; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.PartitionSpecParser; -import org.apache.iceberg.Schema; -import org.apache.iceberg.SchemaParser; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableMetadata; -import org.apache.iceberg.TableMetadataParser; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.catalog.TableIdentifier; -import org.apache.iceberg.exceptions.NoSuchTableException; -import org.apache.iceberg.hive.HiveSchemaUtil; -import org.apache.iceberg.hive.HiveTableOperations; -import org.apache.iceberg.io.FileIO; -import org.apache.iceberg.mr.Catalogs; -import org.apache.iceberg.mr.InputFormatConfig; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -public class HiveIcebergMetaHook implements HiveMetaHook { - private static final Logger LOG = LoggerFactory.getLogger(HiveIcebergMetaHook.class); - private static final Set PARAMETERS_TO_REMOVE = - ImmutableSet.of(InputFormatConfig.TABLE_SCHEMA, Catalogs.LOCATION, Catalogs.NAME); - private static final Set PROPERTIES_TO_REMOVE = - ImmutableSet - // We don't want to push down the metadata location props to Iceberg from HMS, - // since the snapshot pointer in HMS would always be one step ahead - .of( - BaseMetastoreTableOperations.METADATA_LOCATION_PROP, - BaseMetastoreTableOperations.PREVIOUS_METADATA_LOCATION_PROP, - // Initially we'd like to cache the partition spec in HMS, but not push it down later to - // Iceberg during alter - // table commands since by then the HMS info can be stale + Iceberg does not store its - // partition spec in the props - InputFormatConfig.PARTITION_SPEC); - - private final Configuration conf; - private Table icebergTable = null; - private Properties catalogProperties; - private boolean deleteIcebergTable; - private FileIO deleteIo; - private TableMetadata deleteMetadata; - - public HiveIcebergMetaHook(Configuration conf) { - this.conf = conf; - } - - @Override - public void preCreateTable(org.apache.hadoop.hive.metastore.api.Table hmsTable) { - this.catalogProperties = getCatalogProperties(hmsTable); - - // Set the table type even for non HiveCatalog based tables - hmsTable - .getParameters() - .put( - BaseMetastoreTableOperations.TABLE_TYPE_PROP, - BaseMetastoreTableOperations.ICEBERG_TABLE_TYPE_VALUE.toUpperCase(Locale.ROOT)); - - if (!Catalogs.hiveCatalog(conf, catalogProperties)) { - // For non-HiveCatalog tables too, we should set the input and output format - // so that the table can be read by other engines like Impala - hmsTable.getSd().setInputFormat(HiveIcebergInputFormat.class.getCanonicalName()); - hmsTable.getSd().setOutputFormat(HiveIcebergOutputFormat.class.getCanonicalName()); - - // If not using HiveCatalog check for existing table - try { - this.icebergTable = Catalogs.loadTable(conf, catalogProperties); - - Preconditions.checkArgument( - catalogProperties.getProperty(InputFormatConfig.TABLE_SCHEMA) == null, - "Iceberg table already created - can not use provided schema"); - Preconditions.checkArgument( - catalogProperties.getProperty(InputFormatConfig.PARTITION_SPEC) == null, - "Iceberg table already created - can not use provided partition specification"); - - LOG.info("Iceberg table already exists {}", icebergTable); - - return; - } catch (NoSuchTableException nte) { - // If the table does not exist we will create it below - } - } - - // If the table does not exist collect data for table creation - // - InputFormatConfig.TABLE_SCHEMA, InputFormatConfig.PARTITION_SPEC takes precedence so the - // user can override the - // Iceberg schema and specification generated by the code - - Schema schema = schema(catalogProperties, hmsTable); - PartitionSpec spec = spec(schema, hmsTable); - - // If there are partition keys specified remove them from the HMS table and add them to the - // column list - if (hmsTable.isSetPartitionKeys()) { - hmsTable.getSd().getCols().addAll(hmsTable.getPartitionKeys()); - hmsTable.setPartitionKeysIsSet(false); - } - - catalogProperties.put(InputFormatConfig.TABLE_SCHEMA, SchemaParser.toJson(schema)); - catalogProperties.put(InputFormatConfig.PARTITION_SPEC, PartitionSpecParser.toJson(spec)); - - // Allow purging table data if the table is created now and not set otherwise - hmsTable.getParameters().putIfAbsent(InputFormatConfig.EXTERNAL_TABLE_PURGE, "TRUE"); - - // If the table is not managed by Hive catalog then the location should be set - if (!Catalogs.hiveCatalog(conf, catalogProperties)) { - Preconditions.checkArgument( - hmsTable.getSd() != null && hmsTable.getSd().getLocation() != null, - "Table location not set"); - } - - // Remove creation related properties - PARAMETERS_TO_REMOVE.forEach(hmsTable.getParameters()::remove); - } - - @Override - public void rollbackCreateTable(org.apache.hadoop.hive.metastore.api.Table hmsTable) { - // do nothing - } - - @Override - public void commitCreateTable(org.apache.hadoop.hive.metastore.api.Table hmsTable) { - if (icebergTable == null) { - if (Catalogs.hiveCatalog(conf, catalogProperties)) { - catalogProperties.put(TableProperties.ENGINE_HIVE_ENABLED, true); - } - - Catalogs.createTable(conf, catalogProperties); - } - } - - @Override - public void preDropTable(org.apache.hadoop.hive.metastore.api.Table hmsTable) { - this.catalogProperties = getCatalogProperties(hmsTable); - this.deleteIcebergTable = - hmsTable.getParameters() != null - && "TRUE" - .equalsIgnoreCase( - hmsTable.getParameters().get(InputFormatConfig.EXTERNAL_TABLE_PURGE)); - - if (deleteIcebergTable && Catalogs.hiveCatalog(conf, catalogProperties)) { - // Store the metadata and the io for deleting the actual table data - try { - String metadataLocation = - hmsTable.getParameters().get(BaseMetastoreTableOperations.METADATA_LOCATION_PROP); - this.deleteIo = Catalogs.loadTable(conf, catalogProperties).io(); - this.deleteMetadata = TableMetadataParser.read(deleteIo, metadataLocation); - } catch (Exception e) { - LOG.error( - "preDropTable: Error during loading Iceberg table or parsing its metadata for HMS table: {}.{}. " - + "In some cases, this might lead to undeleted metadata files under the table directory: {}. " - + "Please double check and, if needed, manually delete any dangling files/folders, if any. " - + "In spite of this error, the HMS table drop operation should proceed as normal.", - hmsTable.getDbName(), - hmsTable.getTableName(), - hmsTable.getSd().getLocation(), - e); - } - } - } - - @Override - public void rollbackDropTable(org.apache.hadoop.hive.metastore.api.Table hmsTable) { - // do nothing - } - - @Override - public void commitDropTable( - org.apache.hadoop.hive.metastore.api.Table hmsTable, boolean deleteData) { - if (deleteData && deleteIcebergTable) { - try { - if (!Catalogs.hiveCatalog(conf, catalogProperties)) { - LOG.info( - "Dropping with purge all the data for table {}.{}", - hmsTable.getDbName(), - hmsTable.getTableName()); - Catalogs.dropTable(conf, catalogProperties); - } else { - // do nothing if metadata folder has been deleted already (Hive 4 behaviour for - // purge=TRUE) - if (deleteMetadata != null && deleteIo.newInputFile(deleteMetadata.location()).exists()) { - CatalogUtil.dropTableData(deleteIo, deleteMetadata); - } - } - } catch (Exception e) { - // we want to successfully complete the Hive DROP TABLE command despite catalog-related - // exceptions here - // e.g. we wish to successfully delete a Hive table even if the underlying Hadoop table has - // already been deleted - LOG.warn( - "Exception during commitDropTable operation for table {}.{}.", - hmsTable.getDbName(), - hmsTable.getTableName(), - e); - } - } - } - - /** - * Calculates the properties we would like to send to the catalog. - * - *
    - *
  • The base of the properties is the properties stored at the Hive Metastore for the given - * table - *
  • We add the {@link Catalogs#LOCATION} as the table location - *
  • We add the {@link Catalogs#NAME} as TableIdentifier defined by the database name and - * table name - *
  • We remove some parameters that we don't want to push down to the Iceberg table props - *
- * - * @param hmsTable Table for which we are calculating the properties - * @return The properties we can provide for Iceberg functions, like {@link Catalogs} - */ - private static Properties getCatalogProperties( - org.apache.hadoop.hive.metastore.api.Table hmsTable) { - Properties properties = new Properties(); - - hmsTable - .getParameters() - .forEach( - (key, value) -> { - // translate key names between HMS and Iceberg where needed - String icebergKey = HiveTableOperations.translateToIcebergProp(key); - properties.put(icebergKey, value); - }); - - if (properties.get(Catalogs.LOCATION) == null - && hmsTable.getSd() != null - && hmsTable.getSd().getLocation() != null) { - properties.put(Catalogs.LOCATION, hmsTable.getSd().getLocation()); - } - - if (properties.get(Catalogs.NAME) == null) { - properties.put( - Catalogs.NAME, - TableIdentifier.of(hmsTable.getDbName(), hmsTable.getTableName()).toString()); - } - - // Remove HMS table parameters we don't want to propagate to Iceberg - PROPERTIES_TO_REMOVE.forEach(properties::remove); - - return properties; - } - - private Schema schema( - Properties properties, org.apache.hadoop.hive.metastore.api.Table hmsTable) { - boolean autoConversion = conf.getBoolean(InputFormatConfig.SCHEMA_AUTO_CONVERSION, false); - - if (properties.getProperty(InputFormatConfig.TABLE_SCHEMA) != null) { - return SchemaParser.fromJson(properties.getProperty(InputFormatConfig.TABLE_SCHEMA)); - } else if (hmsTable.isSetPartitionKeys() && !hmsTable.getPartitionKeys().isEmpty()) { - // Add partitioning columns to the original column list before creating the Iceberg Schema - List cols = Lists.newArrayList(hmsTable.getSd().getCols()); - cols.addAll(hmsTable.getPartitionKeys()); - return HiveSchemaUtil.convert(cols, autoConversion); - } else { - return HiveSchemaUtil.convert(hmsTable.getSd().getCols(), autoConversion); - } - } - - private static PartitionSpec spec( - Schema schema, org.apache.hadoop.hive.metastore.api.Table hmsTable) { - - if (hmsTable.getParameters().get(InputFormatConfig.PARTITION_SPEC) != null) { - Preconditions.checkArgument( - !hmsTable.isSetPartitionKeys() || hmsTable.getPartitionKeys().isEmpty(), - "Provide only one of the following: Hive partition specification, or the " - + InputFormatConfig.PARTITION_SPEC - + " property"); - return PartitionSpecParser.fromJson( - schema, hmsTable.getParameters().get(InputFormatConfig.PARTITION_SPEC)); - } else if (hmsTable.isSetPartitionKeys() && !hmsTable.getPartitionKeys().isEmpty()) { - // If the table is partitioned then generate the identity partition definitions for the - // Iceberg table - return HiveSchemaUtil.spec(schema, hmsTable.getPartitionKeys()); - } else { - return PartitionSpec.unpartitioned(); - } - } -} diff --git a/mr/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergOutputCommitter.java b/mr/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergOutputCommitter.java deleted file mode 100644 index 5b1b8222534f..000000000000 --- a/mr/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergOutputCommitter.java +++ /dev/null @@ -1,523 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.mr.hive; - -import java.io.IOException; -import java.io.ObjectInputStream; -import java.io.ObjectOutputStream; -import java.util.Arrays; -import java.util.Collection; -import java.util.Map; -import java.util.Optional; -import java.util.Properties; -import java.util.concurrent.ConcurrentLinkedQueue; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hive.conf.HiveConf; -import org.apache.hadoop.mapred.JobConf; -import org.apache.hadoop.mapred.JobContext; -import org.apache.hadoop.mapred.OutputCommitter; -import org.apache.hadoop.mapred.TaskAttemptContext; -import org.apache.hadoop.mapred.TaskAttemptID; -import org.apache.hadoop.mapreduce.JobID; -import org.apache.hadoop.mapreduce.TaskType; -import org.apache.iceberg.AppendFiles; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.Table; -import org.apache.iceberg.exceptions.NotFoundException; -import org.apache.iceberg.hadoop.Util; -import org.apache.iceberg.io.FileIO; -import org.apache.iceberg.io.OutputFile; -import org.apache.iceberg.mr.Catalogs; -import org.apache.iceberg.mr.InputFormatConfig; -import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.util.concurrent.ThreadFactoryBuilder; -import org.apache.iceberg.util.Tasks; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * An Iceberg table committer for adding data files to the Iceberg tables. Currently independent of - * the Hive ACID transactions. - */ -public class HiveIcebergOutputCommitter extends OutputCommitter { - private static final String FOR_COMMIT_EXTENSION = ".forCommit"; - - private static final Logger LOG = LoggerFactory.getLogger(HiveIcebergOutputCommitter.class); - - @Override - public void setupJob(JobContext jobContext) { - // do nothing. - } - - @Override - public void setupTask(TaskAttemptContext taskAttemptContext) { - // do nothing. - } - - @Override - public boolean needsTaskCommit(TaskAttemptContext context) { - // We need to commit if this is the last phase of a MapReduce process - return TaskType.REDUCE.equals(context.getTaskAttemptID().getTaskID().getTaskType()) - || context.getJobConf().getNumReduceTasks() == 0; - } - - /** - * Collects the generated data files and creates a commit file storing the data file list. - * - * @param originalContext The task attempt context - * @throws IOException Thrown if there is an error writing the commit file - */ - @Override - public void commitTask(TaskAttemptContext originalContext) throws IOException { - TaskAttemptContext context = TezUtil.enrichContextWithAttemptWrapper(originalContext); - - TaskAttemptID attemptID = context.getTaskAttemptID(); - JobConf jobConf = context.getJobConf(); - Collection outputs = HiveIcebergStorageHandler.outputTables(context.getJobConf()); - Map writers = - Optional.ofNullable(HiveIcebergRecordWriter.getWriters(attemptID)) - .orElseGet( - () -> { - LOG.info( - "CommitTask found no writers for output tables: {}, attemptID: {}", - outputs, - attemptID); - return ImmutableMap.of(); - }); - - ExecutorService tableExecutor = tableExecutor(jobConf, outputs.size()); - try { - // Generates commit files for the target tables in parallel - Tasks.foreach(outputs) - .retry(3) - .stopOnFailure() - .throwFailureWhenFinished() - .executeWith(tableExecutor) - .run( - output -> { - Table table = HiveIcebergStorageHandler.table(context.getJobConf(), output); - if (table != null) { - HiveIcebergRecordWriter writer = writers.get(output); - DataFile[] closedFiles; - if (writer != null) { - closedFiles = writer.dataFiles(); - } else { - LOG.info( - "CommitTask found no writer for specific table: {}, attemptID: {}", - output, - attemptID); - closedFiles = new DataFile[0]; - } - // Creating the file containing the data files generated by this task for this - // table - String fileForCommitLocation = - generateFileForCommitLocation( - table.location(), - jobConf, - attemptID.getJobID(), - attemptID.getTaskID().getId()); - createFileForCommit(closedFiles, fileForCommitLocation, table.io()); - } else { - // When using Tez multi-table inserts, we could have more output tables in config - // than - // the actual tables this task has written to and has serialized in its config - LOG.info("CommitTask found no serialized table in config for table: {}.", output); - } - }, - IOException.class); - } finally { - if (tableExecutor != null) { - tableExecutor.shutdown(); - } - } - - // remove the writer to release the object - HiveIcebergRecordWriter.removeWriters(attemptID); - } - - /** - * Removes files generated by this task. - * - * @param originalContext The task attempt context - * @throws IOException Thrown if there is an error closing the writer - */ - @Override - public void abortTask(TaskAttemptContext originalContext) throws IOException { - TaskAttemptContext context = TezUtil.enrichContextWithAttemptWrapper(originalContext); - - // Clean up writer data from the local store - Map writers = - HiveIcebergRecordWriter.removeWriters(context.getTaskAttemptID()); - - // Remove files if it was not done already - if (writers != null) { - for (HiveIcebergRecordWriter writer : writers.values()) { - writer.close(true); - } - } - } - - /** - * Reads the commit files stored in the temp directories and collects the generated committed data - * files. Appends the data files to the tables. At the end removes the temporary directories. - * - * @param originalContext The job context - * @throws IOException if there is a failure accessing the files - */ - @Override - public void commitJob(JobContext originalContext) throws IOException { - JobContext jobContext = TezUtil.enrichContextWithVertexId(originalContext); - JobConf jobConf = jobContext.getJobConf(); - - long startTime = System.currentTimeMillis(); - LOG.info("Committing job {} has started", jobContext.getJobID()); - - Collection outputs = HiveIcebergStorageHandler.outputTables(jobContext.getJobConf()); - Collection jobLocations = new ConcurrentLinkedQueue<>(); - - ExecutorService fileExecutor = fileExecutor(jobConf); - ExecutorService tableExecutor = tableExecutor(jobConf, outputs.size()); - try { - // Commits the changes for the output tables in parallel - Tasks.foreach(outputs) - .throwFailureWhenFinished() - .stopOnFailure() - .executeWith(tableExecutor) - .run( - output -> { - Table table = HiveIcebergStorageHandler.table(jobConf, output); - if (table != null) { - String catalogName = HiveIcebergStorageHandler.catalogName(jobConf, output); - jobLocations.add( - generateJobLocation(table.location(), jobConf, jobContext.getJobID())); - commitTable( - table.io(), fileExecutor, jobContext, output, table.location(), catalogName); - } else { - LOG.info( - "CommitJob found no serialized table in config for table: {}. Skipping job commit.", - output); - } - }); - } finally { - fileExecutor.shutdown(); - if (tableExecutor != null) { - tableExecutor.shutdown(); - } - } - - LOG.info( - "Commit took {} ms for job {}", - System.currentTimeMillis() - startTime, - jobContext.getJobID()); - - cleanup(jobContext, jobLocations); - } - - /** - * Removes the generated data files if there is a commit file already generated for them. The - * cleanup at the end removes the temporary directories as well. - * - * @param originalContext The job context - * @param status The status of the job - * @throws IOException if there is a failure deleting the files - */ - @Override - public void abortJob(JobContext originalContext, int status) throws IOException { - JobContext jobContext = TezUtil.enrichContextWithVertexId(originalContext); - JobConf jobConf = jobContext.getJobConf(); - - LOG.info("Job {} is aborted. Data file cleaning started", jobContext.getJobID()); - Collection outputs = HiveIcebergStorageHandler.outputTables(jobContext.getJobConf()); - Collection jobLocations = new ConcurrentLinkedQueue<>(); - - ExecutorService fileExecutor = fileExecutor(jobConf); - ExecutorService tableExecutor = tableExecutor(jobConf, outputs.size()); - try { - // Cleans up the changes for the output tables in parallel - Tasks.foreach(outputs) - .suppressFailureWhenFinished() - .executeWith(tableExecutor) - .onFailure((output, exc) -> LOG.warn("Failed cleanup table {} on abort job", output, exc)) - .run( - output -> { - LOG.info("Cleaning table {} with job id {}", output, jobContext.getJobID()); - Table table = HiveIcebergStorageHandler.table(jobConf, output); - jobLocations.add( - generateJobLocation(table.location(), jobConf, jobContext.getJobID())); - Collection dataFiles = - dataFiles(fileExecutor, table.location(), jobContext, table.io(), false); - - // Check if we have files already committed and remove data files if there are any - if (!dataFiles.isEmpty()) { - Tasks.foreach(dataFiles) - .retry(3) - .suppressFailureWhenFinished() - .executeWith(fileExecutor) - .onFailure( - (file, exc) -> - LOG.warn( - "Failed to remove data file {} on abort job", - file.location(), - exc)) - .run(file -> table.io().deleteFile(file.location())); - } - }); - } finally { - fileExecutor.shutdown(); - if (tableExecutor != null) { - tableExecutor.shutdown(); - } - } - - LOG.info("Job {} is aborted. Data file cleaning finished", jobContext.getJobID()); - - cleanup(jobContext, jobLocations); - } - - /** - * Collects the additions to a single table and adds/commits the new files to the Iceberg table. - * - * @param io The io to read the forCommit files - * @param executor The executor used to read the forCommit files - * @param jobContext The job context - * @param name The name of the table used for loading from the catalog - * @param location The location of the table used for loading from the catalog - * @param catalogName The name of the catalog that contains the table - */ - private void commitTable( - FileIO io, - ExecutorService executor, - JobContext jobContext, - String name, - String location, - String catalogName) { - JobConf conf = jobContext.getJobConf(); - Properties catalogProperties = new Properties(); - catalogProperties.put(Catalogs.NAME, name); - catalogProperties.put(Catalogs.LOCATION, location); - if (catalogName != null) { - catalogProperties.put(InputFormatConfig.CATALOG_NAME, catalogName); - } - Table table = Catalogs.loadTable(conf, catalogProperties); - - long startTime = System.currentTimeMillis(); - LOG.info( - "Committing job has started for table: {}, using location: {}", - table, - generateJobLocation(location, conf, jobContext.getJobID())); - - Collection dataFiles = dataFiles(executor, location, jobContext, io, true); - - if (!dataFiles.isEmpty()) { - // Appending data files to the table - AppendFiles append = table.newAppend(); - dataFiles.forEach(append::appendFile); - append.commit(); - LOG.info( - "Commit took {} ms for table: {} with {} file(s)", - System.currentTimeMillis() - startTime, - table, - dataFiles.size()); - LOG.debug("Added files {}", dataFiles); - } else { - LOG.info( - "Commit took {} ms for table: {} with no new files", - System.currentTimeMillis() - startTime, - table); - } - } - - /** - * Cleans up the jobs temporary locations. For every target table there is a temp dir to clean up. - * - * @param jobContext The job context - * @param jobLocations The locations to clean up - * @throws IOException if there is a failure deleting the files - */ - private void cleanup(JobContext jobContext, Collection jobLocations) throws IOException { - JobConf jobConf = jobContext.getJobConf(); - - LOG.info("Cleaning for job {} started", jobContext.getJobID()); - - // Remove the job's temp directories recursively. - Tasks.foreach(jobLocations) - .retry(3) - .suppressFailureWhenFinished() - .onFailure( - (jobLocation, exc) -> - LOG.debug("Failed to remove directory {} on job cleanup", jobLocation, exc)) - .run( - jobLocation -> { - LOG.info("Cleaning location: {}", jobLocation); - Path toDelete = new Path(jobLocation); - FileSystem fs = Util.getFs(toDelete, jobConf); - fs.delete(toDelete, true); - }, - IOException.class); - - LOG.info("Cleaning for job {} finished", jobContext.getJobID()); - } - - /** - * Executor service for parallel handling of file reads. Should be shared when committing multiple - * tables. - * - * @param conf The configuration containing the pool size - * @return The generated executor service - */ - private static ExecutorService fileExecutor(Configuration conf) { - int size = - conf.getInt( - InputFormatConfig.COMMIT_FILE_THREAD_POOL_SIZE, - InputFormatConfig.COMMIT_FILE_THREAD_POOL_SIZE_DEFAULT); - return Executors.newFixedThreadPool( - size, - new ThreadFactoryBuilder() - .setDaemon(true) - .setPriority(Thread.NORM_PRIORITY) - .setNameFormat("iceberg-commit-file-pool-%d") - .build()); - } - - /** - * Executor service for parallel handling of table manipulation. Could return null, if no - * parallelism is possible. - * - * @param conf The configuration containing the pool size - * @param maxThreadNum The number of requests we want to handle (might be decreased further by - * configuration) - * @return The generated executor service, or null if executor is not needed. - */ - private static ExecutorService tableExecutor(Configuration conf, int maxThreadNum) { - int size = - conf.getInt( - InputFormatConfig.COMMIT_TABLE_THREAD_POOL_SIZE, - InputFormatConfig.COMMIT_TABLE_THREAD_POOL_SIZE_DEFAULT); - size = Math.min(maxThreadNum, size); - if (size > 1) { - return Executors.newFixedThreadPool( - size, - new ThreadFactoryBuilder() - .setDaemon(true) - .setPriority(Thread.NORM_PRIORITY) - .setNameFormat("iceberg-commit-table-pool-%d") - .build()); - } else { - return null; - } - } - - /** - * Get the committed data files for this table and job. - * - * @param executor The executor used for reading the forCommit files parallel - * @param location The location of the table - * @param jobContext The job context - * @param io The FileIO used for reading a files generated for commit - * @param throwOnFailure If true then it throws an exception on failure - * @return The list of the committed data files - */ - private static Collection dataFiles( - ExecutorService executor, - String location, - JobContext jobContext, - FileIO io, - boolean throwOnFailure) { - JobConf conf = jobContext.getJobConf(); - // If there are reducers, then every reducer will generate a result file. - // If this is a map only task, then every mapper will generate a result file. - int expectedFiles = - conf.getNumReduceTasks() > 0 ? conf.getNumReduceTasks() : conf.getNumMapTasks(); - - Collection dataFiles = new ConcurrentLinkedQueue<>(); - - // Reading the committed files. The assumption here is that the taskIds are generated in - // sequential order - // starting from 0. - Tasks.range(expectedFiles) - .throwFailureWhenFinished(throwOnFailure) - .executeWith(executor) - .retry(3) - .run( - taskId -> { - String taskFileName = - generateFileForCommitLocation(location, conf, jobContext.getJobID(), taskId); - dataFiles.addAll(Arrays.asList(readFileForCommit(taskFileName, io))); - }); - - return dataFiles; - } - - /** - * Generates the job temp location based on the job configuration. Currently it uses - * TABLE_LOCATION/temp/QUERY_ID-jobId. - * - * @param location The location of the table - * @param conf The job's configuration - * @param jobId The JobID for the task - * @return The file to store the results - */ - @VisibleForTesting - static String generateJobLocation(String location, Configuration conf, JobID jobId) { - String queryId = conf.get(HiveConf.ConfVars.HIVEQUERYID.varname); - return location + "/temp/" + queryId + "-" + jobId; - } - - /** - * Generates file location based on the task configuration and a specific task id. This file will - * be used to store the data required to generate the Iceberg commit. Currently it uses - * TABLE_LOCATION/temp/QUERY_ID-jobId/task-[0..numTasks).forCommit. - * - * @param location The location of the table - * @param conf The job's configuration - * @param jobId The jobId for the task - * @param taskId The taskId for the commit file - * @return The file to store the results - */ - private static String generateFileForCommitLocation( - String location, Configuration conf, JobID jobId, int taskId) { - return generateJobLocation(location, conf, jobId) + "/task-" + taskId + FOR_COMMIT_EXTENSION; - } - - private static void createFileForCommit(DataFile[] closedFiles, String location, FileIO io) - throws IOException { - - OutputFile fileForCommit = io.newOutputFile(location); - try (ObjectOutputStream oos = new ObjectOutputStream(fileForCommit.createOrOverwrite())) { - oos.writeObject(closedFiles); - } - LOG.debug("Iceberg committed file is created {}", fileForCommit); - } - - @SuppressWarnings("DangerousJavaDeserialization") - private static DataFile[] readFileForCommit(String fileForCommitLocation, FileIO io) { - try (ObjectInputStream ois = - new ObjectInputStream(io.newInputFile(fileForCommitLocation).newStream())) { - return (DataFile[]) ois.readObject(); - } catch (ClassNotFoundException | IOException e) { - throw new NotFoundException( - "Can not read or parse committed file: %s", fileForCommitLocation); - } - } -} diff --git a/mr/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergOutputFormat.java b/mr/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergOutputFormat.java deleted file mode 100644 index 2ae1d371530c..000000000000 --- a/mr/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergOutputFormat.java +++ /dev/null @@ -1,113 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.mr.hive; - -import java.util.Properties; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hive.conf.HiveConf; -import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants; -import org.apache.hadoop.hive.ql.exec.FileSinkOperator; -import org.apache.hadoop.hive.ql.io.HiveOutputFormat; -import org.apache.hadoop.io.NullWritable; -import org.apache.hadoop.mapred.JobConf; -import org.apache.hadoop.mapred.OutputFormat; -import org.apache.hadoop.mapred.TaskAttemptID; -import org.apache.hadoop.util.Progressable; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.data.GenericAppenderFactory; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.io.FileIO; -import org.apache.iceberg.io.OutputFileFactory; -import org.apache.iceberg.mr.Catalogs; -import org.apache.iceberg.mr.mapred.Container; -import org.apache.iceberg.util.PropertyUtil; - -public class HiveIcebergOutputFormat - implements OutputFormat>, - HiveOutputFormat> { - - @Override - public FileSinkOperator.RecordWriter getHiveRecordWriter( - JobConf jc, - Path finalOutPath, - Class valueClass, - boolean isCompressed, - Properties tableAndSerDeProperties, - Progressable progress) { - return writer(jc); - } - - @Override - public org.apache.hadoop.mapred.RecordWriter> getRecordWriter( - FileSystem ignored, JobConf job, String name, Progressable progress) { - return writer(job); - } - - @Override - public void checkOutputSpecs(FileSystem ignored, JobConf job) { - // Not doing any check. - } - - private static HiveIcebergRecordWriter writer(JobConf jc) { - TaskAttemptID taskAttemptID = TezUtil.taskAttemptWrapper(jc); - // It gets the config from the FileSinkOperator which has its own config for every target table - Table table = - HiveIcebergStorageHandler.table(jc, jc.get(hive_metastoreConstants.META_TABLE_NAME)); - Schema schema = HiveIcebergStorageHandler.schema(jc); - PartitionSpec spec = table.spec(); - FileFormat fileFormat = - FileFormat.fromString( - PropertyUtil.propertyAsString( - table.properties(), - TableProperties.DEFAULT_FILE_FORMAT, - TableProperties.DEFAULT_FILE_FORMAT_DEFAULT)); - long targetFileSize = - PropertyUtil.propertyAsLong( - table.properties(), - TableProperties.WRITE_TARGET_FILE_SIZE_BYTES, - TableProperties.WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT); - FileIO io = table.io(); - int partitionId = taskAttemptID.getTaskID().getId(); - int taskId = taskAttemptID.getId(); - String operationId = - jc.get(HiveConf.ConfVars.HIVEQUERYID.varname) + "-" + taskAttemptID.getJobID(); - OutputFileFactory outputFileFactory = - OutputFileFactory.builderFor(table, partitionId, taskId) - .format(fileFormat) - .operationId(operationId) - .build(); - String tableName = jc.get(Catalogs.NAME); - - return new HiveIcebergRecordWriter( - schema, - spec, - fileFormat, - new GenericAppenderFactory(schema, spec), - outputFileFactory, - io, - targetFileSize, - taskAttemptID, - tableName); - } -} diff --git a/mr/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergRecordWriter.java b/mr/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergRecordWriter.java deleted file mode 100644 index 9fc578e88a62..000000000000 --- a/mr/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergRecordWriter.java +++ /dev/null @@ -1,127 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.mr.hive; - -import java.io.IOException; -import java.util.Map; -import org.apache.hadoop.hive.ql.exec.FileSinkOperator; -import org.apache.hadoop.io.NullWritable; -import org.apache.hadoop.io.Writable; -import org.apache.hadoop.mapred.Reporter; -import org.apache.hadoop.mapred.TaskAttemptID; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.PartitionKey; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.data.InternalRecordWrapper; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.io.FileAppenderFactory; -import org.apache.iceberg.io.FileIO; -import org.apache.iceberg.io.OutputFileFactory; -import org.apache.iceberg.io.PartitionedFanoutWriter; -import org.apache.iceberg.mr.mapred.Container; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.util.Tasks; -import org.apache.iceberg.util.ThreadPools; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -class HiveIcebergRecordWriter extends PartitionedFanoutWriter - implements FileSinkOperator.RecordWriter, - org.apache.hadoop.mapred.RecordWriter> { - private static final Logger LOG = LoggerFactory.getLogger(HiveIcebergRecordWriter.class); - - // The current key is reused at every write to avoid unnecessary object creation - private final PartitionKey currentKey; - private final FileIO io; - private final InternalRecordWrapper wrapper; - - // > map to store the active writers - // Stored in concurrent map, since some executor engines can share containers - private static final Map> WRITERS = - Maps.newConcurrentMap(); - - static Map removeWriters(TaskAttemptID taskAttemptID) { - return WRITERS.remove(taskAttemptID); - } - - static Map getWriters(TaskAttemptID taskAttemptID) { - return WRITERS.get(taskAttemptID); - } - - HiveIcebergRecordWriter( - Schema schema, - PartitionSpec spec, - FileFormat format, - FileAppenderFactory appenderFactory, - OutputFileFactory fileFactory, - FileIO io, - long targetFileSize, - TaskAttemptID taskAttemptID, - String tableName) { - super(spec, format, appenderFactory, fileFactory, io, targetFileSize); - this.io = io; - this.currentKey = new PartitionKey(spec, schema); - this.wrapper = new InternalRecordWrapper(schema.asStruct()); - WRITERS.putIfAbsent(taskAttemptID, Maps.newConcurrentMap()); - WRITERS.get(taskAttemptID).put(tableName, this); - } - - @Override - protected PartitionKey partition(Record row) { - currentKey.partition(wrapper.wrap(row)); - return currentKey; - } - - @Override - public void write(Writable row) throws IOException { - super.write(((Container) row).get()); - } - - @Override - public void write(NullWritable key, Container value) throws IOException { - write(value); - } - - @Override - public void close(boolean abort) throws IOException { - DataFile[] dataFiles = super.dataFiles(); - - // If abort then remove the unnecessary files - if (abort) { - Tasks.foreach(dataFiles) - .executeWith(ThreadPools.getWorkerPool()) - .retry(3) - .suppressFailureWhenFinished() - .onFailure( - (file, exception) -> - LOG.debug("Failed on to remove file {} on abort", file, exception)) - .run(dataFile -> io.deleteFile(dataFile.location())); - } - - LOG.info( - "IcebergRecordWriter is closed with abort={}. Created {} files", abort, dataFiles.length); - } - - @Override - public void close(Reporter reporter) throws IOException { - close(false); - } -} diff --git a/mr/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergSerDe.java b/mr/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergSerDe.java deleted file mode 100644 index 59bb38e5b2d6..000000000000 --- a/mr/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergSerDe.java +++ /dev/null @@ -1,213 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.mr.hive; - -import java.util.Arrays; -import java.util.Collections; -import java.util.List; -import java.util.Map; -import java.util.Properties; -import javax.annotation.Nullable; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.hive.serde.serdeConstants; -import org.apache.hadoop.hive.serde2.AbstractSerDe; -import org.apache.hadoop.hive.serde2.ColumnProjectionUtils; -import org.apache.hadoop.hive.serde2.SerDeException; -import org.apache.hadoop.hive.serde2.SerDeStats; -import org.apache.hadoop.hive.serde2.SerDeUtils; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; -import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; -import org.apache.hadoop.io.Writable; -import org.apache.iceberg.Schema; -import org.apache.iceberg.SchemaParser; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.hive.HiveSchemaUtil; -import org.apache.iceberg.mr.Catalogs; -import org.apache.iceberg.mr.InputFormatConfig; -import org.apache.iceberg.mr.hive.serde.objectinspector.IcebergObjectInspector; -import org.apache.iceberg.mr.mapred.Container; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -public class HiveIcebergSerDe extends AbstractSerDe { - private static final Logger LOG = LoggerFactory.getLogger(HiveIcebergSerDe.class); - private static final String LIST_COLUMN_COMMENT = "columns.comments"; - - private ObjectInspector inspector; - private Schema tableSchema; - private final Map deserializers = - Maps.newHashMapWithExpectedSize(1); - private final Container row = new Container<>(); - - @Override - public void initialize(@Nullable Configuration configuration, Properties serDeProperties) - throws SerDeException { - // HiveIcebergSerDe.initialize is called multiple places in Hive code: - // - When we are trying to create a table - HiveDDL data is stored at the serDeProperties, but - // no Iceberg table - // is created yet. - // - When we are compiling the Hive query on HiveServer2 side - We only have table information - // (location/name), - // and we have to read the schema using the table data. This is called multiple times so there - // is room for - // optimizing here. - // - When we are executing the Hive query in the execution engine - We do not want to load the - // table data on every - // executor, but serDeProperties are populated by - // HiveIcebergStorageHandler.configureInputJobProperties() and - // the resulting properties are serialized and distributed to the executors - - if (serDeProperties.get(InputFormatConfig.TABLE_SCHEMA) != null) { - this.tableSchema = - SchemaParser.fromJson((String) serDeProperties.get(InputFormatConfig.TABLE_SCHEMA)); - } else { - try { - // always prefer the original table schema if there is one - this.tableSchema = Catalogs.loadTable(configuration, serDeProperties).schema(); - LOG.info("Using schema from existing table {}", SchemaParser.toJson(tableSchema)); - } catch (Exception e) { - boolean autoConversion = - configuration.getBoolean(InputFormatConfig.SCHEMA_AUTO_CONVERSION, false); - // If we can not load the table try the provided hive schema - this.tableSchema = hiveSchemaOrThrow(serDeProperties, e, autoConversion); - } - } - - Schema projectedSchema; - if (serDeProperties.get(HiveIcebergStorageHandler.WRITE_KEY) != null) { - // when writing out data, we should not do projection pushdown - projectedSchema = tableSchema; - } else { - configuration.setBoolean(InputFormatConfig.CASE_SENSITIVE, false); - String[] selectedColumns = ColumnProjectionUtils.getReadColumnNames(configuration); - // When same table is joined multiple times, it is possible some selected columns are - // duplicated, - // in this case wrong recordStructField position leads wrong value or - // ArrayIndexOutOfBoundException - String[] distinctSelectedColumns = - Arrays.stream(selectedColumns).distinct().toArray(String[]::new); - projectedSchema = - distinctSelectedColumns.length > 0 - ? tableSchema.caseInsensitiveSelect(distinctSelectedColumns) - : tableSchema; - // the input split mapper handles does not belong to this table - // it is necessary to ensure projectedSchema equals to tableSchema, - // or we cannot find selectOperator's column from inspector - if (projectedSchema.columns().size() != distinctSelectedColumns.length) { - projectedSchema = tableSchema; - } - } - - try { - this.inspector = IcebergObjectInspector.create(projectedSchema); - } catch (Exception e) { - throw new SerDeException(e); - } - } - - @Override - public Class getSerializedClass() { - return Container.class; - } - - @Override - public Writable serialize(Object o, ObjectInspector objectInspector) { - Deserializer deserializer = deserializers.get(objectInspector); - if (deserializer == null) { - deserializer = - new Deserializer.Builder() - .schema(tableSchema) - .sourceInspector((StructObjectInspector) objectInspector) - .writerInspector((StructObjectInspector) inspector) - .build(); - deserializers.put(objectInspector, deserializer); - } - - row.set(deserializer.deserialize(o)); - return row; - } - - @Override - public SerDeStats getSerDeStats() { - return null; - } - - @Override - public Object deserialize(Writable writable) { - return ((Container) writable).get(); - } - - @Override - public ObjectInspector getObjectInspector() { - return inspector; - } - - /** - * Gets the hive schema from the serDeProperties, and throws an exception if it is not provided. - * In the later case it adds the previousException as a root cause. - * - * @param serDeProperties The source of the hive schema - * @param previousException If we had an exception previously - * @param autoConversion When true, convert unsupported types to more permissive - * ones, like tinyint to int - * @return The hive schema parsed from the serDeProperties - * @throws SerDeException If there is no schema information in the serDeProperties - */ - private static Schema hiveSchemaOrThrow( - Properties serDeProperties, Exception previousException, boolean autoConversion) - throws SerDeException { - // Read the configuration parameters - String columnNames = serDeProperties.getProperty(serdeConstants.LIST_COLUMNS); - String columnTypes = serDeProperties.getProperty(serdeConstants.LIST_COLUMN_TYPES); - // No constant for column comments and column comments delimiter. - String columnComments = serDeProperties.getProperty(LIST_COLUMN_COMMENT); - String columnNameDelimiter = - serDeProperties.containsKey(serdeConstants.COLUMN_NAME_DELIMITER) - ? serDeProperties.getProperty(serdeConstants.COLUMN_NAME_DELIMITER) - : String.valueOf(SerDeUtils.COMMA); - if (columnNames != null - && columnTypes != null - && columnNameDelimiter != null - && !columnNames.isEmpty() - && !columnTypes.isEmpty() - && !columnNameDelimiter.isEmpty()) { - // Parse the configuration parameters - List names = Lists.newArrayList(); - Collections.addAll(names, columnNames.split(columnNameDelimiter)); - List comments = Lists.newArrayList(); - if (columnComments != null) { - Collections.addAll(comments, columnComments.split(Character.toString(Character.MIN_VALUE))); - } - Schema hiveSchema = - HiveSchemaUtil.convert( - names, - TypeInfoUtils.getTypeInfosFromTypeString(columnTypes), - comments, - autoConversion); - LOG.info("Using hive schema {}", SchemaParser.toJson(hiveSchema)); - return hiveSchema; - } else { - throw new SerDeException( - "Please provide an existing table or a valid schema", previousException); - } - } -} diff --git a/mr/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergSplit.java b/mr/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergSplit.java deleted file mode 100644 index 25bc3c523009..000000000000 --- a/mr/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergSplit.java +++ /dev/null @@ -1,95 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.mr.hive; - -import java.io.DataInput; -import java.io.DataOutput; -import java.io.IOException; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.mapred.FileSplit; -import org.apache.iceberg.mr.mapreduce.IcebergSplit; -import org.apache.iceberg.mr.mapreduce.IcebergSplitContainer; -import org.apache.iceberg.util.SerializationUtil; - -// Hive requires file formats to return splits that are instances of `FileSplit`. -public class HiveIcebergSplit extends FileSplit implements IcebergSplitContainer { - - private IcebergSplit innerSplit; - - // Hive uses the path name of a split to map it back to a partition (`PartitionDesc`) or table - // description object - // (`TableDesc`) which specifies the relevant input format for reading the files belonging to that - // partition or table. - // That way, `HiveInputFormat` and `CombineHiveInputFormat` can read files with different file - // formats in the same - // MapReduce job and merge compatible splits together. - private String tableLocation; - - // public no-argument constructor for deserialization - public HiveIcebergSplit() {} - - HiveIcebergSplit(IcebergSplit split, String tableLocation) { - this.innerSplit = split; - this.tableLocation = tableLocation; - } - - @Override - public IcebergSplit icebergSplit() { - return innerSplit; - } - - @Override - public long getLength() { - return innerSplit.getLength(); - } - - @Override - public String[] getLocations() { - return innerSplit.getLocations(); - } - - @Override - public Path getPath() { - return new Path(tableLocation); - } - - @Override - public long getStart() { - return 0; - } - - @Override - public void write(DataOutput out) throws IOException { - byte[] bytes = SerializationUtil.serializeToBytes(tableLocation); - out.writeInt(bytes.length); - out.write(bytes); - - innerSplit.write(out); - } - - @Override - public void readFields(DataInput in) throws IOException { - byte[] bytes = new byte[in.readInt()]; - in.readFully(bytes); - tableLocation = SerializationUtil.deserializeFromBytes(bytes); - - innerSplit = new IcebergSplit(); - innerSplit.readFields(in); - } -} diff --git a/mr/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java b/mr/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java deleted file mode 100644 index da40f4c73ef3..000000000000 --- a/mr/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java +++ /dev/null @@ -1,323 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.mr.hive; - -import java.io.Serializable; -import java.util.Collection; -import java.util.Map; -import java.util.Properties; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.hive.metastore.HiveMetaHook; -import org.apache.hadoop.hive.ql.metadata.HiveStorageHandler; -import org.apache.hadoop.hive.ql.metadata.HiveStoragePredicateHandler; -import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; -import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc; -import org.apache.hadoop.hive.ql.plan.TableDesc; -import org.apache.hadoop.hive.ql.security.authorization.HiveAuthorizationProvider; -import org.apache.hadoop.hive.serde2.AbstractSerDe; -import org.apache.hadoop.hive.serde2.Deserializer; -import org.apache.hadoop.mapred.InputFormat; -import org.apache.hadoop.mapred.JobConf; -import org.apache.hadoop.mapred.OutputFormat; -import org.apache.iceberg.Schema; -import org.apache.iceberg.SchemaParser; -import org.apache.iceberg.SerializableTable; -import org.apache.iceberg.Table; -import org.apache.iceberg.hadoop.HadoopConfigurable; -import org.apache.iceberg.mr.Catalogs; -import org.apache.iceberg.mr.InputFormatConfig; -import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.base.Splitter; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.util.SerializationUtil; - -public class HiveIcebergStorageHandler implements HiveStoragePredicateHandler, HiveStorageHandler { - private static final Splitter TABLE_NAME_SPLITTER = Splitter.on(".."); - private static final String TABLE_NAME_SEPARATOR = ".."; - - static final String WRITE_KEY = "HiveIcebergStorageHandler_write"; - - private Configuration conf; - - @Override - public Class getInputFormatClass() { - return HiveIcebergInputFormat.class; - } - - @Override - public Class getOutputFormatClass() { - return HiveIcebergOutputFormat.class; - } - - @Override - public Class getSerDeClass() { - return HiveIcebergSerDe.class; - } - - @Override - public HiveMetaHook getMetaHook() { - return new HiveIcebergMetaHook(conf); - } - - @Override - public HiveAuthorizationProvider getAuthorizationProvider() { - return null; - } - - @Override - public void configureInputJobProperties(TableDesc tableDesc, Map map) { - overlayTableProperties(conf, tableDesc, map); - } - - @Override - public void configureOutputJobProperties(TableDesc tableDesc, Map map) { - overlayTableProperties(conf, tableDesc, map); - // For Tez, setting the committer here is enough to make sure it'll be part of the jobConf - map.put("mapred.output.committer.class", HiveIcebergOutputCommitter.class.getName()); - // For MR, the jobConf is set only in configureJobConf, so we're setting the write key here to - // detect it over there - map.put(WRITE_KEY, "true"); - // Putting the key into the table props as well, so that projection pushdown can be determined - // on a - // table-level and skipped only for output tables in HiveIcebergSerde. Properties from the map - // will be present in - // the serde config for all tables in the query, not just the output tables, so we can't rely on - // that in the serde. - tableDesc.getProperties().put(WRITE_KEY, "true"); - } - - @Override - public void configureTableJobProperties(TableDesc tableDesc, Map map) {} - - // Override annotation commented out, since this interface method has been introduced only in Hive - // 3 - // @Override - public void configureInputJobCredentials(TableDesc tableDesc, Map secrets) {} - - @Override - public void configureJobConf(TableDesc tableDesc, JobConf jobConf) { - if (tableDesc != null - && tableDesc.getProperties() != null - && tableDesc.getProperties().get(WRITE_KEY) != null) { - String tableName = tableDesc.getTableName(); - Preconditions.checkArgument( - !tableName.contains(TABLE_NAME_SEPARATOR), - "Can not handle table " - + tableName - + ". Its name contains '" - + TABLE_NAME_SEPARATOR - + "'"); - String tables = jobConf.get(InputFormatConfig.OUTPUT_TABLES); - tables = tables == null ? tableName : tables + TABLE_NAME_SEPARATOR + tableName; - jobConf.set("mapred.output.committer.class", HiveIcebergOutputCommitter.class.getName()); - jobConf.set(InputFormatConfig.OUTPUT_TABLES, tables); - - String catalogName = tableDesc.getProperties().getProperty(InputFormatConfig.CATALOG_NAME); - if (catalogName != null) { - jobConf.set(InputFormatConfig.TABLE_CATALOG_PREFIX + tableName, catalogName); - } - } - } - - @Override - public Configuration getConf() { - return conf; - } - - @Override - public void setConf(Configuration conf) { - this.conf = conf; - } - - @Override - public String toString() { - return this.getClass().getName(); - } - - /** - * @param jobConf Job configuration for InputFormat to access - * @param deserializer Deserializer - * @param exprNodeDesc Filter expression extracted by Hive - * @return Entire filter to take advantage of Hive's pruning as well as Iceberg's pruning. - */ - @Override - public DecomposedPredicate decomposePredicate( - JobConf jobConf, Deserializer deserializer, ExprNodeDesc exprNodeDesc) { - DecomposedPredicate predicate = new DecomposedPredicate(); - predicate.residualPredicate = (ExprNodeGenericFuncDesc) exprNodeDesc; - predicate.pushedPredicate = (ExprNodeGenericFuncDesc) exprNodeDesc; - return predicate; - } - - /** - * Returns the Table serialized to the configuration based on the table name. If configuration is - * missing from the FileIO of the table, it will be populated with the input config. - * - * @param config The configuration used to get the data from - * @param name The name of the table we need as returned by TableDesc.getTableName() - * @return The Table - */ - public static Table table(Configuration config, String name) { - Table table = - SerializationUtil.deserializeFromBase64( - config.get(InputFormatConfig.SERIALIZED_TABLE_PREFIX + name)); - checkAndSetIoConfig(config, table); - return table; - } - - /** - * If enabled, it populates the FileIO's hadoop configuration with the input config object. This - * might be necessary when the table object was serialized without the FileIO config. - * - * @param config Configuration to set for FileIO, if enabled - * @param table The Iceberg table object - */ - public static void checkAndSetIoConfig(Configuration config, Table table) { - if (table != null - && config.getBoolean( - InputFormatConfig.CONFIG_SERIALIZATION_DISABLED, - InputFormatConfig.CONFIG_SERIALIZATION_DISABLED_DEFAULT) - && table.io() instanceof HadoopConfigurable) { - ((HadoopConfigurable) table.io()).setConf(config); - } - } - - /** - * If enabled, it ensures that the FileIO's hadoop configuration will not be serialized. This - * might be desirable for decreasing the overall size of serialized table objects. - * - *

Note: Skipping FileIO config serialization in this fashion might in turn necessitate calling - * {@link #checkAndSetIoConfig(Configuration, Table)} on the deserializer-side to enable - * subsequent use of the FileIO. - * - * @param config Configuration to set for FileIO in a transient manner, if enabled - * @param table The Iceberg table object - */ - public static void checkAndSkipIoConfigSerialization(Configuration config, Table table) { - if (table != null - && config.getBoolean( - InputFormatConfig.CONFIG_SERIALIZATION_DISABLED, - InputFormatConfig.CONFIG_SERIALIZATION_DISABLED_DEFAULT) - && table.io() instanceof HadoopConfigurable) { - ((HadoopConfigurable) table.io()) - .serializeConfWith(conf -> new NonSerializingConfig(config)::get); - } - } - - /** - * Returns the names of the output tables stored in the configuration. - * - * @param config The configuration used to get the data from - * @return The collection of the table names as returned by TableDesc.getTableName() - */ - public static Collection outputTables(Configuration config) { - return TABLE_NAME_SPLITTER.splitToList(config.get(InputFormatConfig.OUTPUT_TABLES)); - } - - /** - * Returns the catalog name serialized to the configuration. - * - * @param config The configuration used to get the data from - * @param name The name of the table we neeed as returned by TableDesc.getTableName() - * @return catalog name - */ - public static String catalogName(Configuration config, String name) { - return config.get(InputFormatConfig.TABLE_CATALOG_PREFIX + name); - } - - /** - * Returns the Table Schema serialized to the configuration. - * - * @param config The configuration used to get the data from - * @return The Table Schema object - */ - public static Schema schema(Configuration config) { - return SchemaParser.fromJson(config.get(InputFormatConfig.TABLE_SCHEMA)); - } - - /** - * Stores the serializable table data in the configuration. Currently the following is handled: - * - *

    - *
  • - Table - in case the table is serializable - *
  • - Location - *
  • - Schema - *
  • - Partition specification - *
  • - FileIO for handling table files - *
  • - Location provider used for file generation - *
  • - Encryption manager for encryption handling - *
- * - * @param configuration The configuration storing the catalog information - * @param tableDesc The table which we want to store to the configuration - * @param map The map of the configuration properties which we append with the serialized data - */ - @VisibleForTesting - static void overlayTableProperties( - Configuration configuration, TableDesc tableDesc, Map map) { - Properties props = tableDesc.getProperties(); - Table table = Catalogs.loadTable(configuration, props); - String schemaJson = SchemaParser.toJson(table.schema()); - - Maps.fromProperties(props).entrySet().stream() - .filter(entry -> !map.containsKey(entry.getKey())) // map overrides tableDesc properties - .forEach(entry -> map.put(entry.getKey(), entry.getValue())); - - map.put(InputFormatConfig.TABLE_IDENTIFIER, props.getProperty(Catalogs.NAME)); - map.put(InputFormatConfig.TABLE_LOCATION, table.location()); - map.put(InputFormatConfig.TABLE_SCHEMA, schemaJson); - - // serialize table object into config - Table serializableTable = SerializableTable.copyOf(table); - checkAndSkipIoConfigSerialization(configuration, serializableTable); - map.put( - InputFormatConfig.SERIALIZED_TABLE_PREFIX + tableDesc.getTableName(), - SerializationUtil.serializeToBase64(serializableTable)); - - // We need to remove this otherwise the job.xml will be invalid as column comments are separated - // with '\0' and - // the serialization utils fail to serialize this character - map.remove("columns.comments"); - - // save schema into table props as well to avoid repeatedly hitting the HMS during serde - // initializations - // this is an exception to the interface documentation, but it's a safe operation to add this - // property - props.put(InputFormatConfig.TABLE_SCHEMA, schemaJson); - } - - private static class NonSerializingConfig implements Serializable { - - private final transient Configuration conf; - - NonSerializingConfig(Configuration conf) { - this.conf = conf; - } - - public Configuration get() { - if (conf == null) { - throw new IllegalStateException( - "Configuration was not serialized on purpose but was not set manually either"); - } - - return conf; - } - } -} diff --git a/mr/src/main/java/org/apache/iceberg/mr/hive/TezUtil.java b/mr/src/main/java/org/apache/iceberg/mr/hive/TezUtil.java deleted file mode 100644 index a5cb2dcbc3ac..000000000000 --- a/mr/src/main/java/org/apache/iceberg/mr/hive/TezUtil.java +++ /dev/null @@ -1,131 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.mr.hive; - -import java.util.Objects; -import org.apache.hadoop.mapred.JobConf; -import org.apache.hadoop.mapred.JobContext; -import org.apache.hadoop.mapred.JobContextImpl; -import org.apache.hadoop.mapred.TaskAttemptContext; -import org.apache.hadoop.mapred.TaskAttemptContextImpl; -import org.apache.hadoop.mapred.TaskAttemptID; -import org.apache.hadoop.mapreduce.JobID; - -public class TezUtil { - - private static final String TASK_ATTEMPT_ID_KEY = "mapred.task.id"; - // TezProcessor (Hive) propagates the vertex id under this key - available during Task commit - // phase - private static final String TEZ_VERTEX_ID_HIVE = "hive.tez.vertex.index"; - // MROutputCommitter (Tez) propagates the vertex id under this key - available during DAG/Vertex - // commit phase - private static final String TEZ_VERTEX_ID_DAG = "mapreduce.task.vertex.id"; - - /** - * If the Tez vertex id is present in config, creates a new jobContext by appending the Tez vertex - * id to the jobID. For the rationale behind this enrichment, please refer to point #1 in the docs - * of {@link TaskAttemptWrapper}. - * - * @param jobContext original jobContext to be enriched - * @return enriched jobContext - */ - public static JobContext enrichContextWithVertexId(JobContext jobContext) { - String vertexId = jobContext.getJobConf().get(TEZ_VERTEX_ID_DAG); - if (vertexId != null) { - JobID jobID = getJobIDWithVertexAppended(jobContext.getJobID(), vertexId); - return new JobContextImpl(jobContext.getJobConf(), jobID, jobContext.getProgressible()); - } else { - return jobContext; - } - } - - /** - * Creates a new taskAttemptContext by replacing the taskAttemptID with a wrapped object. For the - * rationale behind this enrichment, please refer to point #2 in the docs of {@link - * TaskAttemptWrapper}. - * - * @param taskAttemptContext original taskAttemptContext to be enriched - * @return enriched taskAttemptContext - */ - public static TaskAttemptContext enrichContextWithAttemptWrapper( - TaskAttemptContext taskAttemptContext) { - TaskAttemptID wrapped = TezUtil.taskAttemptWrapper(taskAttemptContext.getTaskAttemptID()); - return new TaskAttemptContextImpl(taskAttemptContext.getJobConf(), wrapped); - } - - public static TaskAttemptID taskAttemptWrapper(TaskAttemptID attemptID) { - return new TaskAttemptWrapper(attemptID, ""); - } - - public static TaskAttemptID taskAttemptWrapper(JobConf jc) { - return new TaskAttemptWrapper( - TaskAttemptID.forName(jc.get(TASK_ATTEMPT_ID_KEY)), jc.get(TEZ_VERTEX_ID_HIVE)); - } - - private static JobID getJobIDWithVertexAppended(JobID jobID, String vertexId) { - if (vertexId != null && !vertexId.isEmpty()) { - return new JobID(jobID.getJtIdentifier() + vertexId, jobID.getId()); - } else { - return jobID; - } - } - - private TezUtil() {} - - /** - * Subclasses {@link org.apache.hadoop.mapred.TaskAttemptID}. It has two main purposes: 1. Provide - * a way to append an optional vertex id to the Job ID. This is needed because there is a - * discrepancy between how the attempt ID is constructed in the {@link - * org.apache.tez.mapreduce.output.MROutput} (with vertex ID appended to the end of the Job ID) - * and how it's available in the mapper (without vertex ID) which creates and caches the - * HiveIcebergRecordWriter object. 2. Redefine the equals/hashcode provided by TaskAttemptID so - * that task type (map or reduce) does not count, and therefore the mapper and reducer threads can - * use the same attempt ID-based key to retrieve the cached HiveIcebergRecordWriter object. - */ - private static class TaskAttemptWrapper extends TaskAttemptID { - - TaskAttemptWrapper(TaskAttemptID attemptID, String vertexId) { - super( - getJobIDWithVertexAppended(attemptID.getJobID(), vertexId).getJtIdentifier(), - attemptID.getJobID().getId(), - attemptID.getTaskType(), - attemptID.getTaskID().getId(), - attemptID.getId()); - } - - @Override - public boolean equals(Object o) { - if (this == o) { - return true; - } - if (o == null || getClass() != o.getClass()) { - return false; - } - TaskAttemptWrapper that = (TaskAttemptWrapper) o; - return getId() == that.getId() - && getTaskID().getId() == that.getTaskID().getId() - && Objects.equals(getJobID(), that.getJobID()); - } - - @Override - public int hashCode() { - return Objects.hash(getId(), getTaskID().getId(), getJobID()); - } - } -} diff --git a/mr/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergBinaryObjectInspector.java b/mr/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergBinaryObjectInspector.java deleted file mode 100644 index e84993baf1ef..000000000000 --- a/mr/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergBinaryObjectInspector.java +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.mr.hive.serde.objectinspector; - -import java.nio.ByteBuffer; -import java.util.Arrays; -import org.apache.hadoop.hive.serde2.objectinspector.primitive.AbstractPrimitiveJavaObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.primitive.BinaryObjectInspector; -import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; -import org.apache.hadoop.io.BytesWritable; -import org.apache.iceberg.util.ByteBuffers; - -public class IcebergBinaryObjectInspector extends AbstractPrimitiveJavaObjectInspector - implements BinaryObjectInspector, WriteObjectInspector { - - private static final IcebergBinaryObjectInspector INSTANCE = new IcebergBinaryObjectInspector(); - - public static IcebergBinaryObjectInspector get() { - return INSTANCE; - } - - private IcebergBinaryObjectInspector() { - super(TypeInfoFactory.binaryTypeInfo); - } - - @Override - public byte[] getPrimitiveJavaObject(Object o) { - return ByteBuffers.toByteArray((ByteBuffer) o); - } - - @Override - public BytesWritable getPrimitiveWritableObject(Object o) { - return o == null ? null : new BytesWritable(getPrimitiveJavaObject(o)); - } - - @Override - public Object copyObject(Object o) { - if (o == null) { - return null; - } - if (o instanceof byte[]) { - byte[] bytes = (byte[]) o; - return Arrays.copyOf(bytes, bytes.length); - } else if (o instanceof ByteBuffer) { - ByteBuffer copy = - ByteBuffer.wrap( - ((ByteBuffer) o).array(), ((ByteBuffer) o).arrayOffset(), ((ByteBuffer) o).limit()); - return copy; - } else { - return o; - } - } - - @Override - public ByteBuffer convert(Object o) { - return o == null ? null : ByteBuffer.wrap((byte[]) o); - } -} diff --git a/mr/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergDateObjectInspector.java b/mr/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergDateObjectInspector.java deleted file mode 100644 index 17a82f430208..000000000000 --- a/mr/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergDateObjectInspector.java +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.mr.hive.serde.objectinspector; - -import java.sql.Date; -import java.time.LocalDate; -import org.apache.hadoop.hive.serde2.io.DateWritable; -import org.apache.hadoop.hive.serde2.objectinspector.primitive.AbstractPrimitiveJavaObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.primitive.DateObjectInspector; -import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; -import org.apache.iceberg.util.DateTimeUtil; - -public final class IcebergDateObjectInspector extends AbstractPrimitiveJavaObjectInspector - implements DateObjectInspector, WriteObjectInspector { - - private static final IcebergDateObjectInspector INSTANCE = new IcebergDateObjectInspector(); - - public static IcebergDateObjectInspector get() { - return INSTANCE; - } - - private IcebergDateObjectInspector() { - super(TypeInfoFactory.dateTypeInfo); - } - - @Override - public Date getPrimitiveJavaObject(Object o) { - return o == null ? null : Date.valueOf((LocalDate) o); - } - - @Override - public DateWritable getPrimitiveWritableObject(Object o) { - return o == null ? null : new DateWritable(DateTimeUtil.daysFromDate((LocalDate) o)); - } - - @Override - public Object copyObject(Object o) { - if (o == null) { - return null; - } - - if (o instanceof Date) { - return new Date(((Date) o).getTime()); - } else if (o instanceof LocalDate) { - return LocalDate.of( - ((LocalDate) o).getYear(), ((LocalDate) o).getMonth(), ((LocalDate) o).getDayOfMonth()); - } else { - return o; - } - } - - @Override - public LocalDate convert(Object o) { - return o == null ? null : ((Date) o).toLocalDate(); - } -} diff --git a/mr/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergDecimalObjectInspector.java b/mr/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergDecimalObjectInspector.java deleted file mode 100644 index 20d52ffa5559..000000000000 --- a/mr/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergDecimalObjectInspector.java +++ /dev/null @@ -1,90 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.mr.hive.serde.objectinspector; - -import com.github.benmanes.caffeine.cache.Cache; -import com.github.benmanes.caffeine.cache.Caffeine; -import java.math.BigDecimal; -import java.util.concurrent.TimeUnit; -import org.apache.hadoop.hive.common.type.HiveDecimal; -import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable; -import org.apache.hadoop.hive.serde2.objectinspector.primitive.AbstractPrimitiveJavaObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveDecimalObjectInspector; -import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; - -public final class IcebergDecimalObjectInspector extends AbstractPrimitiveJavaObjectInspector - implements HiveDecimalObjectInspector, WriteObjectInspector { - - private static final Cache CACHE = - Caffeine.newBuilder().expireAfterAccess(10, TimeUnit.MINUTES).build(); - - public static IcebergDecimalObjectInspector get(int precision, int scale) { - Preconditions.checkArgument(scale < precision); - Preconditions.checkArgument(precision <= HiveDecimal.MAX_PRECISION); - Preconditions.checkArgument(scale <= HiveDecimal.MAX_SCALE); - - Integer key = precision << 8 | scale; - return CACHE.get(key, k -> new IcebergDecimalObjectInspector(precision, scale)); - } - - private IcebergDecimalObjectInspector(int precision, int scale) { - super(new DecimalTypeInfo(precision, scale)); - } - - @Override - public HiveDecimal getPrimitiveJavaObject(Object o) { - return o == null ? null : HiveDecimal.create((BigDecimal) o); - } - - @Override - public HiveDecimalWritable getPrimitiveWritableObject(Object o) { - HiveDecimal decimal = getPrimitiveJavaObject(o); - return decimal == null ? null : new HiveDecimalWritable(decimal); - } - - @Override - public Object copyObject(Object o) { - if (o == null) { - return null; - } - - if (o instanceof HiveDecimal) { - HiveDecimal decimal = (HiveDecimal) o; - return HiveDecimal.create(decimal.bigDecimalValue()); - } else if (o instanceof BigDecimal) { - BigDecimal copy = new BigDecimal(o.toString()); - return copy; - } else { - return o; - } - } - - @Override - public BigDecimal convert(Object o) { - if (o == null) { - return null; - } - - BigDecimal result = ((HiveDecimal) o).bigDecimalValue(); - // during the HiveDecimal to BigDecimal conversion the scale is lost, when the value is 0 - result = result.setScale(scale()); - return result; - } -} diff --git a/mr/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergFixedObjectInspector.java b/mr/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergFixedObjectInspector.java deleted file mode 100644 index 87dbfb1fbd84..000000000000 --- a/mr/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergFixedObjectInspector.java +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.mr.hive.serde.objectinspector; - -import java.util.Arrays; -import org.apache.hadoop.hive.serde2.objectinspector.primitive.AbstractPrimitiveJavaObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.primitive.BinaryObjectInspector; -import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; -import org.apache.hadoop.io.BytesWritable; - -public class IcebergFixedObjectInspector extends AbstractPrimitiveJavaObjectInspector - implements BinaryObjectInspector, WriteObjectInspector { - - private static final IcebergFixedObjectInspector INSTANCE = new IcebergFixedObjectInspector(); - - public static IcebergFixedObjectInspector get() { - return INSTANCE; - } - - private IcebergFixedObjectInspector() { - super(TypeInfoFactory.binaryTypeInfo); - } - - @Override - public byte[] getPrimitiveJavaObject(Object o) { - return (byte[]) o; - } - - @Override - public BytesWritable getPrimitiveWritableObject(Object o) { - return o == null ? null : new BytesWritable(getPrimitiveJavaObject(o)); - } - - @Override - public byte[] convert(Object o) { - return o == null ? null : (byte[]) o; - } - - @Override - public Object copyObject(Object o) { - if (o == null) { - return null; - } - if (o instanceof byte[]) { - byte[] bytes = (byte[]) o; - return Arrays.copyOf(bytes, bytes.length); - } else { - return o; - } - } -} diff --git a/mr/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergObjectInspector.java b/mr/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergObjectInspector.java deleted file mode 100644 index 8be9a586d553..000000000000 --- a/mr/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergObjectInspector.java +++ /dev/null @@ -1,151 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.mr.hive.serde.objectinspector; - -import java.util.List; -import javax.annotation.Nullable; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; -import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; -import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; -import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; -import org.apache.iceberg.Schema; -import org.apache.iceberg.common.DynMethods; -import org.apache.iceberg.hive.HiveVersion; -import org.apache.iceberg.types.Type; -import org.apache.iceberg.types.TypeUtil; -import org.apache.iceberg.types.Types; - -public final class IcebergObjectInspector extends TypeUtil.SchemaVisitor { - - // get the correct inspectors depending on whether we're working with Hive2 or Hive3 dependencies - // we need to do this because there is a breaking API change in Date/TimestampObjectInspector - // between Hive2 and Hive3 - private static final String DATE_INSPECTOR_CLASS = - HiveVersion.min(HiveVersion.HIVE_3) - ? "org.apache.iceberg.mr.hive.serde.objectinspector.IcebergDateObjectInspectorHive3" - : "org.apache.iceberg.mr.hive.serde.objectinspector.IcebergDateObjectInspector"; - - public static final ObjectInspector DATE_INSPECTOR = - DynMethods.builder("get").impl(DATE_INSPECTOR_CLASS).buildStatic().invoke(); - - private static final String TIMESTAMP_INSPECTOR_CLASS = - HiveVersion.min(HiveVersion.HIVE_3) - ? "org.apache.iceberg.mr.hive.serde.objectinspector.IcebergTimestampObjectInspectorHive3" - : "org.apache.iceberg.mr.hive.serde.objectinspector.IcebergTimestampObjectInspector"; - - private static final String TIMESTAMPTZ_INSPECTOR_CLASS = - HiveVersion.min(HiveVersion.HIVE_3) - ? "org.apache.iceberg.mr.hive.serde.objectinspector.IcebergTimestampWithZoneObjectInspectorHive3" - : "org.apache.iceberg.mr.hive.serde.objectinspector.IcebergTimestampWithZoneObjectInspector"; - - public static final ObjectInspector TIMESTAMP_INSPECTOR = - DynMethods.builder("get").impl(TIMESTAMP_INSPECTOR_CLASS).buildStatic().invoke(); - - public static final ObjectInspector TIMESTAMP_INSPECTOR_WITH_TZ = - DynMethods.builder("get").impl(TIMESTAMPTZ_INSPECTOR_CLASS).buildStatic().invoke(); - - public static ObjectInspector create(@Nullable Schema schema) { - if (schema == null) { - return IcebergRecordObjectInspector.empty(); - } - - return TypeUtil.visit(schema, new IcebergObjectInspector()); - } - - public static ObjectInspector create(Types.NestedField... fields) { - return create(new Schema(fields)); - } - - @Override - public ObjectInspector field(Types.NestedField field, ObjectInspector fieldObjectInspector) { - return fieldObjectInspector; - } - - @Override - public ObjectInspector list(Types.ListType listTypeInfo, ObjectInspector listObjectInspector) { - return ObjectInspectorFactory.getStandardListObjectInspector(listObjectInspector); - } - - @Override - public ObjectInspector map( - Types.MapType mapType, - ObjectInspector keyObjectInspector, - ObjectInspector valueObjectInspector) { - return ObjectInspectorFactory.getStandardMapObjectInspector( - keyObjectInspector, valueObjectInspector); - } - - @Override - public ObjectInspector primitive(Type.PrimitiveType primitiveType) { - final PrimitiveTypeInfo primitiveTypeInfo; - - switch (primitiveType.typeId()) { - case BINARY: - return IcebergBinaryObjectInspector.get(); - case BOOLEAN: - primitiveTypeInfo = TypeInfoFactory.booleanTypeInfo; - break; - case DATE: - return DATE_INSPECTOR; - case DECIMAL: - Types.DecimalType type = (Types.DecimalType) primitiveType; - return IcebergDecimalObjectInspector.get(type.precision(), type.scale()); - case DOUBLE: - primitiveTypeInfo = TypeInfoFactory.doubleTypeInfo; - break; - case FIXED: - return IcebergFixedObjectInspector.get(); - case FLOAT: - primitiveTypeInfo = TypeInfoFactory.floatTypeInfo; - break; - case INTEGER: - primitiveTypeInfo = TypeInfoFactory.intTypeInfo; - break; - case LONG: - primitiveTypeInfo = TypeInfoFactory.longTypeInfo; - break; - case STRING: - primitiveTypeInfo = TypeInfoFactory.stringTypeInfo; - break; - case UUID: - return IcebergUUIDObjectInspector.get(); - case TIMESTAMP: - boolean adjustToUTC = ((Types.TimestampType) primitiveType).shouldAdjustToUTC(); - return adjustToUTC ? TIMESTAMP_INSPECTOR_WITH_TZ : TIMESTAMP_INSPECTOR; - case TIME: - return IcebergTimeObjectInspector.get(); - default: - throw new IllegalArgumentException(primitiveType.typeId() + " type is not supported"); - } - - return PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector(primitiveTypeInfo); - } - - @Override - public ObjectInspector schema(Schema schema, ObjectInspector structObjectInspector) { - return structObjectInspector; - } - - @Override - public ObjectInspector struct( - Types.StructType structType, List fieldObjectInspectors) { - return new IcebergRecordObjectInspector(structType, fieldObjectInspectors); - } -} diff --git a/mr/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergRecordObjectInspector.java b/mr/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergRecordObjectInspector.java deleted file mode 100644 index aaa09e51cf23..000000000000 --- a/mr/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergRecordObjectInspector.java +++ /dev/null @@ -1,183 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.mr.hive.serde.objectinspector; - -import java.util.Collections; -import java.util.List; -import java.util.Locale; -import java.util.Objects; -import java.util.stream.Collectors; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; -import org.apache.hadoop.hive.serde2.objectinspector.StructField; -import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.types.Types; - -public final class IcebergRecordObjectInspector extends StructObjectInspector { - - private static final IcebergRecordObjectInspector EMPTY = - new IcebergRecordObjectInspector(Types.StructType.of(), Collections.emptyList()); - - private final List structFields; - - public IcebergRecordObjectInspector( - Types.StructType structType, List objectInspectors) { - Preconditions.checkArgument(structType.fields().size() == objectInspectors.size()); - - this.structFields = Lists.newArrayListWithExpectedSize(structType.fields().size()); - - int position = 0; - - for (Types.NestedField field : structType.fields()) { - ObjectInspector oi = objectInspectors.get(position); - Types.NestedField fieldInLowercase = - Types.NestedField.of( - field.fieldId(), - field.isOptional(), - field.name().toLowerCase(Locale.ROOT), - field.type(), - field.doc()); - IcebergRecordStructField structField = - new IcebergRecordStructField(fieldInLowercase, oi, position); - structFields.add(structField); - position++; - } - } - - public static IcebergRecordObjectInspector empty() { - return EMPTY; - } - - @Override - public List getAllStructFieldRefs() { - return structFields; - } - - @Override - public StructField getStructFieldRef(String name) { - return ObjectInspectorUtils.getStandardStructFieldRef(name, structFields); - } - - @Override - public Object getStructFieldData(Object o, StructField structField) { - if (o == null) { - return null; - } - - return ((Record) o).get(((IcebergRecordStructField) structField).position()); - } - - @Override - public List getStructFieldsDataAsList(Object o) { - if (o == null) { - return null; - } - - Record record = (Record) o; - return structFields.stream().map(f -> record.get(f.position())).collect(Collectors.toList()); - } - - @Override - public String getTypeName() { - return ObjectInspectorUtils.getStandardStructTypeName(this); - } - - @Override - public Category getCategory() { - return Category.STRUCT; - } - - @Override - public boolean equals(Object o) { - if (this == o) { - return true; - } - - if (o == null || getClass() != o.getClass()) { - return false; - } - - IcebergRecordObjectInspector that = (IcebergRecordObjectInspector) o; - return structFields.equals(that.structFields); - } - - @Override - public int hashCode() { - return structFields.hashCode(); - } - - private static class IcebergRecordStructField implements StructField { - - private final Types.NestedField field; - private final ObjectInspector oi; - private final int position; - - IcebergRecordStructField(Types.NestedField field, ObjectInspector oi, int position) { - this.field = field; - this.oi = oi; - this.position = position; // position in the record - } - - @Override - public String getFieldName() { - return field.name(); - } - - @Override - public ObjectInspector getFieldObjectInspector() { - return oi; - } - - @Override - public int getFieldID() { - return field.fieldId(); - } - - @Override - public String getFieldComment() { - return field.doc(); - } - - int position() { - return position; - } - - @Override - public boolean equals(Object o) { - if (this == o) { - return true; - } - - if (o == null || getClass() != o.getClass()) { - return false; - } - - IcebergRecordStructField that = (IcebergRecordStructField) o; - return field.equals(that.field) && oi.equals(that.oi) && position == that.position; - } - - @Override - public int hashCode() { - return Objects.hash(field, oi, position); - } - } -} diff --git a/mr/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergTimeObjectInspector.java b/mr/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergTimeObjectInspector.java deleted file mode 100644 index a2e311489fc3..000000000000 --- a/mr/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergTimeObjectInspector.java +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.mr.hive.serde.objectinspector; - -import java.time.LocalTime; -import org.apache.hadoop.hive.serde2.objectinspector.primitive.AbstractPrimitiveJavaObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector; -import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; -import org.apache.hadoop.io.Text; - -public class IcebergTimeObjectInspector extends AbstractPrimitiveJavaObjectInspector - implements StringObjectInspector, WriteObjectInspector { - - private static final IcebergTimeObjectInspector INSTANCE = new IcebergTimeObjectInspector(); - - private IcebergTimeObjectInspector() { - super(TypeInfoFactory.stringTypeInfo); - } - - public static IcebergTimeObjectInspector get() { - return INSTANCE; - } - - @Override - public String getPrimitiveJavaObject(Object o) { - return o == null ? null : o.toString(); - } - - @Override - public Text getPrimitiveWritableObject(Object o) { - String value = getPrimitiveJavaObject(o); - return value == null ? null : new Text(value); - } - - @Override - public LocalTime convert(Object o) { - return o == null ? null : LocalTime.parse((String) o); - } - - @Override - public Object copyObject(Object o) { - if (o == null) { - return null; - } - - if (o instanceof Text) { - return new Text((Text) o); - } else { - return o; - } - } -} diff --git a/mr/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergTimestampObjectInspector.java b/mr/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergTimestampObjectInspector.java deleted file mode 100644 index 08c74c9afa4a..000000000000 --- a/mr/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergTimestampObjectInspector.java +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.mr.hive.serde.objectinspector; - -import java.sql.Timestamp; -import java.time.LocalDateTime; -import org.apache.hadoop.hive.serde2.io.TimestampWritable; -import org.apache.hadoop.hive.serde2.objectinspector.primitive.AbstractPrimitiveJavaObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.primitive.TimestampObjectInspector; -import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; - -public class IcebergTimestampObjectInspector extends AbstractPrimitiveJavaObjectInspector - implements TimestampObjectInspector, WriteObjectInspector { - - private static final IcebergTimestampObjectInspector INSTANCE = - new IcebergTimestampObjectInspector(); - - public static IcebergTimestampObjectInspector get() { - return INSTANCE; - } - - private IcebergTimestampObjectInspector() { - super(TypeInfoFactory.timestampTypeInfo); - } - - @Override - public LocalDateTime convert(Object o) { - return o == null ? null : ((Timestamp) o).toLocalDateTime(); - } - - @Override - public Timestamp getPrimitiveJavaObject(Object o) { - return o == null ? null : Timestamp.valueOf((LocalDateTime) o); - } - - @Override - public TimestampWritable getPrimitiveWritableObject(Object o) { - Timestamp ts = getPrimitiveJavaObject(o); - return ts == null ? null : new TimestampWritable(ts); - } - - @Override - public Object copyObject(Object o) { - if (o instanceof Timestamp) { - Timestamp ts = (Timestamp) o; - Timestamp copy = new Timestamp(ts.getTime()); - copy.setNanos(ts.getNanos()); - return copy; - } else if (o instanceof LocalDateTime) { - LocalDateTime ldt = (LocalDateTime) o; - return LocalDateTime.of(ldt.toLocalDate(), ldt.toLocalTime()); - } else { - return o; - } - } -} diff --git a/mr/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergTimestampWithZoneObjectInspector.java b/mr/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergTimestampWithZoneObjectInspector.java deleted file mode 100644 index f315b0b6d8ea..000000000000 --- a/mr/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergTimestampWithZoneObjectInspector.java +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.mr.hive.serde.objectinspector; - -import java.sql.Timestamp; -import java.time.OffsetDateTime; -import java.time.ZoneOffset; -import org.apache.hadoop.hive.serde2.io.TimestampWritable; -import org.apache.hadoop.hive.serde2.objectinspector.primitive.AbstractPrimitiveJavaObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.primitive.TimestampObjectInspector; -import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; - -public class IcebergTimestampWithZoneObjectInspector extends AbstractPrimitiveJavaObjectInspector - implements TimestampObjectInspector, WriteObjectInspector { - - private static final IcebergTimestampWithZoneObjectInspector INSTANCE = - new IcebergTimestampWithZoneObjectInspector(); - - public static IcebergTimestampWithZoneObjectInspector get() { - return INSTANCE; - } - - private IcebergTimestampWithZoneObjectInspector() { - super(TypeInfoFactory.timestampTypeInfo); - } - - @Override - public OffsetDateTime convert(Object o) { - return o == null ? null : OffsetDateTime.ofInstant(((Timestamp) o).toInstant(), ZoneOffset.UTC); - } - - @Override - public Timestamp getPrimitiveJavaObject(Object o) { - return o == null ? null : Timestamp.from(((OffsetDateTime) o).toInstant()); - } - - @Override - public TimestampWritable getPrimitiveWritableObject(Object o) { - Timestamp ts = getPrimitiveJavaObject(o); - return ts == null ? null : new TimestampWritable(ts); - } - - @Override - public Object copyObject(Object o) { - if (o instanceof Timestamp) { - Timestamp ts = (Timestamp) o; - Timestamp copy = new Timestamp(ts.getTime()); - copy.setNanos(ts.getNanos()); - return copy; - } else if (o instanceof OffsetDateTime) { - OffsetDateTime odt = (OffsetDateTime) o; - return OffsetDateTime.ofInstant(odt.toInstant(), odt.getOffset()); - } else { - return o; - } - } -} diff --git a/mr/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergUUIDObjectInspector.java b/mr/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergUUIDObjectInspector.java deleted file mode 100644 index 21ac71f72cfb..000000000000 --- a/mr/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergUUIDObjectInspector.java +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.mr.hive.serde.objectinspector; - -import java.util.UUID; -import org.apache.hadoop.hive.serde2.objectinspector.primitive.AbstractPrimitiveJavaObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector; -import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; -import org.apache.hadoop.io.Text; - -public class IcebergUUIDObjectInspector extends AbstractPrimitiveJavaObjectInspector - implements StringObjectInspector, WriteObjectInspector { - - private static final IcebergUUIDObjectInspector INSTANCE = new IcebergUUIDObjectInspector(); - - private IcebergUUIDObjectInspector() { - super(TypeInfoFactory.stringTypeInfo); - } - - public static IcebergUUIDObjectInspector get() { - return INSTANCE; - } - - @Override - public String getPrimitiveJavaObject(Object o) { - return o == null ? null : o.toString(); - } - - @Override - public Text getPrimitiveWritableObject(Object o) { - String value = getPrimitiveJavaObject(o); - return value == null ? null : new Text(value); - } - - @Override - public UUID convert(Object o) { - return o == null ? null : UUID.fromString(o.toString()); - } - - @Override - public Object copyObject(Object o) { - if (o == null) { - return null; - } - - if (o instanceof Text) { - return new Text((Text) o); - } else { - return o; - } - } -} diff --git a/mr/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/WriteObjectInspector.java b/mr/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/WriteObjectInspector.java deleted file mode 100644 index a6e112335fe4..000000000000 --- a/mr/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/WriteObjectInspector.java +++ /dev/null @@ -1,28 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.mr.hive.serde.objectinspector; - -/** - * Interface for converting the Hive primitive objects for the objects which could be added to an - * Iceberg Record. If the IcebergObjectInspector does not implement this then the default Hive - * primitive objects will be used without conversion. - */ -public interface WriteObjectInspector { - Object convert(Object value); -} diff --git a/mr/src/main/java/org/apache/iceberg/mr/mapreduce/IcebergInputFormat.java b/mr/src/main/java/org/apache/iceberg/mr/mapreduce/IcebergInputFormat.java index 492729d97338..1a86fda026c4 100644 --- a/mr/src/main/java/org/apache/iceberg/mr/mapreduce/IcebergInputFormat.java +++ b/mr/src/main/java/org/apache/iceberg/mr/mapreduce/IcebergInputFormat.java @@ -19,12 +19,12 @@ package org.apache.iceberg.mr.mapreduce; import java.io.IOException; +import java.io.Serializable; import java.io.UncheckedIOException; import java.util.Collections; import java.util.Iterator; import java.util.List; import java.util.Map; -import java.util.Optional; import java.util.Set; import java.util.concurrent.ExecutorService; import java.util.function.BiFunction; @@ -50,7 +50,6 @@ import org.apache.iceberg.TableProperties; import org.apache.iceberg.TableScan; import org.apache.iceberg.avro.Avro; -import org.apache.iceberg.common.DynMethods; import org.apache.iceberg.data.DeleteFilter; import org.apache.iceberg.data.GenericDeleteFilter; import org.apache.iceberg.data.IdentityPartitionConverters; @@ -63,7 +62,7 @@ import org.apache.iceberg.expressions.Evaluator; import org.apache.iceberg.expressions.Expression; import org.apache.iceberg.expressions.Expressions; -import org.apache.iceberg.hive.HiveVersion; +import org.apache.iceberg.hadoop.HadoopConfigurable; import org.apache.iceberg.io.CloseableIterable; import org.apache.iceberg.io.CloseableIterator; import org.apache.iceberg.io.FileIO; @@ -71,7 +70,6 @@ import org.apache.iceberg.mapping.NameMappingParser; import org.apache.iceberg.mr.Catalogs; import org.apache.iceberg.mr.InputFormatConfig; -import org.apache.iceberg.mr.hive.HiveIcebergStorageHandler; import org.apache.iceberg.orc.ORC; import org.apache.iceberg.parquet.Parquet; import org.apache.iceberg.relocated.com.google.common.collect.Lists; @@ -103,10 +101,7 @@ public static InputFormatConfig.ConfigBuilder configure(Job job) { @Override public List getSplits(JobContext context) { Configuration conf = context.getConfiguration(); - Table table = - Optional.ofNullable( - HiveIcebergStorageHandler.table(conf, conf.get(InputFormatConfig.TABLE_IDENTIFIER))) - .orElseGet(() -> Catalogs.loadTable(conf)); + Table table = Catalogs.loadTable(conf); final ExecutorService workerPool = ThreadPools.newFixedThreadPool( "iceberg-plan-worker-pool", @@ -157,20 +152,11 @@ private List planInputSplits( } List splits = Lists.newArrayList(); - boolean applyResidual = !conf.getBoolean(InputFormatConfig.SKIP_RESIDUAL_FILTERING, false); - InputFormatConfig.InMemoryDataModel model = - conf.getEnum( - InputFormatConfig.IN_MEMORY_DATA_MODEL, InputFormatConfig.InMemoryDataModel.GENERIC); scan = scan.planWith(workerPool); try (CloseableIterable tasksIterable = scan.planTasks()) { Table serializableTable = SerializableTable.copyOf(table); tasksIterable.forEach( task -> { - if (applyResidual && (model == InputFormatConfig.InMemoryDataModel.HIVE)) { - // TODO: We do not support residual evaluation for HIVE and PIG in memory data model - // yet - checkResiduals(task); - } splits.add(new IcebergSplit(serializableTable, conf, task)); }); } catch (IOException e) { @@ -183,25 +169,49 @@ private List planInputSplits( // wouldn't be able to inject the config into these tasks on the deserializer-side, unlike for // standard queries if (scan instanceof DataTableScan) { - HiveIcebergStorageHandler.checkAndSkipIoConfigSerialization(conf, table); + checkAndSkipIoConfigSerialization(conf, table); } return splits; } - private static void checkResiduals(CombinedScanTask task) { - task.files() - .forEach( - fileScanTask -> { - Expression residual = fileScanTask.residual(); - if (residual != null && !residual.equals(Expressions.alwaysTrue())) { - throw new UnsupportedOperationException( - String.format( - "Filter expression %s is not completely satisfied. Additional rows " - + "can be returned not satisfied by the filter expression", - residual)); - } - }); + /** + * If enabled, it ensures that the FileIO's hadoop configuration will not be serialized. This + * might be desirable for decreasing the overall size of serialized table objects. + * + *

Note: Skipping FileIO config serialization in this fashion might in turn necessitate calling + * {@link #checkAndSetIoConfig(Configuration, Table)} on the deserializer-side to enable + * subsequent use of the FileIO. + * + * @param config Configuration to set for FileIO in a transient manner, if enabled + * @param table The Iceberg table object + */ + private void checkAndSkipIoConfigSerialization(Configuration config, Table table) { + if (table != null + && config.getBoolean( + InputFormatConfig.CONFIG_SERIALIZATION_DISABLED, + InputFormatConfig.CONFIG_SERIALIZATION_DISABLED_DEFAULT) + && table.io() instanceof HadoopConfigurable) { + ((HadoopConfigurable) table.io()) + .serializeConfWith(conf -> new NonSerializingConfig(config)::get); + } + } + + /** + * If enabled, it populates the FileIO's hadoop configuration with the input config object. This + * might be necessary when the table object was serialized without the FileIO config. + * + * @param config Configuration to set for FileIO, if enabled + * @param table The Iceberg table object + */ + private static void checkAndSetIoConfig(Configuration config, Table table) { + if (table != null + && config.getBoolean( + InputFormatConfig.CONFIG_SERIALIZATION_DISABLED, + InputFormatConfig.CONFIG_SERIALIZATION_DISABLED_DEFAULT) + && table.io() instanceof HadoopConfigurable) { + ((HadoopConfigurable) table.io()).setConf(config); + } } @Override @@ -209,27 +219,25 @@ public RecordReader createRecordReader(InputSplit split, TaskAttemptCon return new IcebergRecordReader<>(); } - private static final class IcebergRecordReader extends RecordReader { + private static class NonSerializingConfig implements Serializable { - private static final String HIVE_VECTORIZED_READER_CLASS = - "org.apache.iceberg.mr.hive.vector.HiveVectorizedReader"; - private static final DynMethods.StaticMethod HIVE_VECTORIZED_READER_BUILDER; - - static { - if (HiveVersion.min(HiveVersion.HIVE_3)) { - HIVE_VECTORIZED_READER_BUILDER = - DynMethods.builder("reader") - .impl( - HIVE_VECTORIZED_READER_CLASS, - InputFile.class, - FileScanTask.class, - Map.class, - TaskAttemptContext.class) - .buildStatic(); - } else { - HIVE_VECTORIZED_READER_BUILDER = null; + private final transient Configuration conf; + + NonSerializingConfig(Configuration conf) { + this.conf = conf; + } + + public Configuration get() { + if (conf == null) { + throw new IllegalStateException( + "Configuration was not serialized on purpose but was not set manually either"); } + + return conf; } + } + + private static final class IcebergRecordReader extends RecordReader { private TaskAttemptContext context; private Schema tableSchema; @@ -237,7 +245,6 @@ private static final class IcebergRecordReader extends RecordReader private String nameMapping; private boolean reuseContainers; private boolean caseSensitive; - private InputFormatConfig.InMemoryDataModel inMemoryDataModel; private Iterator tasks; private T current; private CloseableIterator currentIterator; @@ -252,7 +259,7 @@ public void initialize(InputSplit split, TaskAttemptContext newContext) { CombinedScanTask task = ((IcebergSplit) split).task(); this.context = newContext; Table table = ((IcebergSplit) split).table(); - HiveIcebergStorageHandler.checkAndSetIoConfig(conf, table); + checkAndSetIoConfig(conf, table); this.io = table.io(); this.encryptionManager = table.encryption(); this.tasks = task.files().iterator(); @@ -263,9 +270,6 @@ public void initialize(InputSplit split, TaskAttemptContext newContext) { InputFormatConfig.CASE_SENSITIVE, InputFormatConfig.CASE_SENSITIVE_DEFAULT); this.expectedSchema = readSchema(conf, tableSchema, caseSensitive); this.reuseContainers = conf.getBoolean(InputFormatConfig.REUSE_CONTAINERS, false); - this.inMemoryDataModel = - conf.getEnum( - InputFormatConfig.IN_MEMORY_DATA_MODEL, InputFormatConfig.InMemoryDataModel.GENERIC); this.currentIterator = open(tasks.next(), expectedSchema).iterator(); } @@ -343,16 +347,9 @@ private CloseableIterable openTask(FileScanTask currentTask, Schema readSchem @SuppressWarnings("unchecked") private CloseableIterable open(FileScanTask currentTask, Schema readSchema) { - switch (inMemoryDataModel) { - case HIVE: - return openTask(currentTask, readSchema); - case GENERIC: - DeleteFilter deletes = new GenericDeleteFilter(io, currentTask, tableSchema, readSchema); - Schema requiredSchema = deletes.requiredSchema(); - return deletes.filter(openTask(currentTask, requiredSchema)); - default: - throw new UnsupportedOperationException("Unsupported memory model"); - } + DeleteFilter deletes = new GenericDeleteFilter(io, currentTask, tableSchema, readSchema); + Schema requiredSchema = deletes.requiredSchema(); + return deletes.filter(openTask(currentTask, requiredSchema)); } private CloseableIterable applyResidualFiltering( @@ -383,59 +380,36 @@ private CloseableIterable newAvroIterable( avroReadBuilder.withNameMapping(NameMappingParser.fromJson(nameMapping)); } - switch (inMemoryDataModel) { - case HIVE: - // TODO implement value readers for Pig and Hive - throw new UnsupportedOperationException( - "Avro support not yet supported for Pig and Hive"); - case GENERIC: - avroReadBuilder.createReaderFunc( - (expIcebergSchema, expAvroSchema) -> - DataReader.create( - expIcebergSchema, - expAvroSchema, - constantsMap(task, IdentityPartitionConverters::convertConstant))); - } + avroReadBuilder.createReaderFunc( + (expIcebergSchema, expAvroSchema) -> + DataReader.create( + expIcebergSchema, + expAvroSchema, + constantsMap(task, IdentityPartitionConverters::convertConstant))); return applyResidualFiltering(avroReadBuilder.build(), task.residual(), readSchema); } private CloseableIterable newParquetIterable( InputFile inputFile, FileScanTask task, Schema readSchema) { - Map idToConstant = - constantsMap(task, IdentityPartitionConverters::convertConstant); - CloseableIterable parquetIterator = null; - - switch (inMemoryDataModel) { - case HIVE: - if (HiveVersion.min(HiveVersion.HIVE_3)) { - parquetIterator = - HIVE_VECTORIZED_READER_BUILDER.invoke(inputFile, task, idToConstant, context); - } else { - throw new UnsupportedOperationException( - "Vectorized read is unsupported for Hive 2 integration."); - } - break; - case GENERIC: - Parquet.ReadBuilder parquetReadBuilder = - Parquet.read(inputFile) - .project(readSchema) - .filter(task.residual()) - .caseSensitive(caseSensitive) - .split(task.start(), task.length()); - if (reuseContainers) { - parquetReadBuilder.reuseContainers(); - } - if (nameMapping != null) { - parquetReadBuilder.withNameMapping(NameMappingParser.fromJson(nameMapping)); - } - parquetReadBuilder.createReaderFunc( - fileSchema -> - GenericParquetReaders.buildReader( - readSchema, - fileSchema, - constantsMap(task, IdentityPartitionConverters::convertConstant))); - parquetIterator = parquetReadBuilder.build(); + Parquet.ReadBuilder parquetReadBuilder = + Parquet.read(inputFile) + .project(readSchema) + .filter(task.residual()) + .caseSensitive(caseSensitive) + .split(task.start(), task.length()); + if (reuseContainers) { + parquetReadBuilder.reuseContainers(); + } + if (nameMapping != null) { + parquetReadBuilder.withNameMapping(NameMappingParser.fromJson(nameMapping)); } + parquetReadBuilder.createReaderFunc( + fileSchema -> + GenericParquetReaders.buildReader( + readSchema, + fileSchema, + constantsMap(task, IdentityPartitionConverters::convertConstant))); + CloseableIterable parquetIterator = parquetReadBuilder.build(); return applyResidualFiltering(parquetIterator, task.residual(), readSchema); } @@ -447,34 +421,20 @@ private CloseableIterable newOrcIterable( TypeUtil.selectNot( readSchema, Sets.union(idToConstant.keySet(), MetadataColumns.metadataFieldIds())); - CloseableIterable orcIterator = null; // ORC does not support reuse containers yet - switch (inMemoryDataModel) { - case HIVE: - if (HiveVersion.min(HiveVersion.HIVE_3)) { - orcIterator = - HIVE_VECTORIZED_READER_BUILDER.invoke(inputFile, task, idToConstant, context); - } else { - throw new UnsupportedOperationException( - "Vectorized read is unsupported for Hive 2 integration."); - } - break; - case GENERIC: - ORC.ReadBuilder orcReadBuilder = - ORC.read(inputFile) - .project(readSchemaWithoutConstantAndMetadataFields) - .filter(task.residual()) - .caseSensitive(caseSensitive) - .split(task.start(), task.length()); - orcReadBuilder.createReaderFunc( - fileSchema -> GenericOrcReader.buildReader(readSchema, fileSchema, idToConstant)); - - if (nameMapping != null) { - orcReadBuilder.withNameMapping(NameMappingParser.fromJson(nameMapping)); - } - orcIterator = orcReadBuilder.build(); - } + ORC.ReadBuilder orcReadBuilder = + ORC.read(inputFile) + .project(readSchemaWithoutConstantAndMetadataFields) + .filter(task.residual()) + .caseSensitive(caseSensitive) + .split(task.start(), task.length()); + orcReadBuilder.createReaderFunc( + fileSchema -> GenericOrcReader.buildReader(readSchema, fileSchema, idToConstant)); + if (nameMapping != null) { + orcReadBuilder.withNameMapping(NameMappingParser.fromJson(nameMapping)); + } + CloseableIterable orcIterator = orcReadBuilder.build(); return applyResidualFiltering(orcIterator, task.residual(), readSchema); } diff --git a/mr/src/test/java/org/apache/iceberg/mr/TestIcebergInputFormats.java b/mr/src/test/java/org/apache/iceberg/mr/TestIcebergInputFormats.java index 668703cc5d92..ce588a7e83e2 100644 --- a/mr/src/test/java/org/apache/iceberg/mr/TestIcebergInputFormats.java +++ b/mr/src/test/java/org/apache/iceberg/mr/TestIcebergInputFormats.java @@ -20,7 +20,6 @@ import static org.apache.iceberg.types.Types.NestedField.required; import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.assertThatThrownBy; import java.io.File; import java.io.IOException; @@ -203,32 +202,6 @@ public void testResiduals() throws Exception { testInputFormat.create(builder.conf()).validate(writeRecords); } - @TestTemplate - public void testFailedResidualFiltering() throws Exception { - helper.createTable(); - - List expectedRecords = helper.generateRandomRecords(2, 0L); - expectedRecords.get(0).set(2, "2020-03-20"); - expectedRecords.get(1).set(2, "2020-03-20"); - - helper.appendToTable(Row.of("2020-03-20", 0), expectedRecords); - - builder - .useHiveRows() - .filter( - Expressions.and(Expressions.equal("date", "2020-03-20"), Expressions.equal("id", 0))); - - assertThatThrownBy(() -> testInputFormat.create(builder.conf())) - .isInstanceOf(UnsupportedOperationException.class) - .hasMessage( - "Filter expression ref(name=\"id\") == 0 is not completely satisfied. Additional rows can be returned not satisfied by the filter expression"); - - assertThatThrownBy(() -> testInputFormat.create(builder.conf())) - .isInstanceOf(UnsupportedOperationException.class) - .hasMessage( - "Filter expression ref(name=\"id\") == 0 is not completely satisfied. Additional rows can be returned not satisfied by the filter expression"); - } - @TestTemplate public void testProjection() throws Exception { helper.createTable(); diff --git a/mr/src/test/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandlerTestUtils.java b/mr/src/test/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandlerTestUtils.java deleted file mode 100644 index 72b5034051da..000000000000 --- a/mr/src/test/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandlerTestUtils.java +++ /dev/null @@ -1,111 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.mr.hive; - -import static org.apache.iceberg.types.Types.NestedField.optional; - -import java.io.IOException; -import java.nio.file.Path; -import java.util.Collections; -import java.util.List; -import java.util.Map; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.Schema; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.mr.Catalogs; -import org.apache.iceberg.mr.TestHelper; -import org.apache.iceberg.types.Types; -import org.apache.orc.OrcConf; - -public class HiveIcebergStorageHandlerTestUtils { - static final FileFormat[] FILE_FORMATS = - new FileFormat[] {FileFormat.AVRO, FileFormat.ORC, FileFormat.PARQUET}; - - static final Schema CUSTOMER_SCHEMA = - new Schema( - optional(1, "customer_id", Types.LongType.get()), - optional(2, "first_name", Types.StringType.get(), "This is first name"), - optional(3, "last_name", Types.StringType.get(), "This is last name")); - - static final Schema CUSTOMER_SCHEMA_WITH_UPPERCASE = - new Schema( - optional(1, "CustomER_Id", Types.LongType.get()), - optional(2, "First_name", Types.StringType.get()), - optional(3, "Last_name", Types.StringType.get())); - - static final List CUSTOMER_RECORDS = - TestHelper.RecordsBuilder.newInstance(CUSTOMER_SCHEMA) - .add(0L, "Alice", "Brown") - .add(1L, "Bob", "Green") - .add(2L, "Trudy", "Pink") - .build(); - - private HiveIcebergStorageHandlerTestUtils() { - // Empty constructor for the utility class - } - - static TestHiveShell shell() { - return shell(Collections.emptyMap()); - } - - static TestHiveShell shell(Map configs) { - TestHiveShell shell = new TestHiveShell(); - shell.setHiveConfValue("hive.notification.event.poll.interval", "-1"); - shell.setHiveConfValue("hive.tez.exec.print.summary", "true"); - configs.forEach((k, v) -> shell.setHiveConfValue(k, v)); - // We would like to make sure that ORC reading overrides this config, so reading Iceberg tables - // could work in - // systems (like Hive 3.2 and higher) where this value is set to true explicitly. - shell.setHiveConfValue(OrcConf.FORCE_POSITIONAL_EVOLUTION.getHiveConfName(), "true"); - shell.start(); - return shell; - } - - static TestTables testTables( - TestHiveShell shell, TestTables.TestTableType testTableType, Path temp) throws IOException { - return testTables(shell, testTableType, temp, Catalogs.ICEBERG_DEFAULT_CATALOG_NAME); - } - - static TestTables testTables( - TestHiveShell shell, TestTables.TestTableType testTableType, Path temp, String catalogName) - throws IOException { - return testTableType.instance(shell.metastore().hiveConf(), temp, catalogName); - } - - static void init(TestHiveShell shell, TestTables testTables, Path temp, String engine) { - shell.openSession(); - - for (Map.Entry property : testTables.properties().entrySet()) { - shell.setHiveSessionValue(property.getKey(), property.getValue()); - } - - shell.setHiveSessionValue("hive.execution.engine", engine); - shell.setHiveSessionValue("hive.jar.directory", temp.toAbsolutePath().toString()); - shell.setHiveSessionValue("tez.staging-dir", temp.toAbsolutePath().toString()); - } - - static void close(TestHiveShell shell) throws Exception { - shell.closeSession(); - shell.metastore().reset(); - // HiveServer2 thread pools are using thread local Hive -> HMSClient objects. These are not - // cleaned up when the - // HiveServer2 is stopped. Only Finalizer closes the HMS connections. - System.gc(); - } -} diff --git a/mr/src/test/java/org/apache/iceberg/mr/hive/HiveIcebergTestUtils.java b/mr/src/test/java/org/apache/iceberg/mr/hive/HiveIcebergTestUtils.java deleted file mode 100644 index 4e1779411add..000000000000 --- a/mr/src/test/java/org/apache/iceberg/mr/hive/HiveIcebergTestUtils.java +++ /dev/null @@ -1,297 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.mr.hive; - -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.assertj.core.api.Assertions.assertThat; - -import java.io.File; -import java.io.IOException; -import java.math.BigDecimal; -import java.nio.ByteBuffer; -import java.nio.file.Files; -import java.nio.file.Path; -import java.nio.file.Paths; -import java.sql.Timestamp; -import java.time.LocalDate; -import java.time.LocalDateTime; -import java.time.LocalTime; -import java.time.OffsetDateTime; -import java.time.ZoneOffset; -import java.util.Arrays; -import java.util.Comparator; -import java.util.List; -import java.util.UUID; -import java.util.stream.Collectors; -import java.util.stream.Stream; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.hive.common.type.HiveDecimal; -import org.apache.hadoop.hive.serde2.io.DateWritable; -import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable; -import org.apache.hadoop.hive.serde2.io.TimestampWritable; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; -import org.apache.hadoop.hive.serde2.objectinspector.StandardStructObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; -import org.apache.hadoop.io.BooleanWritable; -import org.apache.hadoop.io.BytesWritable; -import org.apache.hadoop.io.DoubleWritable; -import org.apache.hadoop.io.FloatWritable; -import org.apache.hadoop.io.IntWritable; -import org.apache.hadoop.io.LongWritable; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.mapred.JobID; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.data.GenericRecord; -import org.apache.iceberg.data.IcebergGenerics; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.io.CloseableIterable; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.types.Types; -import org.apache.iceberg.util.ByteBuffers; - -public class HiveIcebergTestUtils { - // TODO: Can this be a constant all around the Iceberg tests? - public static final Schema FULL_SCHEMA = - new Schema( - // TODO: Create tests for field case insensitivity. - optional(1, "boolean_type", Types.BooleanType.get()), - optional(2, "integer_type", Types.IntegerType.get()), - optional(3, "long_type", Types.LongType.get()), - optional(4, "float_type", Types.FloatType.get()), - optional(5, "double_type", Types.DoubleType.get()), - optional(6, "date_type", Types.DateType.get()), - optional(7, "tstz", Types.TimestampType.withZone()), - optional(8, "ts", Types.TimestampType.withoutZone()), - optional(9, "string_type", Types.StringType.get()), - optional(10, "fixed_type", Types.FixedType.ofLength(3)), - optional(11, "binary_type", Types.BinaryType.get()), - optional(12, "decimal_type", Types.DecimalType.of(38, 10)), - optional(13, "time_type", Types.TimeType.get()), - optional(14, "uuid_type", Types.UUIDType.get())); - - public static final StandardStructObjectInspector FULL_SCHEMA_OBJECT_INSPECTOR = - ObjectInspectorFactory.getStandardStructObjectInspector( - Arrays.asList( - "boolean_type", - "integer_type", - "long_type", - "float_type", - "double_type", - "date_type", - "tstz", - "ts", - "string_type", - "fixed_type", - "binary_type", - "decimal_type", - "time_type", - "uuid_type"), - Arrays.asList( - PrimitiveObjectInspectorFactory.writableBooleanObjectInspector, - PrimitiveObjectInspectorFactory.writableIntObjectInspector, - PrimitiveObjectInspectorFactory.writableLongObjectInspector, - PrimitiveObjectInspectorFactory.writableFloatObjectInspector, - PrimitiveObjectInspectorFactory.writableDoubleObjectInspector, - PrimitiveObjectInspectorFactory.writableDateObjectInspector, - PrimitiveObjectInspectorFactory.writableTimestampObjectInspector, - PrimitiveObjectInspectorFactory.writableTimestampObjectInspector, - PrimitiveObjectInspectorFactory.writableStringObjectInspector, - PrimitiveObjectInspectorFactory.writableBinaryObjectInspector, - PrimitiveObjectInspectorFactory.writableBinaryObjectInspector, - PrimitiveObjectInspectorFactory.writableHiveDecimalObjectInspector, - PrimitiveObjectInspectorFactory.writableStringObjectInspector, - PrimitiveObjectInspectorFactory.writableStringObjectInspector)); - - private HiveIcebergTestUtils() { - // Empty constructor for the utility class - } - - /** - * Generates a test record where every field has a value. - * - * @return Record with every field set - */ - public static Record getTestRecord() { - Record record = GenericRecord.create(HiveIcebergTestUtils.FULL_SCHEMA); - record.set(0, true); - record.set(1, 1); - record.set(2, 2L); - record.set(3, 3.1f); - record.set(4, 4.2d); - record.set(5, LocalDate.of(2020, 1, 21)); - // Nano is not supported ? - record.set(6, OffsetDateTime.of(2017, 11, 22, 11, 30, 7, 0, ZoneOffset.ofHours(2))); - record.set(7, LocalDateTime.of(2019, 2, 22, 9, 44, 54)); - record.set(8, "kilenc"); - record.set(9, new byte[] {0, 1, 2}); - record.set(10, ByteBuffer.wrap(new byte[] {0, 1, 2, 3})); - record.set(11, new BigDecimal("0.0000000013")); - record.set(12, LocalTime.of(11, 33)); - record.set(13, UUID.fromString("73689599-d7fc-4dfb-b94e-106ff20284a5")); - - return record; - } - - /** - * Record with every field set to null. - * - * @return Empty record - */ - public static Record getNullTestRecord() { - Record record = GenericRecord.create(HiveIcebergTestUtils.FULL_SCHEMA); - - for (int i = 0; i < HiveIcebergTestUtils.FULL_SCHEMA.columns().size(); i++) { - record.set(i, null); - } - - return record; - } - - /** - * Hive values for the test record. - * - * @param record The original Iceberg record - * @return The Hive 'record' containing the same values - */ - public static List valuesForTestRecord(Record record) { - return Arrays.asList( - new BooleanWritable(Boolean.TRUE), - new IntWritable(record.get(1, Integer.class)), - new LongWritable(record.get(2, Long.class)), - new FloatWritable(record.get(3, Float.class)), - new DoubleWritable(record.get(4, Double.class)), - new DateWritable((int) record.get(5, LocalDate.class).toEpochDay()), - new TimestampWritable(Timestamp.from(record.get(6, OffsetDateTime.class).toInstant())), - new TimestampWritable(Timestamp.valueOf(record.get(7, LocalDateTime.class))), - new Text(record.get(8, String.class)), - new BytesWritable(record.get(9, byte[].class)), - new BytesWritable(ByteBuffers.toByteArray(record.get(10, ByteBuffer.class))), - new HiveDecimalWritable(HiveDecimal.create(record.get(11, BigDecimal.class))), - new Text(record.get(12, LocalTime.class).toString()), - new Text(record.get(13, UUID.class).toString())); - } - - /** - * Converts a list of Object arrays to a list of Iceberg records. - * - * @param schema The schema of the Iceberg record - * @param rows The data of the records - * @return The list of the converted records - */ - public static List valueForRow(Schema schema, List rows) { - return rows.stream() - .map( - row -> { - Record record = GenericRecord.create(schema); - for (int i = 0; i < row.length; ++i) { - record.set(i, row[i]); - } - return record; - }) - .collect(Collectors.toList()); - } - - /** - * Check if 2 Iceberg records are the same or not. Compares OffsetDateTimes only by the Intant - * they represent. - * - * @param expected The expected record - * @param actual The actual record - */ - public static void assertEquals(Record expected, Record actual) { - for (int i = 0; i < expected.size(); ++i) { - if (expected.get(i) instanceof OffsetDateTime) { - // For OffsetDateTime we just compare the actual instant - assertThat(((OffsetDateTime) actual.get(i)).toInstant()) - .isEqualTo(((OffsetDateTime) expected.get(i)).toInstant()); - } else { - assertThat(actual.get(i)).isEqualTo(expected.get(i)); - } - } - } - - /** - * Validates whether the table contains the expected records. The results should be sorted by a - * unique key so we do not end up with flaky tests. - * - * @param table The table we should read the records from - * @param expected The expected list of Records - * @param sortBy The column position by which we will sort - * @throws IOException Exceptions when reading the table data - */ - public static void validateData(Table table, List expected, int sortBy) - throws IOException { - // Refresh the table, so we get the new data as well - table.refresh(); - List records = Lists.newArrayListWithExpectedSize(expected.size()); - try (CloseableIterable iterable = IcebergGenerics.read(table).build()) { - iterable.forEach(records::add); - } - - validateData(expected, records, sortBy); - } - - /** - * Validates whether the 2 sets of records are the same. The results should be sorted by a unique - * key so we do not end up with flaky tests. - * - * @param expected The expected list of Records - * @param actual The actual list of Records - * @param sortBy The column position by which we will sort - */ - public static void validateData(List expected, List actual, int sortBy) { - List sortedExpected = Lists.newArrayList(expected); - List sortedActual = Lists.newArrayList(actual); - // Sort based on the specified column - sortedExpected.sort(Comparator.comparingLong(record -> (Long) record.get(sortBy))); - sortedActual.sort(Comparator.comparingLong(record -> (Long) record.get(sortBy))); - - assertThat(sortedActual).hasSameSizeAs(sortedExpected); - for (int i = 0; i < sortedExpected.size(); ++i) { - assertEquals(sortedExpected.get(i), sortedActual.get(i)); - } - } - - /** - * Validates the number of files under a {@link Table} generated by a specific queryId and jobId. - * Validates that the commit files are removed. - * - * @param table The table we are checking - * @param conf The configuration used for generating the job location - * @param jobId The jobId which generated the files - * @param dataFileNum The expected number of data files (TABLE_LOCATION/data/*) - */ - public static void validateFiles(Table table, Configuration conf, JobID jobId, int dataFileNum) - throws IOException { - List dataFiles; - try (Stream files = Files.walk(Paths.get(table.location() + "/data"))) { - dataFiles = - files - .filter(Files::isRegularFile) - .filter(path -> !path.getFileName().toString().startsWith(".")) - .collect(Collectors.toList()); - } - - assertThat(dataFiles).hasSize(dataFileNum); - assertThat( - new File(HiveIcebergOutputCommitter.generateJobLocation(table.location(), conf, jobId))) - .doesNotExist(); - } -} diff --git a/mr/src/test/java/org/apache/iceberg/mr/hive/TestDeserializer.java b/mr/src/test/java/org/apache/iceberg/mr/hive/TestDeserializer.java deleted file mode 100644 index 8f58a36d6265..000000000000 --- a/mr/src/test/java/org/apache/iceberg/mr/hive/TestDeserializer.java +++ /dev/null @@ -1,205 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.mr.hive; - -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assumptions.assumeThat; - -import java.util.Arrays; -import java.util.Collections; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; -import org.apache.hadoop.hive.serde2.objectinspector.StandardStructObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; -import org.apache.hadoop.io.LongWritable; -import org.apache.hadoop.io.MapWritable; -import org.apache.hadoop.io.Text; -import org.apache.iceberg.Schema; -import org.apache.iceberg.data.GenericRecord; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.hive.HiveVersion; -import org.apache.iceberg.mr.hive.serde.objectinspector.IcebergObjectInspector; -import org.apache.iceberg.types.Types; -import org.junit.jupiter.api.Test; - -public class TestDeserializer { - private static final Schema CUSTOMER_SCHEMA = - new Schema( - optional(1, "customer_id", Types.LongType.get()), - optional(2, "first_name", Types.StringType.get())); - - private static final StandardStructObjectInspector CUSTOMER_OBJECT_INSPECTOR = - ObjectInspectorFactory.getStandardStructObjectInspector( - Arrays.asList("customer_id", "first_name"), - Arrays.asList( - PrimitiveObjectInspectorFactory.writableLongObjectInspector, - PrimitiveObjectInspectorFactory.writableStringObjectInspector)); - - @Test - public void testSchemaDeserialize() { - StandardStructObjectInspector schemaObjectInspector = - ObjectInspectorFactory.getStandardStructObjectInspector( - Arrays.asList("0:col1", "1:col2"), - Arrays.asList( - PrimitiveObjectInspectorFactory.writableLongObjectInspector, - PrimitiveObjectInspectorFactory.writableStringObjectInspector)); - - Deserializer deserializer = - new Deserializer.Builder() - .schema(CUSTOMER_SCHEMA) - .writerInspector((StructObjectInspector) IcebergObjectInspector.create(CUSTOMER_SCHEMA)) - .sourceInspector(schemaObjectInspector) - .build(); - - Record expected = GenericRecord.create(CUSTOMER_SCHEMA); - expected.set(0, 1L); - expected.set(1, "Bob"); - - Record actual = deserializer.deserialize(new Object[] {new LongWritable(1L), new Text("Bob")}); - - assertThat(actual).isEqualTo(expected); - } - - @Test - public void testStructDeserialize() { - Deserializer deserializer = - new Deserializer.Builder() - .schema(CUSTOMER_SCHEMA) - .writerInspector((StructObjectInspector) IcebergObjectInspector.create(CUSTOMER_SCHEMA)) - .sourceInspector(CUSTOMER_OBJECT_INSPECTOR) - .build(); - - Record expected = GenericRecord.create(CUSTOMER_SCHEMA); - expected.set(0, 1L); - expected.set(1, "Bob"); - - Record actual = deserializer.deserialize(new Object[] {new LongWritable(1L), new Text("Bob")}); - - assertThat(actual).isEqualTo(expected); - } - - @Test - public void testMapDeserialize() { - Schema schema = - new Schema( - optional( - 1, - "map_type", - Types.MapType.ofOptional(2, 3, Types.LongType.get(), Types.StringType.get()))); - - StructObjectInspector inspector = - ObjectInspectorFactory.getStandardStructObjectInspector( - Arrays.asList("map_type"), - Arrays.asList( - ObjectInspectorFactory.getStandardMapObjectInspector( - PrimitiveObjectInspectorFactory.writableLongObjectInspector, - PrimitiveObjectInspectorFactory.writableStringObjectInspector))); - - Deserializer deserializer = - new Deserializer.Builder() - .schema(schema) - .writerInspector((StructObjectInspector) IcebergObjectInspector.create(schema)) - .sourceInspector(inspector) - .build(); - - Record expected = GenericRecord.create(schema); - expected.set(0, Collections.singletonMap(1L, "Taylor")); - - MapWritable map = new MapWritable(); - map.put(new LongWritable(1L), new Text("Taylor")); - Object[] data = new Object[] {map}; - Record actual = deserializer.deserialize(data); - - assertThat(actual).isEqualTo(expected); - } - - @Test - public void testListDeserialize() { - Schema schema = - new Schema(optional(1, "list_type", Types.ListType.ofOptional(2, Types.LongType.get()))); - - StructObjectInspector inspector = - ObjectInspectorFactory.getStandardStructObjectInspector( - Arrays.asList("list_type"), - Arrays.asList( - ObjectInspectorFactory.getStandardListObjectInspector( - PrimitiveObjectInspectorFactory.writableLongObjectInspector))); - - Deserializer deserializer = - new Deserializer.Builder() - .schema(schema) - .writerInspector((StructObjectInspector) IcebergObjectInspector.create(schema)) - .sourceInspector(inspector) - .build(); - - Record expected = GenericRecord.create(schema); - expected.set(0, Collections.singletonList(1L)); - - Object[] data = new Object[] {new Object[] {new LongWritable(1L)}}; - Record actual = deserializer.deserialize(data); - - assertThat(actual).isEqualTo(expected); - } - - @Test - public void testDeserializeEverySupportedType() { - assumeThat(HiveVersion.min(HiveVersion.HIVE_3)) - .as("No test yet for Hive3 (Date/Timestamp creation)") - .isFalse(); - - Deserializer deserializer = - new Deserializer.Builder() - .schema(HiveIcebergTestUtils.FULL_SCHEMA) - .writerInspector( - (StructObjectInspector) - IcebergObjectInspector.create(HiveIcebergTestUtils.FULL_SCHEMA)) - .sourceInspector(HiveIcebergTestUtils.FULL_SCHEMA_OBJECT_INSPECTOR) - .build(); - - Record expected = HiveIcebergTestUtils.getTestRecord(); - Record actual = deserializer.deserialize(HiveIcebergTestUtils.valuesForTestRecord(expected)); - - HiveIcebergTestUtils.assertEquals(expected, actual); - } - - @Test - public void testNullDeserialize() { - Deserializer deserializer = - new Deserializer.Builder() - .schema(HiveIcebergTestUtils.FULL_SCHEMA) - .writerInspector( - (StructObjectInspector) - IcebergObjectInspector.create(HiveIcebergTestUtils.FULL_SCHEMA)) - .sourceInspector(HiveIcebergTestUtils.FULL_SCHEMA_OBJECT_INSPECTOR) - .build(); - - Record expected = HiveIcebergTestUtils.getNullTestRecord(); - - Object[] nulls = new Object[HiveIcebergTestUtils.FULL_SCHEMA.columns().size()]; - Arrays.fill(nulls, null); - - Record actual = deserializer.deserialize(nulls); - - assertThat(actual).isEqualTo(expected); - - // Check null record as well - assertThat(deserializer.deserialize(null)).isNull(); - } -} diff --git a/mr/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergFilterFactory.java b/mr/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergFilterFactory.java deleted file mode 100644 index 579a4b810c54..000000000000 --- a/mr/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergFilterFactory.java +++ /dev/null @@ -1,370 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.mr.hive; - -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.assertThatThrownBy; - -import java.math.BigDecimal; -import java.sql.Date; -import java.sql.Timestamp; -import java.time.LocalDate; -import java.util.Collections; -import java.util.List; -import org.apache.hadoop.hive.ql.io.sarg.ExpressionTree; -import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf; -import org.apache.hadoop.hive.ql.io.sarg.SearchArgument; -import org.apache.hadoop.hive.ql.io.sarg.SearchArgumentFactory; -import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable; -import org.apache.iceberg.expressions.And; -import org.apache.iceberg.expressions.Expressions; -import org.apache.iceberg.expressions.Literal; -import org.apache.iceberg.expressions.Not; -import org.apache.iceberg.expressions.Or; -import org.apache.iceberg.expressions.UnboundPredicate; -import org.apache.iceberg.types.Types; -import org.apache.iceberg.util.DateTimeUtil; -import org.junit.jupiter.api.Test; - -public class TestHiveIcebergFilterFactory { - - @Test - public void testEqualsOperand() { - SearchArgument.Builder builder = SearchArgumentFactory.newBuilder(); - SearchArgument arg = - builder.startAnd().equals("salary", PredicateLeaf.Type.LONG, 3000L).end().build(); - - UnboundPredicate expected = Expressions.equal("salary", 3000L); - UnboundPredicate actual = - (UnboundPredicate) HiveIcebergFilterFactory.generateFilterExpression(arg); - - assertPredicatesMatch(expected, actual); - } - - @Test - public void testEqualsOperandRewrite() { - SearchArgument.Builder builder = SearchArgumentFactory.newBuilder(); - SearchArgument arg = - builder.startAnd().equals("float", PredicateLeaf.Type.FLOAT, Double.NaN).end().build(); - - UnboundPredicate expected = Expressions.isNaN("float"); - UnboundPredicate actual = - (UnboundPredicate) HiveIcebergFilterFactory.generateFilterExpression(arg); - - assertPredicatesMatch(expected, actual); - } - - @Test - public void testNotEqualsOperand() { - SearchArgument.Builder builder = SearchArgumentFactory.newBuilder(); - SearchArgument arg = - builder.startNot().equals("salary", PredicateLeaf.Type.LONG, 3000L).end().build(); - - Not expected = (Not) Expressions.not(Expressions.equal("salary", 3000L)); - Not actual = (Not) HiveIcebergFilterFactory.generateFilterExpression(arg); - - UnboundPredicate childExpressionActual = (UnboundPredicate) actual.child(); - UnboundPredicate childExpressionExpected = Expressions.equal("salary", 3000L); - - assertThat(expected.op()).isEqualTo(actual.op()); - assertThat(expected.child().op()).isEqualTo(actual.child().op()); - assertThat(childExpressionExpected.ref().name()).isEqualTo(childExpressionActual.ref().name()); - assertThat(childExpressionExpected.literal()).isEqualTo(childExpressionActual.literal()); - } - - @Test - public void testLessThanOperand() { - SearchArgument.Builder builder = SearchArgumentFactory.newBuilder(); - SearchArgument arg = - builder.startAnd().lessThan("salary", PredicateLeaf.Type.LONG, 3000L).end().build(); - - UnboundPredicate expected = Expressions.lessThan("salary", 3000L); - UnboundPredicate actual = - (UnboundPredicate) HiveIcebergFilterFactory.generateFilterExpression(arg); - - assertThat(expected.op()).isEqualTo(actual.op()); - assertThat(expected.literal()).isEqualTo(actual.literal()); - assertThat(expected.ref().name()).isEqualTo(actual.ref().name()); - } - - @Test - public void testLessThanEqualsOperand() { - SearchArgument.Builder builder = SearchArgumentFactory.newBuilder(); - SearchArgument arg = - builder.startAnd().lessThanEquals("salary", PredicateLeaf.Type.LONG, 3000L).end().build(); - - UnboundPredicate expected = Expressions.lessThanOrEqual("salary", 3000L); - UnboundPredicate actual = - (UnboundPredicate) HiveIcebergFilterFactory.generateFilterExpression(arg); - - assertPredicatesMatch(expected, actual); - } - - @Test - public void testInOperand() { - SearchArgument.Builder builder = SearchArgumentFactory.newBuilder(); - SearchArgument arg = - builder.startAnd().in("salary", PredicateLeaf.Type.LONG, 3000L, 4000L).end().build(); - - UnboundPredicate expected = Expressions.in("salary", 3000L, 4000L); - UnboundPredicate actual = - (UnboundPredicate) HiveIcebergFilterFactory.generateFilterExpression(arg); - - assertThat(expected.op()).isEqualTo(actual.op()); - assertThat(expected.literals()).isEqualTo(actual.literals()); - assertThat(expected.ref().name()).isEqualTo(actual.ref().name()); - } - - @Test - public void testBetweenOperand() { - SearchArgument.Builder builder = SearchArgumentFactory.newBuilder(); - SearchArgument arg = - builder.startAnd().between("salary", PredicateLeaf.Type.LONG, 3000L, 4000L).end().build(); - - And expected = - (And) - Expressions.and( - Expressions.greaterThanOrEqual("salary", 3000L), - Expressions.lessThanOrEqual("salary", 3000L)); - And actual = (And) HiveIcebergFilterFactory.generateFilterExpression(arg); - - assertThat(expected.op()).isEqualTo(actual.op()); - assertThat(expected.left().op()).isEqualTo(actual.left().op()); - assertThat(expected.right().op()).isEqualTo(actual.right().op()); - } - - @Test - public void testUnsupportedBetweenOperandEmptyLeaves() { - SearchArgument.Builder builder = SearchArgumentFactory.newBuilder(); - final SearchArgument arg = - new MockSearchArgument( - builder - .startAnd() - .between("salary", PredicateLeaf.Type.LONG, 9000L, 15000L) - .end() - .build()); - assertThatThrownBy(() -> HiveIcebergFilterFactory.generateFilterExpression(arg)) - .isInstanceOf(UnsupportedOperationException.class) - .hasMessage("Missing leaf literals: Leaf[empty]"); - } - - @Test - public void testIsNullOperand() { - SearchArgument.Builder builder = SearchArgumentFactory.newBuilder(); - SearchArgument arg = builder.startAnd().isNull("salary", PredicateLeaf.Type.LONG).end().build(); - - UnboundPredicate expected = Expressions.isNull("salary"); - UnboundPredicate actual = - (UnboundPredicate) HiveIcebergFilterFactory.generateFilterExpression(arg); - - assertThat(expected.op()).isEqualTo(actual.op()); - assertThat(expected.ref().name()).isEqualTo(actual.ref().name()); - } - - @Test - public void testAndOperand() { - SearchArgument.Builder builder = SearchArgumentFactory.newBuilder(); - SearchArgument arg = - builder - .startAnd() - .equals("salary", PredicateLeaf.Type.LONG, 3000L) - .equals("salary", PredicateLeaf.Type.LONG, 4000L) - .end() - .build(); - - And expected = - (And) - Expressions.and(Expressions.equal("salary", 3000L), Expressions.equal("salary", 4000L)); - And actual = (And) HiveIcebergFilterFactory.generateFilterExpression(arg); - - assertThat(expected.op()).isEqualTo(actual.op()); - assertThat(expected.left().op()).isEqualTo(actual.left().op()); - assertThat(expected.right().op()).isEqualTo(actual.right().op()); - } - - @Test - public void testOrOperand() { - SearchArgument.Builder builder = SearchArgumentFactory.newBuilder(); - SearchArgument arg = - builder - .startOr() - .equals("salary", PredicateLeaf.Type.LONG, 3000L) - .equals("salary", PredicateLeaf.Type.LONG, 4000L) - .end() - .build(); - - Or expected = - (Or) Expressions.or(Expressions.equal("salary", 3000L), Expressions.equal("salary", 4000L)); - Or actual = (Or) HiveIcebergFilterFactory.generateFilterExpression(arg); - - assertThat(expected.op()).isEqualTo(actual.op()); - assertThat(expected.left().op()).isEqualTo(actual.left().op()); - assertThat(expected.right().op()).isEqualTo(actual.right().op()); - } - - @Test - public void testStringType() { - SearchArgument.Builder builder = SearchArgumentFactory.newBuilder(); - SearchArgument arg = - builder.startAnd().equals("string", PredicateLeaf.Type.STRING, "Joe").end().build(); - - UnboundPredicate expected = Expressions.equal("string", "Joe"); - UnboundPredicate actual = - (UnboundPredicate) HiveIcebergFilterFactory.generateFilterExpression(arg); - - assertPredicatesMatch(expected, actual); - } - - @Test - public void testFloatType() { - SearchArgument.Builder builder = SearchArgumentFactory.newBuilder(); - SearchArgument arg = - builder.startAnd().equals("float", PredicateLeaf.Type.FLOAT, 1200D).end().build(); - - UnboundPredicate expected = Expressions.equal("float", 1200D); - UnboundPredicate actual = - (UnboundPredicate) HiveIcebergFilterFactory.generateFilterExpression(arg); - - assertPredicatesMatch(expected, actual); - } - - @Test - public void testBooleanType() { - SearchArgument.Builder builder = SearchArgumentFactory.newBuilder(); - SearchArgument arg = - builder.startAnd().equals("boolean", PredicateLeaf.Type.BOOLEAN, true).end().build(); - - UnboundPredicate expected = Expressions.equal("boolean", true); - UnboundPredicate actual = - (UnboundPredicate) HiveIcebergFilterFactory.generateFilterExpression(arg); - - assertPredicatesMatch(expected, actual); - } - - @Test - public void testDateType() { - SearchArgument.Builder builder = SearchArgumentFactory.newBuilder(); - Date gmtDate = Date.valueOf(LocalDate.of(2015, 11, 12)); - SearchArgument arg = - builder.startAnd().equals("date", PredicateLeaf.Type.DATE, gmtDate).end().build(); - - UnboundPredicate expected = - Expressions.equal("date", Literal.of("2015-11-12").to(Types.DateType.get()).value()); - UnboundPredicate actual = - (UnboundPredicate) HiveIcebergFilterFactory.generateFilterExpression(arg); - - assertPredicatesMatch(expected, actual); - } - - @Test - public void testTimestampType() { - Literal timestampLiteral = - Literal.of("2012-10-02T05:16:17.123456").to(Types.TimestampType.withoutZone()); - long timestampMicros = timestampLiteral.value(); - Timestamp ts = Timestamp.valueOf(DateTimeUtil.timestampFromMicros(timestampMicros)); - - SearchArgument.Builder builder = SearchArgumentFactory.newBuilder(); - SearchArgument arg = - builder.startAnd().equals("timestamp", PredicateLeaf.Type.TIMESTAMP, ts).end().build(); - - UnboundPredicate expected = Expressions.equal("timestamp", timestampMicros); - UnboundPredicate actual = - (UnboundPredicate) HiveIcebergFilterFactory.generateFilterExpression(arg); - - assertPredicatesMatch(expected, actual); - } - - @Test - public void testDecimalType() { - SearchArgument.Builder builder = SearchArgumentFactory.newBuilder(); - SearchArgument arg = - builder - .startAnd() - .equals("decimal", PredicateLeaf.Type.DECIMAL, new HiveDecimalWritable("20.12")) - .end() - .build(); - - UnboundPredicate expected = Expressions.equal("decimal", new BigDecimal("20.12")); - UnboundPredicate actual = - (UnboundPredicate) HiveIcebergFilterFactory.generateFilterExpression(arg); - - assertPredicatesMatch(expected, actual); - } - - private void assertPredicatesMatch(UnboundPredicate expected, UnboundPredicate actual) { - assertThat(actual.op()).isEqualTo(expected.op()); - assertThat(actual.literal()).isEqualTo(expected.literal()); - assertThat(actual.ref().name()).isEqualTo(expected.ref().name()); - } - - private static class MockSearchArgument implements SearchArgument { - - private final SearchArgument delegate; - - MockSearchArgument(SearchArgument original) { - delegate = original; - } - - @Override - public ExpressionTree getExpression() { - return delegate.getExpression(); - } - - @Override - public TruthValue evaluate(TruthValue[] leaves) { - return delegate.evaluate(leaves); - } - - @Override - public List getLeaves() { - return Collections.singletonList( - new PredicateLeaf() { - @Override - public Operator getOperator() { - return Operator.BETWEEN; - } - - @Override - public Type getType() { - return Type.LONG; - } - - @Override - public String getColumnName() { - return "salary"; - } - - @Override - public Object getLiteral() { - return null; - } - - @Override - public List getLiteralList() { - return Collections.emptyList(); - } - - @Override - public String toString() { - return "Leaf[empty]"; - } - }); - } - } -} diff --git a/mr/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergOutputCommitter.java b/mr/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergOutputCommitter.java deleted file mode 100644 index 147e0ba4685a..000000000000 --- a/mr/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergOutputCommitter.java +++ /dev/null @@ -1,344 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.mr.hive; - -import static org.apache.iceberg.mr.hive.HiveIcebergRecordWriter.getWriters; -import static org.apache.iceberg.types.Types.NestedField.required; -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.assertThatThrownBy; - -import java.io.IOException; -import java.nio.file.Path; -import java.util.Collections; -import java.util.List; -import java.util.Map; -import java.util.Properties; -import org.apache.hadoop.hive.conf.HiveConf; -import org.apache.hadoop.hive.ql.plan.TableDesc; -import org.apache.hadoop.mapred.JobConf; -import org.apache.hadoop.mapred.JobContextImpl; -import org.apache.hadoop.mapred.JobID; -import org.apache.hadoop.mapred.OutputCommitter; -import org.apache.hadoop.mapred.TaskAttemptContextImpl; -import org.apache.hadoop.mapred.TaskAttemptID; -import org.apache.hadoop.mapreduce.JobStatus; -import org.apache.hadoop.mapreduce.TaskType; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.data.GenericAppenderFactory; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.hadoop.HadoopTables; -import org.apache.iceberg.io.FileIO; -import org.apache.iceberg.io.OutputFileFactory; -import org.apache.iceberg.mr.Catalogs; -import org.apache.iceberg.mr.InputFormatConfig; -import org.apache.iceberg.mr.TestHelper; -import org.apache.iceberg.mr.mapred.Container; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.types.Types; -import org.apache.iceberg.util.SerializationUtil; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.io.TempDir; -import org.mockito.ArgumentCaptor; -import org.mockito.Mockito; - -public class TestHiveIcebergOutputCommitter { - private static final long TARGET_FILE_SIZE = 128 * 1024 * 1024; - private static final int RECORD_NUM = 5; - private static final String QUERY_ID = "query_id"; - private static final JobID JOB_ID = new JobID("test", 0); - private static final TaskAttemptID MAP_TASK_ID = - new TaskAttemptID(JOB_ID.getJtIdentifier(), JOB_ID.getId(), TaskType.MAP, 0, 0); - private static final TaskAttemptID REDUCE_TASK_ID = - new TaskAttemptID(JOB_ID.getJtIdentifier(), JOB_ID.getId(), TaskType.REDUCE, 0, 0); - - private static final Schema CUSTOMER_SCHEMA = - new Schema( - required(1, "customer_id", Types.LongType.get()), - required(2, "first_name", Types.StringType.get())); - - private static final PartitionSpec PARTITIONED_SPEC = - PartitionSpec.builderFor(CUSTOMER_SCHEMA).bucket("customer_id", 3).build(); - - @TempDir private Path temp; - - @Test - public void testNeedsTaskCommit() { - HiveIcebergOutputCommitter committer = new HiveIcebergOutputCommitter(); - - JobConf mapOnlyJobConf = new JobConf(); - mapOnlyJobConf.setNumMapTasks(10); - mapOnlyJobConf.setNumReduceTasks(0); - - // Map only job should commit map tasks - assertThat(committer.needsTaskCommit(new TaskAttemptContextImpl(mapOnlyJobConf, MAP_TASK_ID))) - .isTrue(); - - JobConf mapReduceJobConf = new JobConf(); - mapReduceJobConf.setNumMapTasks(10); - mapReduceJobConf.setNumReduceTasks(10); - - // MapReduce job should not commit map tasks, but should commit reduce tasks - assertThat(committer.needsTaskCommit(new TaskAttemptContextImpl(mapReduceJobConf, MAP_TASK_ID))) - .isFalse(); - assertThat( - committer.needsTaskCommit(new TaskAttemptContextImpl(mapReduceJobConf, REDUCE_TASK_ID))) - .isTrue(); - } - - @Test - public void testSuccessfulUnpartitionedWrite() throws IOException { - HiveIcebergOutputCommitter committer = new HiveIcebergOutputCommitter(); - Table table = table(temp.toFile().getPath(), false); - - JobConf conf = jobConf(table, 1); - List expected = writeRecords(table.name(), 1, 0, true, false, conf); - committer.commitJob(new JobContextImpl(conf, JOB_ID)); - - HiveIcebergTestUtils.validateFiles(table, conf, JOB_ID, 1); - HiveIcebergTestUtils.validateData(table, expected, 0); - } - - @Test - public void testSuccessfulPartitionedWrite() throws IOException { - HiveIcebergOutputCommitter committer = new HiveIcebergOutputCommitter(); - Table table = table(temp.toFile().getPath(), true); - JobConf conf = jobConf(table, 1); - List expected = writeRecords(table.name(), 1, 0, true, false, conf); - committer.commitJob(new JobContextImpl(conf, JOB_ID)); - - HiveIcebergTestUtils.validateFiles(table, conf, JOB_ID, 3); - HiveIcebergTestUtils.validateData(table, expected, 0); - } - - @Test - public void testSuccessfulMultipleTasksUnpartitionedWrite() throws IOException { - HiveIcebergOutputCommitter committer = new HiveIcebergOutputCommitter(); - Table table = table(temp.toFile().getPath(), false); - JobConf conf = jobConf(table, 2); - List expected = writeRecords(table.name(), 2, 0, true, false, conf); - committer.commitJob(new JobContextImpl(conf, JOB_ID)); - - HiveIcebergTestUtils.validateFiles(table, conf, JOB_ID, 2); - HiveIcebergTestUtils.validateData(table, expected, 0); - } - - @Test - public void testSuccessfulMultipleTasksPartitionedWrite() throws IOException { - HiveIcebergOutputCommitter committer = new HiveIcebergOutputCommitter(); - Table table = table(temp.toFile().getPath(), true); - JobConf conf = jobConf(table, 2); - List expected = writeRecords(table.name(), 2, 0, true, false, conf); - committer.commitJob(new JobContextImpl(conf, JOB_ID)); - - HiveIcebergTestUtils.validateFiles(table, conf, JOB_ID, 6); - HiveIcebergTestUtils.validateData(table, expected, 0); - } - - @Test - public void testRetryTask() throws IOException { - HiveIcebergOutputCommitter committer = new HiveIcebergOutputCommitter(); - Table table = table(temp.toFile().getPath(), false); - JobConf conf = jobConf(table, 2); - - // Write records and abort the tasks - writeRecords(table.name(), 2, 0, false, true, conf); - HiveIcebergTestUtils.validateFiles(table, conf, JOB_ID, 0); - HiveIcebergTestUtils.validateData(table, Collections.emptyList(), 0); - - // Write records but do not abort the tasks - // The data files remain since we can not identify them but should not be read - writeRecords(table.name(), 2, 1, false, false, conf); - HiveIcebergTestUtils.validateFiles(table, conf, JOB_ID, 2); - HiveIcebergTestUtils.validateData(table, Collections.emptyList(), 0); - - // Write and commit the records - List expected = writeRecords(table.name(), 2, 2, true, false, conf); - committer.commitJob(new JobContextImpl(conf, JOB_ID)); - HiveIcebergTestUtils.validateFiles(table, conf, JOB_ID, 4); - HiveIcebergTestUtils.validateData(table, expected, 0); - } - - @Test - public void testAbortJob() throws IOException { - HiveIcebergOutputCommitter committer = new HiveIcebergOutputCommitter(); - Table table = table(temp.toFile().getPath(), false); - JobConf conf = jobConf(table, 1); - writeRecords(table.name(), 1, 0, true, false, conf); - committer.abortJob(new JobContextImpl(conf, JOB_ID), JobStatus.State.FAILED); - - HiveIcebergTestUtils.validateFiles(table, conf, JOB_ID, 0); - HiveIcebergTestUtils.validateData(table, Collections.emptyList(), 0); - } - - @Test - public void writerIsClosedAfterTaskCommitFailure() throws IOException { - HiveIcebergOutputCommitter committer = new HiveIcebergOutputCommitter(); - HiveIcebergOutputCommitter failingCommitter = Mockito.spy(committer); - ArgumentCaptor argumentCaptor = - ArgumentCaptor.forClass(TaskAttemptContextImpl.class); - String exceptionMessage = "Commit task failed!"; - Mockito.doThrow(new RuntimeException(exceptionMessage)) - .when(failingCommitter) - .commitTask(argumentCaptor.capture()); - - Table table = table(temp.toFile().getPath(), false); - JobConf conf = jobConf(table, 1); - - assertThatThrownBy(() -> writeRecords(table.name(), 1, 0, true, false, conf, failingCommitter)) - .isInstanceOf(RuntimeException.class) - .hasMessage(exceptionMessage); - - assertThat(argumentCaptor.getAllValues()).hasSize(1); - TaskAttemptID capturedId = - TezUtil.taskAttemptWrapper(argumentCaptor.getValue().getTaskAttemptID()); - // writer is still in the map after commitTask failure - assertThat(getWriters(capturedId)).isNotNull(); - failingCommitter.abortTask(new TaskAttemptContextImpl(conf, capturedId)); - // abortTask succeeds and removes writer - assertThat(getWriters(capturedId)).isNull(); - } - - private Table table(String location, boolean partitioned) { - HadoopTables tables = new HadoopTables(); - - return tables.create( - CUSTOMER_SCHEMA, - partitioned ? PARTITIONED_SPEC : PartitionSpec.unpartitioned(), - ImmutableMap.of(InputFormatConfig.CATALOG_NAME, Catalogs.ICEBERG_HADOOP_TABLE_NAME), - location); - } - - private JobConf jobConf(Table table, int taskNum) { - JobConf conf = new JobConf(); - conf.setNumMapTasks(taskNum); - conf.setNumReduceTasks(0); - conf.set(HiveConf.ConfVars.HIVEQUERYID.varname, QUERY_ID); - conf.set(InputFormatConfig.OUTPUT_TABLES, table.name()); - conf.set( - InputFormatConfig.TABLE_CATALOG_PREFIX + table.name(), - table.properties().get(InputFormatConfig.CATALOG_NAME)); - conf.set( - InputFormatConfig.SERIALIZED_TABLE_PREFIX + table.name(), - SerializationUtil.serializeToBase64(table)); - - Map propMap = Maps.newHashMap(); - TableDesc tableDesc = new TableDesc(); - tableDesc.setProperties(new Properties()); - tableDesc.getProperties().setProperty(Catalogs.NAME, table.name()); - tableDesc.getProperties().setProperty(Catalogs.LOCATION, table.location()); - tableDesc - .getProperties() - .setProperty( - InputFormatConfig.CATALOG_NAME, table.properties().get(InputFormatConfig.CATALOG_NAME)); - HiveIcebergStorageHandler.overlayTableProperties(conf, tableDesc, propMap); - propMap.forEach((key, value) -> conf.set(key, value)); - return conf; - } - - /** - * Write random records to the given table using separate {@link HiveIcebergOutputCommitter} and a - * separate {@link HiveIcebergRecordWriter} for every task. - * - * @param name The name of the table to get the table object from the conf - * @param taskNum The number of tasks in the job handled by the committer - * @param attemptNum The id used for attempt number generation - * @param commitTasks If true the tasks will be committed - * @param abortTasks If true the tasks will be aborted - needed so we can simulate no - * commit/no abort situation - * @param conf The job configuration - * @param committer The output committer that should be used for committing/aborting the tasks - * @return The random generated records which were appended to the table - * @throws IOException Propagating {@link HiveIcebergRecordWriter} exceptions - */ - private List writeRecords( - String name, - int taskNum, - int attemptNum, - boolean commitTasks, - boolean abortTasks, - JobConf conf, - OutputCommitter committer) - throws IOException { - List expected = Lists.newArrayListWithExpectedSize(RECORD_NUM * taskNum); - - Table table = HiveIcebergStorageHandler.table(conf, name); - FileIO io = table.io(); - Schema schema = HiveIcebergStorageHandler.schema(conf); - PartitionSpec spec = table.spec(); - - for (int i = 0; i < taskNum; ++i) { - List records = TestHelper.generateRandomRecords(schema, RECORD_NUM, i + attemptNum); - TaskAttemptID taskId = - new TaskAttemptID(JOB_ID.getJtIdentifier(), JOB_ID.getId(), TaskType.MAP, i, attemptNum); - int partitionId = taskId.getTaskID().getId(); - String operationId = QUERY_ID + "-" + JOB_ID; - FileFormat fileFormat = FileFormat.PARQUET; - OutputFileFactory outputFileFactory = - OutputFileFactory.builderFor(table, partitionId, attemptNum) - .format(fileFormat) - .operationId(operationId) - .build(); - HiveIcebergRecordWriter testWriter = - new HiveIcebergRecordWriter( - schema, - spec, - fileFormat, - new GenericAppenderFactory(schema), - outputFileFactory, - io, - TARGET_FILE_SIZE, - TezUtil.taskAttemptWrapper(taskId), - conf.get(Catalogs.NAME)); - - Container container = new Container<>(); - - for (Record record : records) { - container.set(record); - testWriter.write(container); - } - - testWriter.close(false); - if (commitTasks) { - committer.commitTask(new TaskAttemptContextImpl(conf, taskId)); - expected.addAll(records); - } else if (abortTasks) { - committer.abortTask(new TaskAttemptContextImpl(conf, taskId)); - } - } - - return expected; - } - - private List writeRecords( - String name, - int taskNum, - int attemptNum, - boolean commitTasks, - boolean abortTasks, - JobConf conf) - throws IOException { - return writeRecords( - name, taskNum, attemptNum, commitTasks, abortTasks, conf, new HiveIcebergOutputCommitter()); - } -} diff --git a/mr/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergSerDe.java b/mr/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergSerDe.java deleted file mode 100644 index 3ca39c9fec36..000000000000 --- a/mr/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergSerDe.java +++ /dev/null @@ -1,79 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.mr.hive; - -import static org.apache.iceberg.types.Types.NestedField.required; -import static org.assertj.core.api.Assertions.assertThat; - -import java.io.File; -import java.io.IOException; -import java.nio.file.Path; -import java.util.Properties; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.hive.serde2.SerDeException; -import org.apache.iceberg.Schema; -import org.apache.iceberg.data.RandomGenericData; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.hadoop.HadoopTables; -import org.apache.iceberg.mr.Catalogs; -import org.apache.iceberg.mr.InputFormatConfig; -import org.apache.iceberg.mr.hive.serde.objectinspector.IcebergObjectInspector; -import org.apache.iceberg.mr.mapred.Container; -import org.apache.iceberg.types.Types; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.io.TempDir; - -public class TestHiveIcebergSerDe { - - private static final Schema SCHEMA = - new Schema(required(1, "string_field", Types.StringType.get())); - - @TempDir private Path tmp; - - @Test - public void testInitialize() throws IOException, SerDeException { - File location = tmp.toFile(); - assertThat(location.delete()).isTrue(); - - Configuration conf = new Configuration(); - - Properties properties = new Properties(); - properties.setProperty("location", location.toString()); - properties.setProperty(InputFormatConfig.CATALOG_NAME, Catalogs.ICEBERG_HADOOP_TABLE_NAME); - - HadoopTables tables = new HadoopTables(conf); - tables.create(SCHEMA, location.toString()); - - HiveIcebergSerDe serDe = new HiveIcebergSerDe(); - serDe.initialize(conf, properties); - - assertThat(serDe.getObjectInspector()).isEqualTo(IcebergObjectInspector.create(SCHEMA)); - } - - @Test - public void testDeserialize() { - HiveIcebergSerDe serDe = new HiveIcebergSerDe(); - - Record record = RandomGenericData.generate(SCHEMA, 1, 0).get(0); - Container container = new Container<>(); - container.set(record); - - assertThat(serDe.deserialize(container)).isEqualTo(record); - } -} diff --git a/mr/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergStorageHandlerLocalScan.java b/mr/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergStorageHandlerLocalScan.java deleted file mode 100644 index 9018d4518cbf..000000000000 --- a/mr/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergStorageHandlerLocalScan.java +++ /dev/null @@ -1,789 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.mr.hive; - -import static org.apache.iceberg.types.Types.NestedField.required; -import static org.assertj.core.api.Assertions.assertThat; - -import java.io.IOException; -import java.math.BigDecimal; -import java.nio.file.Path; -import java.util.Collection; -import java.util.Collections; -import java.util.List; -import java.util.Map; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.Parameter; -import org.apache.iceberg.ParameterizedTestExtension; -import org.apache.iceberg.Parameters; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.PartitionSpecParser; -import org.apache.iceberg.Schema; -import org.apache.iceberg.SchemaParser; -import org.apache.iceberg.StructLike; -import org.apache.iceberg.TestHelpers.Row; -import org.apache.iceberg.catalog.TableIdentifier; -import org.apache.iceberg.data.GenericRecord; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.mr.InputFormatConfig; -import org.apache.iceberg.mr.TestHelper; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.types.Types; -import org.junit.jupiter.api.AfterAll; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeAll; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.TestTemplate; -import org.junit.jupiter.api.extension.ExtendWith; -import org.junit.jupiter.api.io.TempDir; - -@ExtendWith(ParameterizedTestExtension.class) -public class TestHiveIcebergStorageHandlerLocalScan { - - @Parameters(name = "fileFormat={0}, catalog={1}") - public static Collection parameters() { - Collection testParams = Lists.newArrayList(); - - // Run tests with every FileFormat for a single Catalog (HiveCatalog) - for (FileFormat fileFormat : HiveIcebergStorageHandlerTestUtils.FILE_FORMATS) { - testParams.add(new Object[] {fileFormat, TestTables.TestTableType.HIVE_CATALOG}); - } - - // Run tests for every Catalog for a single FileFormat (PARQUET) - skip HiveCatalog tests as - // they are added before - for (TestTables.TestTableType testTableType : TestTables.ALL_TABLE_TYPES) { - if (!TestTables.TestTableType.HIVE_CATALOG.equals(testTableType)) { - testParams.add(new Object[] {FileFormat.PARQUET, testTableType}); - } - } - - return testParams; - } - - private static TestHiveShell shell; - - private TestTables testTables; - - @Parameter(index = 0) - private FileFormat fileFormat; - - @Parameter(index = 1) - private TestTables.TestTableType testTableType; - - @TempDir private Path temp; - - @BeforeAll - public static void beforeClass() { - shell = HiveIcebergStorageHandlerTestUtils.shell(); - } - - @AfterAll - public static void afterClass() throws Exception { - shell.stop(); - } - - @BeforeEach - public void before() throws IOException { - testTables = HiveIcebergStorageHandlerTestUtils.testTables(shell, testTableType, temp); - // Uses spark as an engine so we can detect if we unintentionally try to use any execution - // engines - HiveIcebergStorageHandlerTestUtils.init(shell, testTables, temp, "spark"); - } - - @AfterEach - public void after() throws Exception { - HiveIcebergStorageHandlerTestUtils.close(shell); - } - - @TestTemplate - public void testScanEmptyTable() throws IOException { - Schema emptySchema = new Schema(required(1, "empty", Types.StringType.get())); - testTables.createTable(shell, "empty", emptySchema, fileFormat, ImmutableList.of()); - - List rows = shell.executeStatement("SELECT * FROM default.empty"); - assertThat(rows).isEmpty(); - } - - @TestTemplate - public void testScanTable() throws IOException { - testTables.createTable( - shell, - "customers", - HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, - fileFormat, - HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS); - - // Single fetch task: no MR job. - List rows = shell.executeStatement("SELECT * FROM default.customers"); - - assertThat(rows) - .containsExactly( - new Object[] {0L, "Alice", "Brown"}, - new Object[] {1L, "Bob", "Green"}, - new Object[] {2L, "Trudy", "Pink"}); - } - - @TestTemplate - public void testScanTableCaseInsensitive() throws IOException { - testTables.createTable( - shell, - "customers", - HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA_WITH_UPPERCASE, - fileFormat, - HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS); - - List rows = shell.executeStatement("SELECT * FROM default.customers"); - - assertThat(rows) - .containsExactly( - new Object[] {0L, "Alice", "Brown"}, - new Object[] {1L, "Bob", "Green"}, - new Object[] {2L, "Trudy", "Pink"}); - - rows = - shell.executeStatement( - "SELECT * FROM default.customers where CustomER_Id < 2 " - + "and first_name in ('Alice', 'Bob')"); - - assertThat(rows) - .containsExactly(new Object[] {0L, "Alice", "Brown"}, new Object[] {1L, "Bob", "Green"}); - } - - @TestTemplate - public void testDecimalTableWithPredicateLiterals() throws IOException { - Schema schema = new Schema(required(1, "decimal_field", Types.DecimalType.of(7, 2))); - List records = - TestHelper.RecordsBuilder.newInstance(schema) - .add(new BigDecimal("85.00")) - .add(new BigDecimal("100.56")) - .add(new BigDecimal("100.57")) - .build(); - testTables.createTable(shell, "dec_test", schema, fileFormat, records); - - // Use integer literal in predicate - List rows = - shell.executeStatement("SELECT * FROM default.dec_test where decimal_field >= 85"); - assertThat(rows) - .containsExactly(new Object[] {"85.00"}, new Object[] {"100.56"}, new Object[] {"100.57"}); - - // Use decimal literal in predicate with smaller scale than schema type definition - rows = shell.executeStatement("SELECT * FROM default.dec_test where decimal_field > 99.1"); - assertThat(rows).containsExactly(new Object[] {"100.56"}, new Object[] {"100.57"}); - - // Use decimal literal in predicate with higher scale than schema type definition - rows = shell.executeStatement("SELECT * FROM default.dec_test where decimal_field > 100.565"); - assertThat(rows).containsExactly(new Object[] {"100.57"}); - - // Use decimal literal in predicate with the same scale as schema type definition - rows = shell.executeStatement("SELECT * FROM default.dec_test where decimal_field > 640.34"); - assertThat(rows).isEmpty(); - } - - @TestTemplate - public void testColumnSelection() throws IOException { - testTables.createTable( - shell, - "customers", - HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, - fileFormat, - HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS); - - List outOfOrderColumns = - shell.executeStatement("SELECT first_name, customer_id, last_name FROM default.customers"); - - assertThat(outOfOrderColumns) - .containsExactly( - new Object[] {"Alice", 0L, "Brown"}, - new Object[] {"Bob", 1L, "Green"}, - new Object[] {"Trudy", 2L, "Pink"}); - - List allButFirstColumn = - shell.executeStatement("SELECT first_name, last_name FROM default.customers"); - - assertThat(allButFirstColumn) - .containsExactly( - new Object[] {"Alice", "Brown"}, - new Object[] {"Bob", "Green"}, - new Object[] {"Trudy", "Pink"}); - - List allButMiddleColumn = - shell.executeStatement("SELECT customer_id, last_name FROM default.customers"); - - assertThat(allButMiddleColumn) - .containsExactly( - new Object[] {0L, "Brown"}, new Object[] {1L, "Green"}, new Object[] {2L, "Pink"}); - - List allButLastColumn = - shell.executeStatement("SELECT customer_id, first_name FROM default.customers"); - - assertThat(allButLastColumn) - .containsExactly( - new Object[] {0L, "Alice"}, new Object[] {1L, "Bob"}, new Object[] {2L, "Trudy"}); - } - - @TestTemplate - public void selectSameColumnTwice() throws IOException { - testTables.createTable( - shell, - "customers", - HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, - fileFormat, - HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS); - - List columns = - shell.executeStatement("SELECT first_name, first_name FROM default.customers"); - - assertThat(columns) - .containsExactly( - new Object[] {"Alice", "Alice"}, - new Object[] {"Bob", "Bob"}, - new Object[] {"Trudy", "Trudy"}); - } - - @TestTemplate - public void testCreateTableWithColumnSpecification() throws IOException { - TableIdentifier identifier = TableIdentifier.of("default", "customers"); - Map> data = Maps.newHashMapWithExpectedSize(1); - data.put(null, HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS); - String createSql = - "CREATE EXTERNAL TABLE " - + identifier - + " (customer_id BIGINT, first_name STRING COMMENT 'This is first name', " - + "last_name STRING COMMENT 'This is last name')" - + " STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler' " - + testTables.locationForCreateTableSQL(identifier) - + testTables.propertiesForCreateTableSQL(ImmutableMap.of()); - runCreateAndReadTest( - identifier, - createSql, - HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, - PartitionSpec.unpartitioned(), - data); - } - - @TestTemplate - public void testCreateTableWithColumnSpecificationPartitioned() throws IOException { - TableIdentifier identifier = TableIdentifier.of("default", "customers"); - PartitionSpec spec = - PartitionSpec.builderFor(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA) - .identity("last_name") - .build(); - Map> data = - ImmutableMap.of( - Row.of("Brown"), - Collections.singletonList( - HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS.get(0)), - Row.of("Green"), - Collections.singletonList( - HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS.get(1)), - Row.of("Pink"), - Collections.singletonList( - HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS.get(2))); - String createSql = - "CREATE EXTERNAL TABLE " - + identifier - + " (customer_id BIGINT, first_name STRING COMMENT 'This is first name') " - + "PARTITIONED BY (last_name STRING COMMENT 'This is last name') STORED BY " - + "'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler' " - + testTables.locationForCreateTableSQL(identifier) - + testTables.propertiesForCreateTableSQL(ImmutableMap.of()); - runCreateAndReadTest( - identifier, createSql, HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, spec, data); - } - - @TestTemplate - public void testCreatePartitionedTableByProperty() throws IOException { - TableIdentifier identifier = TableIdentifier.of("default", "customers"); - PartitionSpec spec = - PartitionSpec.builderFor(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA) - .identity("last_name") - .build(); - Map> data = - ImmutableMap.of( - Row.of("Brown"), - Collections.singletonList( - HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS.get(0)), - Row.of("Green"), - Collections.singletonList( - HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS.get(1)), - Row.of("Pink"), - Collections.singletonList( - HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS.get(2))); - String createSql = - "CREATE EXTERNAL TABLE " - + identifier - + " STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler' " - + testTables.locationForCreateTableSQL(identifier) - + "TBLPROPERTIES ('" - + InputFormatConfig.PARTITION_SPEC - + "'='" - + PartitionSpecParser.toJson(spec) - + "', " - + "'" - + InputFormatConfig.TABLE_SCHEMA - + "'='" - + SchemaParser.toJson(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA) - + "', " - + "'" - + InputFormatConfig.CATALOG_NAME - + "'='" - + testTables.catalogName() - + "')"; - runCreateAndReadTest( - identifier, createSql, HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, spec, data); - } - - @TestTemplate - public void testCreateTableWithColumnSpecificationMultilevelPartitioned() throws IOException { - TableIdentifier identifier = TableIdentifier.of("default", "customers"); - PartitionSpec spec = - PartitionSpec.builderFor(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA) - .identity("first_name") - .identity("last_name") - .build(); - Map> data = - ImmutableMap.of( - Row.of("Alice", "Brown"), - Collections.singletonList( - HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS.get(0)), - Row.of("Bob", "Green"), - Collections.singletonList( - HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS.get(1)), - Row.of("Trudy", "Pink"), - Collections.singletonList( - HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS.get(2))); - String createSql = - "CREATE EXTERNAL TABLE " - + identifier - + " (customer_id BIGINT) " - + "PARTITIONED BY (first_name STRING COMMENT 'This is first name', " - + "last_name STRING COMMENT 'This is last name') " - + "STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler' " - + testTables.locationForCreateTableSQL(identifier) - + testTables.propertiesForCreateTableSQL(ImmutableMap.of()); - runCreateAndReadTest( - identifier, createSql, HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, spec, data); - } - - @TestTemplate - public void testArrayOfPrimitivesInTable() throws IOException { - Schema schema = - new Schema( - required( - 1, "arrayofprimitives", Types.ListType.ofRequired(2, Types.IntegerType.get()))); - List records = - testTables.createTableWithGeneratedRecords(shell, "arraytable", schema, fileFormat, 1); - // access a single element from the array - for (int i = 0; i < records.size(); i++) { - List expectedList = (List) records.get(i).getField("arrayofprimitives"); - for (int j = 0; j < expectedList.size(); j++) { - List queryResult = - shell.executeStatement( - String.format( - "SELECT arrayofprimitives[%d] FROM default.arraytable " + "LIMIT 1 OFFSET %d", - j, i)); - assertThat(queryResult.get(0)[0]).isEqualTo(expectedList.get(j)); - } - } - } - - @TestTemplate - public void testArrayOfArraysInTable() throws IOException { - Schema schema = - new Schema( - required( - 1, - "arrayofarrays", - Types.ListType.ofRequired(2, Types.ListType.ofRequired(3, Types.DateType.get())))); - List records = - testTables.createTableWithGeneratedRecords(shell, "arraytable", schema, fileFormat, 1); - // access an element from a matrix - for (int i = 0; i < records.size(); i++) { - List expectedList = (List) records.get(i).getField("arrayofarrays"); - for (int j = 0; j < expectedList.size(); j++) { - List expectedInnerList = (List) expectedList.get(j); - for (int k = 0; k < expectedInnerList.size(); k++) { - List queryResult = - shell.executeStatement( - String.format( - "SELECT arrayofarrays[%d][%d] FROM default.arraytable " + "LIMIT 1 OFFSET %d", - j, k, i)); - assertThat(queryResult.get(0)[0]).isEqualTo(expectedInnerList.get(k).toString()); - } - } - } - } - - @TestTemplate - public void testArrayOfMapsInTable() throws IOException { - Schema schema = - new Schema( - required( - 1, - "arrayofmaps", - Types.ListType.ofRequired( - 2, - Types.MapType.ofRequired( - 3, 4, Types.StringType.get(), Types.BooleanType.get())))); - List records = - testTables.createTableWithGeneratedRecords(shell, "arraytable", schema, fileFormat, 1); - // access an element from a map in an array - for (int i = 0; i < records.size(); i++) { - List expectedList = (List) records.get(i).getField("arrayofmaps"); - for (int j = 0; j < expectedList.size(); j++) { - Map expectedMap = (Map) expectedList.get(j); - for (Map.Entry entry : expectedMap.entrySet()) { - List queryResult = - shell.executeStatement( - String.format( - "SELECT arrayofmaps[%d][\"%s\"] FROM default.arraytable LIMIT 1 OFFSET %d", - j, entry.getKey(), i)); - assertThat(queryResult.get(0)[0]).isEqualTo(entry.getValue()); - } - } - } - } - - @TestTemplate - public void testArrayOfStructsInTable() throws IOException { - Schema schema = - new Schema( - required( - 1, - "arrayofstructs", - Types.ListType.ofRequired( - 2, - Types.StructType.of( - required(3, "something", Types.DoubleType.get()), - required(4, "someone", Types.LongType.get()), - required(5, "somewhere", Types.StringType.get()))))); - List records = - testTables.createTableWithGeneratedRecords(shell, "arraytable", schema, fileFormat, 1); - // access an element from a struct in an array - for (int i = 0; i < records.size(); i++) { - List expectedList = (List) records.get(i).getField("arrayofstructs"); - for (int j = 0; j < expectedList.size(); j++) { - List queryResult = - shell.executeStatement( - String.format( - "SELECT arrayofstructs[%d].something, " - + "arrayofstructs[%d].someone, arrayofstructs[%d].somewhere FROM default.arraytable LIMIT 1 " - + "OFFSET %d", - j, j, j, i)); - GenericRecord genericRecord = (GenericRecord) expectedList.get(j); - assertThat(queryResult.get(0)) - .containsExactly( - genericRecord.getField("something"), - genericRecord.getField("someone"), - genericRecord.getField("somewhere")); - } - } - } - - @TestTemplate - public void testMapOfPrimitivesInTable() throws IOException { - Schema schema = - new Schema( - required( - 1, - "mapofprimitives", - Types.MapType.ofRequired(2, 3, Types.StringType.get(), Types.IntegerType.get()))); - List records = - testTables.createTableWithGeneratedRecords(shell, "maptable", schema, fileFormat, 1); - // access a single value from the map - for (int i = 0; i < records.size(); i++) { - Map expectedMap = (Map) records.get(i).getField("mapofprimitives"); - for (Map.Entry entry : expectedMap.entrySet()) { - List queryResult = - shell.executeStatement( - String.format( - "SELECT mapofprimitives[\"%s\"] " + "FROM default.maptable LIMIT 1 OFFSET %d", - entry.getKey(), i)); - assertThat(queryResult.get(0)[0]).isEqualTo(entry.getValue()); - } - } - } - - @TestTemplate - public void testMapOfArraysInTable() throws IOException { - Schema schema = - new Schema( - required( - 1, - "mapofarrays", - Types.MapType.ofRequired( - 2, - 3, - Types.StringType.get(), - Types.ListType.ofRequired(4, Types.DateType.get())))); - List records = - testTables.createTableWithGeneratedRecords(shell, "maptable", schema, fileFormat, 1); - // access a single element from a list in a map - for (int i = 0; i < records.size(); i++) { - Map expectedMap = (Map) records.get(i).getField("mapofarrays"); - for (Map.Entry entry : expectedMap.entrySet()) { - List expectedList = (List) entry.getValue(); - for (int j = 0; j < expectedList.size(); j++) { - List queryResult = - shell.executeStatement( - String.format( - "SELECT mapofarrays[\"%s\"]" + "[%d] FROM maptable LIMIT 1 OFFSET %d", - entry.getKey(), j, i)); - assertThat(queryResult.get(0)[0]).isEqualTo(expectedList.get(j).toString()); - } - } - } - } - - @TestTemplate - public void testMapOfMapsInTable() throws IOException { - Schema schema = - new Schema( - required( - 1, - "mapofmaps", - Types.MapType.ofRequired( - 2, - 3, - Types.StringType.get(), - Types.MapType.ofRequired( - 4, 5, Types.StringType.get(), Types.StringType.get())))); - List records = - testTables.createTableWithGeneratedRecords(shell, "maptable", schema, fileFormat, 1); - // access a single element from a map in a map - for (int i = 0; i < records.size(); i++) { - Map expectedMap = (Map) records.get(i).getField("mapofmaps"); - for (Map.Entry entry : expectedMap.entrySet()) { - Map expectedInnerMap = (Map) entry.getValue(); - for (Map.Entry innerEntry : expectedInnerMap.entrySet()) { - List queryResult = - shell.executeStatement( - String.format( - "SELECT mapofmaps[\"%s\"]" + "[\"%s\"] FROM maptable LIMIT 1 OFFSET %d", - entry.getKey(), innerEntry.getKey(), i)); - assertThat(queryResult.get(0)[0]).isEqualTo(innerEntry.getValue()); - } - } - } - } - - @TestTemplate - public void testMapOfStructsInTable() throws IOException { - Schema schema = - new Schema( - required( - 1, - "mapofstructs", - Types.MapType.ofRequired( - 2, - 3, - Types.StringType.get(), - Types.StructType.of( - required(4, "something", Types.DoubleType.get()), - required(5, "someone", Types.LongType.get()), - required(6, "somewhere", Types.StringType.get()))))); - List records = - testTables.createTableWithGeneratedRecords(shell, "maptable", schema, fileFormat, 1); - // access a single element from a struct in a map - for (int i = 0; i < records.size(); i++) { - Map expectedMap = (Map) records.get(i).getField("mapofstructs"); - for (Map.Entry entry : expectedMap.entrySet()) { - List queryResult = - shell.executeStatement( - String.format( - "SELECT mapofstructs[\"%s\"].something, " - + "mapofstructs[\"%s\"].someone, mapofstructs[\"%s\"].somewhere FROM default.maptable LIMIT 1 " - + "OFFSET %d", - entry.getKey(), entry.getKey(), entry.getKey(), i)); - GenericRecord genericRecord = (GenericRecord) entry.getValue(); - assertThat(queryResult.get(0)) - .containsExactly( - genericRecord.getField("something"), - genericRecord.getField("someone"), - genericRecord.getField("somewhere")); - } - } - } - - @TestTemplate - public void testStructOfPrimitivesInTable() throws IOException { - Schema schema = - new Schema( - required( - 1, - "structofprimitives", - Types.StructType.of( - required(2, "key", Types.StringType.get()), - required(3, "value", Types.IntegerType.get())))); - List records = - testTables.createTableWithGeneratedRecords(shell, "structtable", schema, fileFormat, 1); - // access a single value in a struct - for (int i = 0; i < records.size(); i++) { - GenericRecord expectedStruct = (GenericRecord) records.get(i).getField("structofprimitives"); - List queryResult = - shell.executeStatement( - String.format( - "SELECT structofprimitives.key, structofprimitives.value FROM default.structtable LIMIT 1 OFFSET %d", - i)); - assertThat(queryResult.get(0)) - .containsExactly(expectedStruct.getField("key"), expectedStruct.getField("value")); - } - } - - @TestTemplate - public void testStructOfArraysInTable() throws IOException { - Schema schema = - new Schema( - required( - 1, - "structofarrays", - Types.StructType.of( - required(2, "names", Types.ListType.ofRequired(3, Types.StringType.get())), - required(4, "birthdays", Types.ListType.ofRequired(5, Types.DateType.get()))))); - List records = - testTables.createTableWithGeneratedRecords(shell, "structtable", schema, fileFormat, 1); - // access an element of an array inside a struct - for (int i = 0; i < records.size(); i++) { - GenericRecord expectedStruct = (GenericRecord) records.get(i).getField("structofarrays"); - List expectedList = (List) expectedStruct.getField("names"); - for (int j = 0; j < expectedList.size(); j++) { - List queryResult = - shell.executeStatement( - String.format( - "SELECT structofarrays.names[%d] FROM default.structtable LIMIT 1 OFFSET %d", - j, i)); - assertThat(queryResult.get(0)[0]).isEqualTo(expectedList.get(j)); - } - expectedList = (List) expectedStruct.getField("birthdays"); - for (int j = 0; j < expectedList.size(); j++) { - List queryResult = - shell.executeStatement( - String.format( - "SELECT structofarrays.birthdays[%d] FROM default.structtable LIMIT 1 OFFSET %d", - j, i)); - assertThat(queryResult.get(0)[0]).isEqualTo(expectedList.get(j).toString()); - } - } - } - - @TestTemplate - public void testStructOfMapsInTable() throws IOException { - Schema schema = - new Schema( - required( - 1, - "structofmaps", - Types.StructType.of( - required( - 2, - "map1", - Types.MapType.ofRequired( - 3, 4, Types.StringType.get(), Types.StringType.get())), - required( - 5, - "map2", - Types.MapType.ofRequired( - 6, 7, Types.StringType.get(), Types.IntegerType.get()))))); - List records = - testTables.createTableWithGeneratedRecords(shell, "structtable", schema, fileFormat, 1); - // access a map entry inside a struct - for (int i = 0; i < records.size(); i++) { - GenericRecord expectedStruct = (GenericRecord) records.get(i).getField("structofmaps"); - Map expectedMap = (Map) expectedStruct.getField("map1"); - for (Map.Entry entry : expectedMap.entrySet()) { - List queryResult = - shell.executeStatement( - String.format( - "SELECT structofmaps.map1[\"%s\"] from default.structtable LIMIT 1 OFFSET %d", - entry.getKey(), i)); - assertThat(queryResult.get(0)[0]).isEqualTo(entry.getValue()); - } - expectedMap = (Map) expectedStruct.getField("map2"); - for (Map.Entry entry : expectedMap.entrySet()) { - List queryResult = - shell.executeStatement( - String.format( - "SELECT structofmaps.map2[\"%s\"] from default.structtable LIMIT 1 OFFSET %d", - entry.getKey(), i)); - assertThat(queryResult.get(0)[0]).isEqualTo(entry.getValue()); - } - } - } - - @TestTemplate - public void testStructOfStructsInTable() throws IOException { - Schema schema = - new Schema( - required( - 1, - "structofstructs", - Types.StructType.of( - required( - 2, - "struct1", - Types.StructType.of( - required(3, "key", Types.StringType.get()), - required(4, "value", Types.IntegerType.get())))))); - List records = - testTables.createTableWithGeneratedRecords(shell, "structtable", schema, fileFormat, 1); - // access a struct element inside a struct - for (int i = 0; i < records.size(); i++) { - GenericRecord expectedStruct = (GenericRecord) records.get(i).getField("structofstructs"); - GenericRecord expectedInnerStruct = (GenericRecord) expectedStruct.getField("struct1"); - List queryResult = - shell.executeStatement( - String.format( - "SELECT structofstructs.struct1.key, structofstructs.struct1.value FROM default.structtable " - + "LIMIT 1 OFFSET %d", - i)); - assertThat(queryResult.get(0)) - .containsExactly( - expectedInnerStruct.getField("key"), expectedInnerStruct.getField("value")); - } - } - - private void runCreateAndReadTest( - TableIdentifier identifier, - String createSQL, - Schema expectedSchema, - PartitionSpec expectedSpec, - Map> data) - throws IOException { - shell.executeStatement(createSQL); - - org.apache.iceberg.Table icebergTable = testTables.loadTable(identifier); - assertThat(icebergTable.schema().asStruct()).isEqualTo(expectedSchema.asStruct()); - assertThat(icebergTable.spec()).isEqualTo(expectedSpec); - - List expected = Lists.newArrayList(); - for (StructLike partition : data.keySet()) { - testTables.appendIcebergTable( - shell.getHiveConf(), icebergTable, fileFormat, partition, data.get(partition)); - expected.addAll(data.get(partition)); - } - - List descRows = shell.executeStatement("SELECT * FROM " + identifier); - List records = HiveIcebergTestUtils.valueForRow(icebergTable.schema(), descRows); - - HiveIcebergTestUtils.validateData(expected, records, 0); - } -} diff --git a/mr/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergStorageHandlerNoScan.java b/mr/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergStorageHandlerNoScan.java deleted file mode 100644 index 6a297e4913e4..000000000000 --- a/mr/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergStorageHandlerNoScan.java +++ /dev/null @@ -1,959 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.mr.hive; - -import static org.apache.iceberg.TableProperties.GC_ENABLED; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.assertThatThrownBy; -import static org.assertj.core.api.Assumptions.assumeThat; - -import java.io.IOException; -import java.util.Collection; -import java.util.Collections; -import java.util.List; -import java.util.Locale; -import java.util.Map; -import java.util.Properties; -import java.util.Set; -import java.util.stream.Collectors; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hive.common.StatsSetupConst; -import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants; -import org.apache.iceberg.BaseMetastoreTableOperations; -import org.apache.iceberg.BaseTable; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.Parameter; -import org.apache.iceberg.ParameterizedTestExtension; -import org.apache.iceberg.Parameters; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.PartitionSpecParser; -import org.apache.iceberg.Schema; -import org.apache.iceberg.SchemaParser; -import org.apache.iceberg.SnapshotSummary; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.catalog.TableIdentifier; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.exceptions.NoSuchTableException; -import org.apache.iceberg.hadoop.Util; -import org.apache.iceberg.hive.HiveSchemaUtil; -import org.apache.iceberg.hive.HiveVersion; -import org.apache.iceberg.mr.Catalogs; -import org.apache.iceberg.mr.InputFormatConfig; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.types.Type; -import org.apache.iceberg.types.Types; -import org.apache.thrift.TException; -import org.junit.jupiter.api.AfterAll; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeAll; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.TestTemplate; -import org.junit.jupiter.api.extension.ExtendWith; -import org.junit.jupiter.api.io.TempDir; - -@ExtendWith(ParameterizedTestExtension.class) -public class TestHiveIcebergStorageHandlerNoScan { - private static final PartitionSpec SPEC = PartitionSpec.unpartitioned(); - - private static final Schema COMPLEX_SCHEMA = - new Schema( - optional(1, "id", Types.LongType.get()), - optional(2, "name", Types.StringType.get()), - optional( - 3, - "employee_info", - Types.StructType.of( - optional(7, "employer", Types.StringType.get()), - optional(8, "id", Types.LongType.get()), - optional(9, "address", Types.StringType.get()))), - optional( - 4, - "places_lived", - Types.ListType.ofOptional( - 10, - Types.StructType.of( - optional(11, "street", Types.StringType.get()), - optional(12, "city", Types.StringType.get()), - optional(13, "country", Types.StringType.get())))), - optional( - 5, - "memorable_moments", - Types.MapType.ofOptional( - 14, - 15, - Types.StringType.get(), - Types.StructType.of( - optional(16, "year", Types.IntegerType.get()), - optional(17, "place", Types.StringType.get()), - optional(18, "details", Types.StringType.get())))), - optional( - 6, - "current_address", - Types.StructType.of( - optional( - 19, - "street_address", - Types.StructType.of( - optional(22, "street_number", Types.IntegerType.get()), - optional(23, "street_name", Types.StringType.get()), - optional(24, "street_type", Types.StringType.get()))), - optional(20, "country", Types.StringType.get()), - optional(21, "postal_code", Types.StringType.get())))); - - private static final Set IGNORED_PARAMS = - ImmutableSet.of("bucketing_version", "numFilesErasureCoded"); - - @Parameters(name = "catalog={0}") - public static Collection parameters() { - Collection testParams = Lists.newArrayList(); - for (TestTables.TestTableType testTableType : TestTables.ALL_TABLE_TYPES) { - testParams.add(new Object[] {testTableType}); - } - - return testParams; - } - - private static TestHiveShell shell; - - private TestTables testTables; - - @Parameter private TestTables.TestTableType testTableType; - - @TempDir private java.nio.file.Path temp; - - @BeforeAll - public static void beforeClass() { - shell = HiveIcebergStorageHandlerTestUtils.shell(); - } - - @AfterAll - public static void afterClass() throws Exception { - shell.stop(); - } - - @BeforeEach - public void before() throws IOException { - testTables = HiveIcebergStorageHandlerTestUtils.testTables(shell, testTableType, temp); - // Uses spark as an engine so we can detect if we unintentionally try to use any execution - // engines - HiveIcebergStorageHandlerTestUtils.init(shell, testTables, temp, "spark"); - } - - @AfterEach - public void after() throws Exception { - HiveIcebergStorageHandlerTestUtils.close(shell); - } - - @TestTemplate - public void testCreateDropTable() throws TException, IOException, InterruptedException { - TableIdentifier identifier = TableIdentifier.of("default", "customers"); - - shell.executeStatement( - "CREATE EXTERNAL TABLE customers " - + "STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler' " - + testTables.locationForCreateTableSQL(identifier) - + "TBLPROPERTIES ('" - + InputFormatConfig.TABLE_SCHEMA - + "'='" - + SchemaParser.toJson(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA) - + "', " - + "'" - + InputFormatConfig.PARTITION_SPEC - + "'='" - + PartitionSpecParser.toJson(PartitionSpec.unpartitioned()) - + "', " - + "'dummy'='test', " - + "'" - + InputFormatConfig.CATALOG_NAME - + "'='" - + testTables.catalogName() - + "')"); - - // Check the Iceberg table data - org.apache.iceberg.Table icebergTable = testTables.loadTable(identifier); - assertThat(icebergTable.schema().asStruct()) - .isEqualTo(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA.asStruct()); - assertThat(icebergTable.spec()).isEqualTo(PartitionSpec.unpartitioned()); - - org.apache.hadoop.hive.metastore.api.Table hmsTable = - shell.metastore().getTable("default", "customers"); - Properties tableProperties = new Properties(); - hmsTable.getParameters().entrySet().stream() - .filter(e -> !IGNORED_PARAMS.contains(e.getKey())) - .forEach(e -> tableProperties.put(e.getKey(), e.getValue())); - if (!Catalogs.hiveCatalog(shell.getHiveConf(), tableProperties)) { - shell.executeStatement("DROP TABLE customers"); - - // Check if the table was really dropped even from the Catalog - assertThatThrownBy(() -> testTables.loadTable(identifier)) - .isInstanceOf(NoSuchTableException.class) - .hasMessageStartingWith("Table does not exist"); - } else { - Path hmsTableLocation = new Path(hmsTable.getSd().getLocation()); - - // Drop the table - shell.executeStatement("DROP TABLE customers"); - - // Check if we drop an exception when trying to load the table - assertThatThrownBy(() -> testTables.loadTable(identifier)) - .isInstanceOf(NoSuchTableException.class) - .hasMessage("Table does not exist: default.customers"); - // Check if the files are removed - FileSystem fs = Util.getFs(hmsTableLocation, shell.getHiveConf()); - if (fs.exists(hmsTableLocation)) { - // if table directory has been deleted, we're good. This is the expected behavior in Hive4. - // if table directory exists, its contents should have been cleaned up, save for an empty - // metadata dir (Hive3). - assertThat(fs.listStatus(hmsTableLocation)).hasSize(1); - assertThat(fs.listStatus(new Path(hmsTableLocation, "metadata"))).isEmpty(); - } - } - } - - @TestTemplate - public void testCreateDropTableNonDefaultCatalog() throws TException, InterruptedException { - TableIdentifier identifier = TableIdentifier.of("default", "customers"); - String catalogName = "nondefaultcatalog"; - testTables - .properties() - .entrySet() - .forEach( - e -> - shell.setHiveSessionValue( - e.getKey().replace(testTables.catalog, catalogName), e.getValue())); - String createSql = - "CREATE EXTERNAL TABLE " - + identifier - + " (customer_id BIGINT, first_name STRING COMMENT 'This is first name'," - + " last_name STRING COMMENT 'This is last name')" - + " STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler' " - + testTables.locationForCreateTableSQL(identifier) - + testTables.propertiesForCreateTableSQL(ImmutableMap.of()); - shell.executeStatement(createSql); - - Table icebergTable = testTables.loadTable(identifier); - assertThat(icebergTable.schema().asStruct()) - .isEqualTo(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA.asStruct()); - - shell.executeStatement("DROP TABLE default.customers"); - // Check if the table was really dropped even from the Catalog - assertThatThrownBy(() -> testTables.loadTable(identifier)) - .isInstanceOf(NoSuchTableException.class) - .hasMessageStartingWith("Table does not exist"); - } - - @TestTemplate - public void testCreateTableWithoutSpec() { - TableIdentifier identifier = TableIdentifier.of("default", "customers"); - - shell.executeStatement( - "CREATE EXTERNAL TABLE customers " - + "STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler' " - + testTables.locationForCreateTableSQL(identifier) - + "TBLPROPERTIES ('" - + InputFormatConfig.TABLE_SCHEMA - + "'='" - + SchemaParser.toJson(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA) - + "','" - + InputFormatConfig.CATALOG_NAME - + "'='" - + testTables.catalogName() - + "')"); - - // Check the Iceberg table partition data - org.apache.iceberg.Table icebergTable = testTables.loadTable(identifier); - assertThat(icebergTable.spec()).isEqualTo(PartitionSpec.unpartitioned()); - } - - @TestTemplate - public void testCreateTableWithUnpartitionedSpec() { - TableIdentifier identifier = TableIdentifier.of("default", "customers"); - // We need the location for HadoopTable based tests only - shell.executeStatement( - "CREATE EXTERNAL TABLE customers " - + "STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler' " - + testTables.locationForCreateTableSQL(identifier) - + "TBLPROPERTIES ('" - + InputFormatConfig.TABLE_SCHEMA - + "'='" - + SchemaParser.toJson(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA) - + "', " - + "'" - + InputFormatConfig.PARTITION_SPEC - + "'='" - + PartitionSpecParser.toJson(PartitionSpec.unpartitioned()) - + "', " - + "'" - + InputFormatConfig.CATALOG_NAME - + "'='" - + testTables.catalogName() - + "')"); - - // Check the Iceberg table partition data - org.apache.iceberg.Table icebergTable = testTables.loadTable(identifier); - assertThat(icebergTable.spec()).isEqualTo(SPEC); - } - - @TestTemplate - public void testCreateTableWithFormatV2ThroughTableProperty() { - TableIdentifier identifier = TableIdentifier.of("default", "customers"); - // We need the location for HadoopTable based tests only - shell.executeStatement( - "CREATE EXTERNAL TABLE customers " - + "STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler' " - + testTables.locationForCreateTableSQL(identifier) - + "TBLPROPERTIES ('" - + InputFormatConfig.TABLE_SCHEMA - + "'='" - + SchemaParser.toJson(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA) - + "', " - + "'" - + InputFormatConfig.PARTITION_SPEC - + "'='" - + PartitionSpecParser.toJson(PartitionSpec.unpartitioned()) - + "', " - + "'" - + InputFormatConfig.CATALOG_NAME - + "'='" - + testTables.catalogName() - + "', " - + "'" - + TableProperties.FORMAT_VERSION - + "'='" - + 2 - + "')"); - - // Check the Iceberg table partition data - org.apache.iceberg.Table icebergTable = testTables.loadTable(identifier); - assertThat(((BaseTable) icebergTable).operations().current().formatVersion()) - .as("should create table using format v2") - .isEqualTo(2); - } - - @TestTemplate - public void testDeleteBackingTable() throws TException, IOException, InterruptedException { - TableIdentifier identifier = TableIdentifier.of("default", "customers"); - - shell.executeStatement( - "CREATE EXTERNAL TABLE customers " - + "STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler' " - + testTables.locationForCreateTableSQL(identifier) - + "TBLPROPERTIES ('" - + InputFormatConfig.TABLE_SCHEMA - + "'='" - + SchemaParser.toJson(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA) - + "', " - + "'" - + InputFormatConfig.EXTERNAL_TABLE_PURGE - + "'='FALSE', " - + "'" - + InputFormatConfig.CATALOG_NAME - + "'='" - + testTables.catalogName() - + "')"); - - org.apache.hadoop.hive.metastore.api.Table hmsTable = - shell.metastore().getTable("default", "customers"); - Properties tableProperties = new Properties(); - hmsTable.getParameters().entrySet().stream() - .filter(e -> !IGNORED_PARAMS.contains(e.getKey())) - .forEach(e -> tableProperties.put(e.getKey(), e.getValue())); - if (!Catalogs.hiveCatalog(shell.getHiveConf(), tableProperties)) { - shell.executeStatement("DROP TABLE customers"); - - // Check if the table remains - testTables.loadTable(identifier); - } else { - // Check the HMS table parameters - Path hmsTableLocation = new Path(hmsTable.getSd().getLocation()); - - // Drop the table - shell.executeStatement("DROP TABLE customers"); - - // Check if we drop an exception when trying to drop the table - assertThatThrownBy(() -> testTables.loadTable(identifier)) - .isInstanceOf(NoSuchTableException.class) - .hasMessage("Table does not exist: default.customers"); - - // Check if the files are kept - FileSystem fs = Util.getFs(hmsTableLocation, shell.getHiveConf()); - assertThat(fs.listStatus(hmsTableLocation)).hasSize(1); - assertThat(fs.listStatus(new Path(hmsTableLocation, "metadata"))).hasSize(1); - } - } - - @TestTemplate - public void testDropTableWithCorruptedMetadata() - throws TException, IOException, InterruptedException { - assumeThat(testTableType) - .as("Only HiveCatalog attempts to load the Iceberg table prior to dropping it.") - .isEqualTo(TestTables.TestTableType.HIVE_CATALOG); - - // create test table - TableIdentifier identifier = TableIdentifier.of("default", "customers"); - testTables.createTable( - shell, - identifier.name(), - HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, - FileFormat.PARQUET, - ImmutableList.of()); - - // enable data purging (this should set external.table.purge=true on the HMS table) - Table table = testTables.loadTable(identifier); - table.updateProperties().set(GC_ENABLED, "true").commit(); - - // delete its current snapshot file (i.e. corrupt the metadata to make the Iceberg table - // unloadable) - String metadataLocation = - shell - .metastore() - .getTable(identifier) - .getParameters() - .get(BaseMetastoreTableOperations.METADATA_LOCATION_PROP); - table.io().deleteFile(metadataLocation); - - // check if HMS table is nonetheless still droppable - shell.executeStatement(String.format("DROP TABLE %s", identifier)); - assertThatThrownBy(() -> testTables.loadTable(identifier)) - .isInstanceOf(NoSuchTableException.class) - .hasMessage("Table does not exist: default.customers"); - } - - @TestTemplate - public void testCreateTableError() { - TableIdentifier identifier = TableIdentifier.of("default", "withShell2"); - - // Wrong schema - assertThatThrownBy( - () -> - shell.executeStatement( - "CREATE EXTERNAL TABLE withShell2 " - + "STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler' " - + testTables.locationForCreateTableSQL(identifier) - + "TBLPROPERTIES ('" - + InputFormatConfig.TABLE_SCHEMA - + "'='WrongSchema'" - + ",'" - + InputFormatConfig.CATALOG_NAME - + "'='" - + testTables.catalogName() - + "')")) - .isInstanceOf(IllegalArgumentException.class) - .hasMessageStartingWith("Failed to execute Hive query") - .hasMessageContaining("Unrecognized token 'WrongSchema'"); - - // Missing schema, we try to get the schema from the table and fail - assertThatThrownBy( - () -> - shell.executeStatement( - "CREATE EXTERNAL TABLE withShell2 " - + "STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler' " - + testTables.locationForCreateTableSQL(identifier) - + testTables.propertiesForCreateTableSQL(ImmutableMap.of()))) - .isInstanceOf(IllegalArgumentException.class) - .hasMessageStartingWith("Failed to execute Hive query") - .hasMessageContaining("Please provide an existing table or a valid schema"); - - if (!testTables.locationForCreateTableSQL(identifier).isEmpty()) { - // Only test this if the location is required - assertThatThrownBy( - () -> - shell.executeStatement( - "CREATE EXTERNAL TABLE withShell2 " - + "STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler' " - + "TBLPROPERTIES ('" - + InputFormatConfig.TABLE_SCHEMA - + "'='" - + SchemaParser.toJson(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA) - + "','" - + InputFormatConfig.CATALOG_NAME - + "'='" - + testTables.catalogName() - + "')")) - .isInstanceOf(IllegalArgumentException.class) - .hasMessageStartingWith("Failed to execute Hive query") - .hasMessageEndingWith("Table location not set"); - } - } - - @TestTemplate - public void testCreateTableAboveExistingTable() throws IOException { - // Create the Iceberg table - testTables.createIcebergTable( - shell.getHiveConf(), - "customers", - COMPLEX_SCHEMA, - FileFormat.PARQUET, - Collections.emptyList()); - - if (testTableType == TestTables.TestTableType.HIVE_CATALOG) { - // In HiveCatalog we just expect an exception since the table is already exists - assertThatThrownBy( - () -> - shell.executeStatement( - "CREATE EXTERNAL TABLE customers " - + "STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler' " - + "TBLPROPERTIES ('" - + InputFormatConfig.TABLE_SCHEMA - + "'='" - + SchemaParser.toJson(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA) - + "',' " - + InputFormatConfig.CATALOG_NAME - + "'='" - + testTables.catalogName() - + "')")) - .isInstanceOf(IllegalArgumentException.class) - .hasMessageStartingWith("Failed to execute Hive query") - .hasMessageContaining("customers already exists"); - } else { - // With other catalogs, table creation should succeed - shell.executeStatement( - "CREATE EXTERNAL TABLE customers " - + "STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler' " - + testTables.locationForCreateTableSQL(TableIdentifier.of("default", "customers")) - + testTables.propertiesForCreateTableSQL(ImmutableMap.of())); - } - } - - @TestTemplate - public void testCreatePartitionedTableWithPropertiesAndWithColumnSpecification() { - PartitionSpec spec = - PartitionSpec.builderFor(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA) - .identity("last_name") - .build(); - - assertThatThrownBy( - () -> - shell.executeStatement( - "CREATE EXTERNAL TABLE customers (customer_id BIGINT) " - + "PARTITIONED BY (first_name STRING) " - + "STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler' " - + testTables.locationForCreateTableSQL( - TableIdentifier.of("default", "customers")) - + " TBLPROPERTIES ('" - + InputFormatConfig.PARTITION_SPEC - + "'='" - + PartitionSpecParser.toJson(spec) - + "')")) - .isInstanceOf(IllegalArgumentException.class) - .hasMessageStartingWith("Failed to execute Hive query") - .hasMessageEndingWith( - "Provide only one of the following: Hive partition specification, or the iceberg.mr.table.partition.spec property"); - } - - @TestTemplate - public void testCreateTableWithColumnSpecificationHierarchy() { - TableIdentifier identifier = TableIdentifier.of("default", "customers"); - - shell.executeStatement( - "CREATE EXTERNAL TABLE customers (" - + "id BIGINT, name STRING, " - + "employee_info STRUCT < employer: STRING, id: BIGINT, address: STRING >, " - + "places_lived ARRAY < STRUCT >, " - + "memorable_moments MAP < STRING, STRUCT < year: INT, place: STRING, details: STRING >>, " - + "current_address STRUCT < street_address: STRUCT " - + ", country: STRING, postal_code: STRING >) " - + "STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler' " - + testTables.locationForCreateTableSQL(identifier) - + testTables.propertiesForCreateTableSQL(ImmutableMap.of())); - - // Check the Iceberg table data - org.apache.iceberg.Table icebergTable = testTables.loadTable(identifier); - assertThat(icebergTable.schema().asStruct()).isEqualTo(COMPLEX_SCHEMA.asStruct()); - } - - @TestTemplate - public void testCreateTableWithAllSupportedTypes() { - TableIdentifier identifier = TableIdentifier.of("default", "all_types"); - Schema allSupportedSchema = - new Schema( - optional(1, "t_float", Types.FloatType.get()), - optional(2, "t_double", Types.DoubleType.get()), - optional(3, "t_boolean", Types.BooleanType.get()), - optional(4, "t_int", Types.IntegerType.get()), - optional(5, "t_bigint", Types.LongType.get()), - optional(6, "t_binary", Types.BinaryType.get()), - optional(7, "t_string", Types.StringType.get()), - optional(8, "t_timestamp", Types.TimestampType.withoutZone()), - optional(9, "t_date", Types.DateType.get()), - optional(10, "t_decimal", Types.DecimalType.of(3, 2))); - - // Intentionally adding some mixed letters to test that we handle them correctly - shell.executeStatement( - "CREATE EXTERNAL TABLE all_types (" - + "t_Float FLOaT, t_dOuble DOUBLE, t_boolean BOOLEAN, t_int INT, t_bigint BIGINT, t_binary BINARY, " - + "t_string STRING, t_timestamp TIMESTAMP, t_date DATE, t_decimal DECIMAL(3,2)) " - + "STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler' " - + testTables.locationForCreateTableSQL(identifier) - + testTables.propertiesForCreateTableSQL(ImmutableMap.of())); - - // Check the Iceberg table data - org.apache.iceberg.Table icebergTable = testTables.loadTable(identifier); - assertThat(icebergTable.schema().asStruct()).isEqualTo(allSupportedSchema.asStruct()); - } - - @TestTemplate - public void testCreateTableWithNotSupportedTypes() { - TableIdentifier identifier = TableIdentifier.of("default", "not_supported_types"); - // Can not create INTERVAL types from normal create table, so leave them out from this test - Map notSupportedTypes = - ImmutableMap.of( - "TINYINT", Types.IntegerType.get(), - "SMALLINT", Types.IntegerType.get(), - "VARCHAR(1)", Types.StringType.get(), - "CHAR(1)", Types.StringType.get()); - - for (String notSupportedType : notSupportedTypes.keySet()) { - assertThatThrownBy( - () -> - shell.executeStatement( - "CREATE EXTERNAL TABLE not_supported_types " - + "(not_supported " - + notSupportedType - + ") " - + "STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler' " - + testTables.locationForCreateTableSQL(identifier) - + testTables.propertiesForCreateTableSQL(ImmutableMap.of()))) - .isInstanceOf(IllegalArgumentException.class) - .hasMessageStartingWith("Failed to execute Hive query") - .hasMessageContaining("Unsupported Hive type"); - } - } - - @TestTemplate - public void testCreateTableWithNotSupportedTypesWithAutoConversion() { - TableIdentifier identifier = TableIdentifier.of("default", "not_supported_types"); - // Can not create INTERVAL types from normal create table, so leave them out from this test - Map notSupportedTypes = - ImmutableMap.of( - "TINYINT", - Types.IntegerType.get(), - "SMALLINT", - Types.IntegerType.get(), - "VARCHAR(1)", - Types.StringType.get(), - "CHAR(1)", - Types.StringType.get()); - - shell.setHiveSessionValue(InputFormatConfig.SCHEMA_AUTO_CONVERSION, "true"); - - for (String notSupportedType : notSupportedTypes.keySet()) { - shell.executeStatement( - "CREATE EXTERNAL TABLE not_supported_types (not_supported " - + notSupportedType - + ") " - + "STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler' " - + testTables.locationForCreateTableSQL(identifier) - + testTables.propertiesForCreateTableSQL(ImmutableMap.of())); - - org.apache.iceberg.Table icebergTable = testTables.loadTable(identifier); - assertThat(icebergTable.schema().columns().get(0).type()) - .isEqualTo(notSupportedTypes.get(notSupportedType)); - shell.executeStatement("DROP TABLE not_supported_types"); - } - } - - @TestTemplate - public void testCreateTableWithColumnComments() { - TableIdentifier identifier = TableIdentifier.of("default", "comment_table"); - shell.executeStatement( - "CREATE EXTERNAL TABLE comment_table (" - + "t_int INT COMMENT 'int column', " - + "t_string STRING COMMENT 'string column', " - + "t_string_2 STRING) " - + "STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler' " - + testTables.locationForCreateTableSQL(identifier) - + testTables.propertiesForCreateTableSQL(ImmutableMap.of())); - org.apache.iceberg.Table icebergTable = testTables.loadTable(identifier); - - List rows = shell.executeStatement("DESCRIBE default.comment_table"); - assertThat(rows).hasSameSizeAs(icebergTable.schema().columns()); - for (int i = 0; i < icebergTable.schema().columns().size(); i++) { - Types.NestedField field = icebergTable.schema().columns().get(i); - assertThat(rows.get(i)) - .containsExactly( - field.name(), - HiveSchemaUtil.convert(field.type()).getTypeName(), - (field.doc() != null ? field.doc() : "from deserializer")); - } - } - - @TestTemplate - public void testCreateTableWithoutColumnComments() { - TableIdentifier identifier = TableIdentifier.of("default", "without_comment_table"); - shell.executeStatement( - "CREATE EXTERNAL TABLE without_comment_table (" - + "t_int INT, " - + "t_string STRING) " - + "STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler' " - + testTables.locationForCreateTableSQL(identifier) - + testTables.propertiesForCreateTableSQL(ImmutableMap.of())); - org.apache.iceberg.Table icebergTable = testTables.loadTable(identifier); - - List rows = shell.executeStatement("DESCRIBE default.without_comment_table"); - assertThat(rows).hasSameSizeAs(icebergTable.schema().columns()); - for (int i = 0; i < icebergTable.schema().columns().size(); i++) { - Types.NestedField field = icebergTable.schema().columns().get(i); - assertThat(field.doc()).isNull(); - assertThat(rows.get(i)) - .containsExactly( - field.name(), - HiveSchemaUtil.convert(field.type()).getTypeName(), - (field.doc() != null ? field.doc() : "from deserializer")); - } - } - - @TestTemplate - public void testIcebergAndHmsTableProperties() throws Exception { - TableIdentifier identifier = TableIdentifier.of("default", "customers"); - - shell.executeStatement( - String.format( - "CREATE EXTERNAL TABLE default.customers " - + "STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler' %s" - + "TBLPROPERTIES ('%s'='%s', '%s'='%s', '%s'='%s', '%s'='%s')", - testTables.locationForCreateTableSQL( - identifier), // we need the location for HadoopTable based tests only - InputFormatConfig.TABLE_SCHEMA, - SchemaParser.toJson(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA), - InputFormatConfig.PARTITION_SPEC, - PartitionSpecParser.toJson(SPEC), - "custom_property", - "initial_val", - InputFormatConfig.CATALOG_NAME, - testTables.catalogName())); - - // Check the Iceberg table parameters - org.apache.iceberg.Table icebergTable = testTables.loadTable(identifier); - - Map expectedIcebergProperties = Maps.newHashMap(); - expectedIcebergProperties.put("custom_property", "initial_val"); - expectedIcebergProperties.put("EXTERNAL", "TRUE"); - expectedIcebergProperties.put("storage_handler", HiveIcebergStorageHandler.class.getName()); - expectedIcebergProperties.put( - TableProperties.PARQUET_COMPRESSION, - TableProperties.PARQUET_COMPRESSION_DEFAULT_SINCE_1_4_0); - - // Check the HMS table parameters - org.apache.hadoop.hive.metastore.api.Table hmsTable = - shell.metastore().getTable("default", "customers"); - Map hmsParams = - hmsTable.getParameters().entrySet().stream() - .filter(e -> !IGNORED_PARAMS.contains(e.getKey())) - .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); - Properties tableProperties = new Properties(); - tableProperties.putAll(hmsParams); - - if (Catalogs.hiveCatalog(shell.getHiveConf(), tableProperties)) { - expectedIcebergProperties.put(TableProperties.ENGINE_HIVE_ENABLED, "true"); - } - if (HiveVersion.min(HiveVersion.HIVE_3)) { - expectedIcebergProperties.put("bucketing_version", "2"); - } - assertThat(icebergTable.properties()).isEqualTo((expectedIcebergProperties)); - - if (Catalogs.hiveCatalog(shell.getHiveConf(), tableProperties)) { - assertThat(hmsParams) - .hasSize(15) - .containsEntry("custom_property", "initial_val") - .containsEntry(InputFormatConfig.EXTERNAL_TABLE_PURGE, "TRUE") - .containsEntry("EXTERNAL", "TRUE") - .containsEntry(TableProperties.ENGINE_HIVE_ENABLED, "true") - .containsEntry( - hive_metastoreConstants.META_TABLE_STORAGE, HiveIcebergStorageHandler.class.getName()) - .containsEntry( - BaseMetastoreTableOperations.TABLE_TYPE_PROP, - BaseMetastoreTableOperations.ICEBERG_TABLE_TYPE_VALUE.toUpperCase(Locale.ROOT)) - .containsEntry( - BaseMetastoreTableOperations.METADATA_LOCATION_PROP, - getCurrentSnapshotForHiveCatalogTable(icebergTable)) - .doesNotContainKey(BaseMetastoreTableOperations.PREVIOUS_METADATA_LOCATION_PROP) - .containsKey(hive_metastoreConstants.DDL_TIME) - .containsKey(InputFormatConfig.PARTITION_SPEC); - } else { - assertThat(hmsParams).hasSize(8).doesNotContainKey(TableProperties.ENGINE_HIVE_ENABLED); - } - - // Check HMS inputformat/outputformat/serde - assertThat(hmsTable.getSd().getInputFormat()).isEqualTo(HiveIcebergInputFormat.class.getName()); - assertThat(hmsTable.getSd().getOutputFormat()) - .isEqualTo(HiveIcebergOutputFormat.class.getName()); - assertThat(hmsTable.getSd().getSerdeInfo().getSerializationLib()) - .isEqualTo(HiveIcebergSerDe.class.getName()); - - // Add two new properties to the Iceberg table and update an existing one - icebergTable - .updateProperties() - .set("new_prop_1", "true") - .set("new_prop_2", "false") - .set("custom_property", "new_val") - .commit(); - - // Refresh the HMS table to see if new Iceberg properties got synced into HMS - hmsParams = - shell.metastore().getTable("default", "customers").getParameters().entrySet().stream() - .filter(e -> !IGNORED_PARAMS.contains(e.getKey())) - .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); - - if (Catalogs.hiveCatalog(shell.getHiveConf(), tableProperties)) { - assertThat(hmsParams) - .hasSize(18) - .containsEntry("new_prop_1", "true") - .containsEntry("new_prop_2", "false") - .containsEntry("custom_property", "new_val"); - String prevSnapshot = getCurrentSnapshotForHiveCatalogTable(icebergTable); - icebergTable.refresh(); - String newSnapshot = getCurrentSnapshotForHiveCatalogTable(icebergTable); - assertThat(hmsParams) - .containsEntry(BaseMetastoreTableOperations.PREVIOUS_METADATA_LOCATION_PROP, prevSnapshot) - .containsEntry(BaseMetastoreTableOperations.METADATA_LOCATION_PROP, newSnapshot); - } else { - assertThat(hmsParams).hasSize(8); - } - - // Remove some Iceberg props and see if they're removed from HMS table props as well - if (Catalogs.hiveCatalog(shell.getHiveConf(), tableProperties)) { - icebergTable.updateProperties().remove("custom_property").remove("new_prop_1").commit(); - hmsParams = shell.metastore().getTable("default", "customers").getParameters(); - assertThat(hmsParams) - .doesNotContainKey("custom_property") - .doesNotContainKey("new_prop_1") - .containsKey("new_prop_2"); - } - - // append some data and check whether HMS stats are aligned with snapshot summary - if (Catalogs.hiveCatalog(shell.getHiveConf(), tableProperties)) { - List records = HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS; - testTables.appendIcebergTable( - shell.getHiveConf(), icebergTable, FileFormat.PARQUET, null, records); - hmsParams = shell.metastore().getTable("default", "customers").getParameters(); - Map summary = icebergTable.currentSnapshot().summary(); - assertThat(hmsParams) - .containsEntry( - StatsSetupConst.NUM_FILES, summary.get(SnapshotSummary.TOTAL_DATA_FILES_PROP)) - .containsEntry(StatsSetupConst.ROW_COUNT, summary.get(SnapshotSummary.TOTAL_RECORDS_PROP)) - .containsEntry( - StatsSetupConst.TOTAL_SIZE, summary.get(SnapshotSummary.TOTAL_FILE_SIZE_PROP)); - } - } - - @TestTemplate - public void testIcebergHMSPropertiesTranslation() throws Exception { - assumeThat(testTableType) - .as("Iceberg - HMS property translation is only relevant for HiveCatalog") - .isEqualTo(TestTables.TestTableType.HIVE_CATALOG); - - TableIdentifier identifier = TableIdentifier.of("default", "customers"); - - // Create HMS table with with a property to be translated - shell.executeStatement( - String.format( - "CREATE EXTERNAL TABLE default.customers " - + "STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler'" - + "TBLPROPERTIES ('%s'='%s', '%s'='%s', '%s'='%s')", - InputFormatConfig.TABLE_SCHEMA, - SchemaParser.toJson(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA), - InputFormatConfig.PARTITION_SPEC, - PartitionSpecParser.toJson(SPEC), - InputFormatConfig.EXTERNAL_TABLE_PURGE, - "false")); - - // Check that HMS table prop was translated to equivalent Iceberg prop (purge -> gc.enabled) - org.apache.iceberg.Table icebergTable = testTables.loadTable(identifier); - assertThat(icebergTable.properties()) - .containsEntry(GC_ENABLED, "false") - .doesNotContainKey(InputFormatConfig.EXTERNAL_TABLE_PURGE); - - // Change Iceberg prop - icebergTable.updateProperties().set(GC_ENABLED, "true").commit(); - - // Check that Iceberg prop was translated to equivalent HMS prop (gc.enabled -> purge) - Map hmsParams = - shell.metastore().getTable("default", "customers").getParameters(); - assertThat(hmsParams) - .containsEntry(InputFormatConfig.EXTERNAL_TABLE_PURGE, "true") - .doesNotContainKey(GC_ENABLED); - } - - @TestTemplate - public void testDropTableWithAppendedData() throws IOException { - TableIdentifier identifier = TableIdentifier.of("default", "customers"); - - testTables.createTable( - shell, - identifier.name(), - HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, - SPEC, - FileFormat.PARQUET, - ImmutableList.of()); - - org.apache.iceberg.Table icebergTable = testTables.loadTable(identifier); - testTables.appendIcebergTable( - shell.getHiveConf(), - icebergTable, - FileFormat.PARQUET, - null, - HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS); - - shell.executeStatement("DROP TABLE customers"); - } - - @TestTemplate - public void testDropHiveTableWithoutUnderlyingTable() throws IOException { - assumeThat(testTableType) - .as("Not relevant for HiveCatalog") - .isNotEqualTo(TestTables.TestTableType.HIVE_CATALOG); - - TableIdentifier identifier = TableIdentifier.of("default", "customers"); - // Create the Iceberg table in non-HiveCatalog - testTables.createIcebergTable( - shell.getHiveConf(), - identifier.name(), - HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, - FileFormat.PARQUET, - HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS); - - // Create Hive table on top - String tableLocation = testTables.locationForCreateTableSQL(identifier); - shell.executeStatement( - testTables.createHiveTableSQL( - identifier, ImmutableMap.of(InputFormatConfig.EXTERNAL_TABLE_PURGE, "TRUE"))); - - // Drop the Iceberg table - Properties properties = new Properties(); - properties.put(Catalogs.NAME, identifier.toString()); - properties.put(Catalogs.LOCATION, tableLocation); - Catalogs.dropTable(shell.getHiveConf(), properties); - - // Finally drop the Hive table as well - shell.executeStatement("DROP TABLE " + identifier); - } - - private String getCurrentSnapshotForHiveCatalogTable(org.apache.iceberg.Table icebergTable) { - return ((BaseMetastoreTableOperations) ((BaseTable) icebergTable).operations()) - .currentMetadataLocation(); - } -} diff --git a/mr/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergStorageHandlerTimezone.java b/mr/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergStorageHandlerTimezone.java deleted file mode 100644 index b8a454d01f02..000000000000 --- a/mr/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergStorageHandlerTimezone.java +++ /dev/null @@ -1,184 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.mr.hive; - -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.assertj.core.api.Assertions.assertThat; - -import java.io.IOException; -import java.nio.file.Path; -import java.text.DateFormat; -import java.time.LocalDate; -import java.time.LocalDateTime; -import java.util.Collection; -import java.util.List; -import java.util.Optional; -import java.util.TimeZone; -import org.apache.hadoop.hive.serde2.io.DateWritable; -import org.apache.hadoop.hive.serde2.io.TimestampWritable; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.Parameter; -import org.apache.iceberg.ParameterizedTestExtension; -import org.apache.iceberg.Parameters; -import org.apache.iceberg.Schema; -import org.apache.iceberg.common.DynFields; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.mr.TestHelper; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.types.Types; -import org.junit.jupiter.api.AfterAll; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeAll; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.TestTemplate; -import org.junit.jupiter.api.extension.ExtendWith; -import org.junit.jupiter.api.io.TempDir; - -@ExtendWith(ParameterizedTestExtension.class) -public class TestHiveIcebergStorageHandlerTimezone { - private static final Optional> DATE_FORMAT = - Optional.ofNullable( - (ThreadLocal) - DynFields.builder() - .hiddenImpl(TimestampWritable.class, "threadLocalDateFormat") - .defaultAlwaysNull() - .buildStatic() - .get()); - - private static final Optional> LOCAL_TIME_ZONE = - Optional.ofNullable( - (ThreadLocal) - DynFields.builder() - .hiddenImpl(DateWritable.class, "LOCAL_TIMEZONE") - .defaultAlwaysNull() - .buildStatic() - .get()); - - @Parameters(name = "timezone={0}") - public static Collection parameters() { - return ImmutableList.of( - new String[] {"America/New_York"}, - new String[] {"Asia/Kolkata"}, - new String[] {"UTC/Greenwich"}); - } - - private static TestHiveShell shell; - - private TestTables testTables; - - @Parameter private String timezoneString; - - @TempDir private Path temp; - - @BeforeAll - public static void beforeClass() { - shell = HiveIcebergStorageHandlerTestUtils.shell(); - } - - @AfterAll - public static void afterClass() throws Exception { - shell.stop(); - } - - @BeforeEach - public void before() throws IOException { - TimeZone.setDefault(TimeZone.getTimeZone(timezoneString)); - - // Magic to clean cached date format and local timezone for Hive where the default timezone is - // used/stored in the - // cached object - DATE_FORMAT.ifPresent(ThreadLocal::remove); - LOCAL_TIME_ZONE.ifPresent(ThreadLocal::remove); - - this.testTables = - HiveIcebergStorageHandlerTestUtils.testTables( - shell, TestTables.TestTableType.HIVE_CATALOG, temp); - // Uses spark as an engine so we can detect if we unintentionally try to use any execution - // engines - HiveIcebergStorageHandlerTestUtils.init(shell, testTables, temp, "spark"); - } - - @AfterEach - public void after() throws Exception { - HiveIcebergStorageHandlerTestUtils.close(shell); - } - - @TestTemplate - public void testDateQuery() throws IOException { - Schema dateSchema = new Schema(optional(1, "d_date", Types.DateType.get())); - - List records = - TestHelper.RecordsBuilder.newInstance(dateSchema) - .add(LocalDate.of(2020, 1, 21)) - .add(LocalDate.of(2020, 1, 24)) - .build(); - - testTables.createTable(shell, "date_test", dateSchema, FileFormat.PARQUET, records); - - List result = - shell.executeStatement("SELECT * from date_test WHERE d_date='2020-01-21'"); - assertThat(result).hasSize(1); - assertThat(result.get(0)[0]).isEqualTo("2020-01-21"); - - result = - shell.executeStatement( - "SELECT * from date_test WHERE d_date in ('2020-01-21', '2020-01-22')"); - assertThat(result).hasSize(1); - assertThat(result.get(0)[0]).isEqualTo("2020-01-21"); - - result = shell.executeStatement("SELECT * from date_test WHERE d_date > '2020-01-21'"); - assertThat(result).hasSize(1); - assertThat(result.get(0)[0]).isEqualTo("2020-01-24"); - - result = shell.executeStatement("SELECT * from date_test WHERE d_date='2020-01-20'"); - assertThat(result).isEmpty(); - } - - @TestTemplate - public void testTimestampQuery() throws IOException { - Schema timestampSchema = new Schema(optional(1, "d_ts", Types.TimestampType.withoutZone())); - - List records = - TestHelper.RecordsBuilder.newInstance(timestampSchema) - .add(LocalDateTime.of(2019, 1, 22, 9, 44, 54, 100000000)) - .add(LocalDateTime.of(2019, 2, 22, 9, 44, 54, 200000000)) - .build(); - - testTables.createTable(shell, "ts_test", timestampSchema, FileFormat.PARQUET, records); - - List result = - shell.executeStatement("SELECT d_ts FROM ts_test WHERE d_ts='2019-02-22 09:44:54.2'"); - assertThat(result).hasSize(1); - assertThat(result.get(0)[0]).isEqualTo("2019-02-22 09:44:54.2"); - - result = - shell.executeStatement( - "SELECT * FROM ts_test WHERE d_ts in ('2017-01-01 22:30:57.1', '2019-02-22 09:44:54.2')"); - assertThat(result).hasSize(1); - assertThat(result.get(0)[0]).isEqualTo("2019-02-22 09:44:54.2"); - - result = - shell.executeStatement("SELECT d_ts FROM ts_test WHERE d_ts < '2019-02-22 09:44:54.2'"); - assertThat(result).hasSize(1); - assertThat(result.get(0)[0]).isEqualTo("2019-01-22 09:44:54.1"); - - result = shell.executeStatement("SELECT * FROM ts_test WHERE d_ts='2017-01-01 22:30:57.3'"); - assertThat(result).isEmpty(); - } -} diff --git a/mr/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergStorageHandlerWithEngine.java b/mr/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergStorageHandlerWithEngine.java deleted file mode 100644 index ce3a6fd92441..000000000000 --- a/mr/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergStorageHandlerWithEngine.java +++ /dev/null @@ -1,1364 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.mr.hive; - -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assumptions.assumeThat; - -import java.io.IOException; -import java.nio.file.Path; -import java.time.LocalDate; -import java.time.LocalDateTime; -import java.util.Collection; -import java.util.List; -import java.util.Locale; -import java.util.Map; -import java.util.concurrent.TimeUnit; -import java.util.stream.Collectors; -import org.apache.hadoop.hive.common.StatsSetupConst; -import org.apache.hadoop.hive.conf.HiveConf; -import org.apache.hadoop.hive.ql.exec.mr.ExecMapper; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.Parameter; -import org.apache.iceberg.ParameterizedTestExtension; -import org.apache.iceberg.Parameters; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.catalog.TableIdentifier; -import org.apache.iceberg.data.GenericRecord; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.hadoop.ConfigProperties; -import org.apache.iceberg.hive.HiveSchemaUtil; -import org.apache.iceberg.hive.HiveVersion; -import org.apache.iceberg.mr.InputFormatConfig; -import org.apache.iceberg.mr.TestHelper; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.types.Type; -import org.apache.iceberg.types.Types; -import org.junit.jupiter.api.AfterAll; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeAll; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.TestTemplate; -import org.junit.jupiter.api.Timeout; -import org.junit.jupiter.api.extension.ExtendWith; -import org.junit.jupiter.api.io.TempDir; - -@ExtendWith(ParameterizedTestExtension.class) -@Timeout(value = 200_000, unit = TimeUnit.MILLISECONDS) -public class TestHiveIcebergStorageHandlerWithEngine { - - private static final String[] EXECUTION_ENGINES = new String[] {"tez", "mr"}; - - private static final Schema ORDER_SCHEMA = - new Schema( - required(1, "order_id", Types.LongType.get()), - required(2, "customer_id", Types.LongType.get()), - required(3, "total", Types.DoubleType.get()), - required(4, "product_id", Types.LongType.get())); - - private static final List ORDER_RECORDS = - TestHelper.RecordsBuilder.newInstance(ORDER_SCHEMA) - .add(100L, 0L, 11.11d, 1L) - .add(101L, 0L, 22.22d, 2L) - .add(102L, 1L, 33.33d, 3L) - .build(); - - private static final Schema PRODUCT_SCHEMA = - new Schema( - optional(1, "id", Types.LongType.get()), - optional(2, "name", Types.StringType.get()), - optional(3, "price", Types.DoubleType.get())); - - private static final List PRODUCT_RECORDS = - TestHelper.RecordsBuilder.newInstance(PRODUCT_SCHEMA) - .add(1L, "skirt", 11.11d) - .add(2L, "tee", 22.22d) - .add(3L, "watch", 33.33d) - .build(); - - private static final List SUPPORTED_TYPES = - ImmutableList.of( - Types.BooleanType.get(), - Types.IntegerType.get(), - Types.LongType.get(), - Types.FloatType.get(), - Types.DoubleType.get(), - Types.DateType.get(), - Types.TimestampType.withZone(), - Types.TimestampType.withoutZone(), - Types.StringType.get(), - Types.BinaryType.get(), - Types.DecimalType.of(3, 1), - Types.UUIDType.get(), - Types.FixedType.ofLength(5), - Types.TimeType.get()); - - @Parameters(name = "fileFormat={0}, engine={1}, catalog={2}, isVectorized={3}") - public static Collection parameters() { - Collection testParams = Lists.newArrayList(); - String javaVersion = System.getProperty("java.specification.version"); - - // Run tests with every FileFormat for a single Catalog (HiveCatalog) - for (FileFormat fileFormat : HiveIcebergStorageHandlerTestUtils.FILE_FORMATS) { - for (String engine : EXECUTION_ENGINES) { - // include Tez tests only for Java 8 - if (javaVersion.equals("1.8") || "mr".equals(engine)) { - testParams.add( - new Object[] {fileFormat, engine, TestTables.TestTableType.HIVE_CATALOG, false}); - // test for vectorization=ON in case of ORC format and Tez engine - if ((fileFormat == FileFormat.PARQUET || fileFormat == FileFormat.ORC) - && "tez".equals(engine) - && HiveVersion.min(HiveVersion.HIVE_3)) { - testParams.add( - new Object[] {fileFormat, engine, TestTables.TestTableType.HIVE_CATALOG, true}); - } - } - } - } - - // Run tests for every Catalog for a single FileFormat (PARQUET) and execution engine (mr) - // skip HiveCatalog tests as they are added before - for (TestTables.TestTableType testTableType : TestTables.ALL_TABLE_TYPES) { - if (!TestTables.TestTableType.HIVE_CATALOG.equals(testTableType)) { - testParams.add(new Object[] {FileFormat.PARQUET, "mr", testTableType, false}); - } - } - - return testParams; - } - - private static TestHiveShell shell; - - private TestTables testTables; - - @Parameter(index = 0) - private FileFormat fileFormat; - - @Parameter(index = 1) - private String executionEngine; - - @Parameter(index = 2) - private TestTables.TestTableType testTableType; - - @Parameter(index = 3) - private boolean isVectorized; - - @TempDir private Path temp; - - @BeforeAll - public static void beforeClass() { - shell = HiveIcebergStorageHandlerTestUtils.shell(); - } - - @AfterAll - public static void afterClass() throws Exception { - shell.stop(); - } - - @BeforeEach - public void before() throws IOException { - testTables = HiveIcebergStorageHandlerTestUtils.testTables(shell, testTableType, temp); - HiveIcebergStorageHandlerTestUtils.init(shell, testTables, temp, executionEngine); - HiveConf.setBoolVar( - shell.getHiveConf(), HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED, isVectorized); - if (isVectorized) { - HiveConf.setVar(shell.getHiveConf(), HiveConf.ConfVars.HIVEFETCHTASKCONVERSION, "none"); - } else { - HiveConf.setVar(shell.getHiveConf(), HiveConf.ConfVars.HIVEFETCHTASKCONVERSION, "more"); - } - } - - @AfterEach - public void after() throws Exception { - HiveIcebergStorageHandlerTestUtils.close(shell); - // Mixing mr and tez jobs within the same JVM can cause problems. Mr jobs set the ExecMapper - // status to done=false - // at the beginning and to done=true at the end. However, tez jobs also rely on this value to - // see if they should - // proceed, but they do not reset it to done=false at the beginning. Therefore, without calling - // this after each test - // case, any tez job that follows a completed mr job will erroneously read done=true and will - // not proceed. - ExecMapper.setDone(false); - } - - @TestTemplate - public void testScanTable() throws IOException { - testTables.createTable( - shell, - "customers", - HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, - fileFormat, - HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS); - - // Adding the ORDER BY clause will cause Hive to spawn a local MR job this time. - List descRows = - shell.executeStatement( - "SELECT first_name, customer_id FROM default.customers ORDER BY customer_id DESC"); - - assertThat(descRows) - .containsExactly( - new Object[] {"Trudy", 2L}, new Object[] {"Bob", 1L}, new Object[] {"Alice", 0L}); - } - - @TestTemplate - public void testCBOWithSelectedColumnsNonOverlapJoin() throws IOException { - shell.setHiveSessionValue("hive.cbo.enable", true); - - testTables.createTable(shell, "products", PRODUCT_SCHEMA, fileFormat, PRODUCT_RECORDS); - testTables.createTable(shell, "orders", ORDER_SCHEMA, fileFormat, ORDER_RECORDS); - - List rows = - shell.executeStatement( - "SELECT o.order_id, o.customer_id, o.total, p.name " - + "FROM default.orders o JOIN default.products p ON o.product_id = p.id ORDER BY o.order_id"); - - assertThat(rows) - .containsExactly( - new Object[] {100L, 0L, 11.11d, "skirt"}, - new Object[] {101L, 0L, 22.22d, "tee"}, - new Object[] {102L, 1L, 33.33d, "watch"}); - } - - @TestTemplate - public void testDescribeTable() throws IOException { - testTables.createTable( - shell, - "customers", - HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, - fileFormat, - HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS); - List rows = shell.executeStatement("DESCRIBE default.customers"); - assertThat(rows).hasSameSizeAs(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA.columns()); - for (int i = 0; i < HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA.columns().size(); i++) { - Types.NestedField field = HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA.columns().get(i); - String comment = field.doc() == null ? "from deserializer" : field.doc(); - assertThat(rows.get(i)) - .containsExactly( - field.name(), HiveSchemaUtil.convert(field.type()).getTypeName(), comment); - } - } - - @TestTemplate - public void testCBOWithSelectedColumnsOverlapJoin() throws IOException { - shell.setHiveSessionValue("hive.cbo.enable", true); - testTables.createTable( - shell, - "customers", - HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, - fileFormat, - HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS); - testTables.createTable(shell, "orders", ORDER_SCHEMA, fileFormat, ORDER_RECORDS); - - List rows = - shell.executeStatement( - "SELECT c.first_name, o.order_id " - + "FROM default.orders o JOIN default.customers c ON o.customer_id = c.customer_id " - + "ORDER BY o.order_id DESC"); - - assertThat(rows) - .containsExactly( - new Object[] {"Bob", 102L}, new Object[] {"Alice", 101L}, new Object[] {"Alice", 100L}); - } - - @TestTemplate - public void testCBOWithSelfJoin() throws IOException { - shell.setHiveSessionValue("hive.cbo.enable", true); - - testTables.createTable(shell, "orders", ORDER_SCHEMA, fileFormat, ORDER_RECORDS); - - List rows = - shell.executeStatement( - "SELECT o1.order_id, o1.customer_id, o1.total " - + "FROM default.orders o1 JOIN default.orders o2 ON o1.order_id = o2.order_id ORDER BY o1.order_id"); - - assertThat(rows) - .containsExactly( - new Object[] {100L, 0L, 11.11d}, - new Object[] {101L, 0L, 22.22d}, - new Object[] {102L, 1L, 33.33d}); - } - - @TestTemplate - public void testJoinTablesSupportedTypes() throws IOException { - for (int i = 0; i < SUPPORTED_TYPES.size(); i++) { - Type type = SUPPORTED_TYPES.get(i); - if (isUnsupportedVectorizedTypeForHive(type)) { - continue; - } - // TODO: remove this filter when issue #1881 is resolved - if (type == Types.UUIDType.get() && fileFormat == FileFormat.PARQUET) { - continue; - } - String tableName = type.typeId().toString().toLowerCase(Locale.ROOT) + "_table_" + i; - String columnName = type.typeId().toString().toLowerCase(Locale.ROOT) + "_column"; - - Schema schema = new Schema(required(1, columnName, type)); - List records = TestHelper.generateRandomRecords(schema, 1, 0L); - - testTables.createTable(shell, tableName, schema, fileFormat, records); - List queryResult = - shell.executeStatement( - "select s." - + columnName - + ", h." - + columnName - + " from default." - + tableName - + " s join default." - + tableName - + " h on h." - + columnName - + "=s." - + columnName); - assertThat(queryResult) - .as("Non matching record count for table " + tableName + " with type " + type) - .hasSize(1); - } - } - - @TestTemplate - public void testSelectDistinctFromTable() throws IOException { - for (int i = 0; i < SUPPORTED_TYPES.size(); i++) { - Type type = SUPPORTED_TYPES.get(i); - if (isUnsupportedVectorizedTypeForHive(type)) { - continue; - } - // TODO: remove this filter when issue #1881 is resolved - if (type == Types.UUIDType.get() && fileFormat == FileFormat.PARQUET) { - continue; - } - String tableName = type.typeId().toString().toLowerCase(Locale.ROOT) + "_table_" + i; - String columnName = type.typeId().toString().toLowerCase(Locale.ROOT) + "_column"; - - Schema schema = new Schema(required(1, columnName, type)); - List records = TestHelper.generateRandomRecords(schema, 4, 0L); - int size = - records.stream().map(r -> r.getField(columnName)).collect(Collectors.toSet()).size(); - testTables.createTable(shell, tableName, schema, fileFormat, records); - List queryResult = - shell.executeStatement( - "select count(distinct(" + columnName + ")) from default." + tableName); - int distinctIds = ((Long) queryResult.get(0)[0]).intValue(); - assertThat(distinctIds).as(tableName).isEqualTo(size); - } - } - - @TestTemplate - public void testInsert() throws IOException { - assumeThat(executionEngine).as("Tez write is not implemented yet").isEqualTo("mr"); - - Table table = - testTables.createTable( - shell, - "customers", - HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, - fileFormat, - ImmutableList.of()); - - // The expected query is like - // INSERT INTO customers VALUES (0, 'Alice'), (1, 'Bob'), (2, 'Trudy') - StringBuilder query = new StringBuilder().append("INSERT INTO customers VALUES "); - HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS.forEach( - record -> - query - .append("(") - .append(record.get(0)) - .append(",'") - .append(record.get(1)) - .append("','") - .append(record.get(2)) - .append("'),")); - query.setLength(query.length() - 1); - - shell.executeStatement(query.toString()); - - HiveIcebergTestUtils.validateData( - table, HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS, 0); - } - - @TestTemplate - public void testInsertSupportedTypes() throws IOException { - assumeThat(executionEngine).as("Tez write is not implemented yet").isEqualTo("mr"); - for (int i = 0; i < SUPPORTED_TYPES.size(); i++) { - Type type = SUPPORTED_TYPES.get(i); - // TODO: remove this filter when issue #1881 is resolved - if (type == Types.UUIDType.get() && fileFormat == FileFormat.PARQUET) { - continue; - } - // TODO: remove this filter when we figure out how we could test binary types - if (type.equals(Types.BinaryType.get()) || type.equals(Types.FixedType.ofLength(5))) { - continue; - } - String columnName = type.typeId().toString().toLowerCase(Locale.ROOT) + "_column"; - - Schema schema = - new Schema(required(1, "id", Types.LongType.get()), required(2, columnName, type)); - List expected = TestHelper.generateRandomRecords(schema, 5, 0L); - - Table table = - testTables.createTable( - shell, - type.typeId().toString().toLowerCase(Locale.ROOT) + "_table_" + i, - schema, - PartitionSpec.unpartitioned(), - fileFormat, - expected); - - HiveIcebergTestUtils.validateData(table, expected, 0); - } - } - - /** - * Testing map only inserts. - * - * @throws IOException If there is an underlying IOException - */ - @TestTemplate - public void testInsertFromSelect() throws IOException { - assumeThat(executionEngine).as("Tez write is not implemented yet").isEqualTo("mr"); - - Table table = - testTables.createTable( - shell, - "customers", - HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, - fileFormat, - HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS); - - shell.executeStatement("INSERT INTO customers SELECT * FROM customers"); - - // Check that everything is duplicated as expected - List records = Lists.newArrayList(HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS); - records.addAll(HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS); - HiveIcebergTestUtils.validateData(table, records, 0); - } - - /** - * Testing map-reduce inserts. - * - * @throws IOException If there is an underlying IOException - */ - @TestTemplate - public void testInsertFromSelectWithOrderBy() throws IOException { - assumeThat(executionEngine).as("Tez write is not implemented yet").isEqualTo("mr"); - - Table table = - testTables.createTable( - shell, - "customers", - HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, - fileFormat, - HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS); - - // We expect that there will be Mappers and Reducers here - shell.executeStatement("INSERT INTO customers SELECT * FROM customers ORDER BY customer_id"); - - // Check that everything is duplicated as expected - List records = Lists.newArrayList(HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS); - records.addAll(HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS); - HiveIcebergTestUtils.validateData(table, records, 0); - } - - @TestTemplate - public void testInsertFromSelectWithProjection() throws IOException { - assumeThat(executionEngine).as("Tez write is not implemented yet").isEqualTo("mr"); - - Table table = - testTables.createTable( - shell, - "customers", - HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, - fileFormat, - ImmutableList.of()); - testTables.createTable(shell, "orders", ORDER_SCHEMA, fileFormat, ORDER_RECORDS); - - shell.executeStatement( - "INSERT INTO customers (customer_id, last_name) SELECT distinct(customer_id), 'test' FROM orders"); - - List expected = - TestHelper.RecordsBuilder.newInstance(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA) - .add(0L, null, "test") - .add(1L, null, "test") - .build(); - - HiveIcebergTestUtils.validateData(table, expected, 0); - } - - @TestTemplate - public void testInsertUsingSourceTableWithSharedColumnsNames() throws IOException { - assumeThat(executionEngine).as("Tez write is not implemented yet").isEqualTo("mr"); - - List records = HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS; - PartitionSpec spec = - PartitionSpec.builderFor(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA) - .identity("last_name") - .build(); - testTables.createTable( - shell, - "source_customers", - HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, - spec, - fileFormat, - records); - Table table = - testTables.createTable( - shell, - "target_customers", - HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, - spec, - fileFormat, - ImmutableList.of()); - - // Below select from source table should produce: - // "hive.io.file.readcolumn.names=customer_id,last_name". - // Inserting into the target table should not fail because first_name is not selected from the - // source table - shell.executeStatement( - "INSERT INTO target_customers SELECT customer_id, 'Sam', last_name FROM source_customers"); - - List expected = Lists.newArrayListWithExpectedSize(records.size()); - records.forEach( - r -> { - Record copy = r.copy(); - copy.setField("first_name", "Sam"); - expected.add(copy); - }); - HiveIcebergTestUtils.validateData(table, expected, 0); - } - - @TestTemplate - public void testInsertFromJoiningTwoIcebergTables() throws IOException { - assumeThat(executionEngine).as("Tez write is not implemented yet").isEqualTo("mr"); - - PartitionSpec spec = - PartitionSpec.builderFor(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA) - .identity("last_name") - .build(); - testTables.createTable( - shell, - "source_customers_1", - HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, - spec, - fileFormat, - HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS); - testTables.createTable( - shell, - "source_customers_2", - HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, - spec, - fileFormat, - HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS); - Table table = - testTables.createTable( - shell, - "target_customers", - HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, - spec, - fileFormat, - ImmutableList.of()); - - shell.executeStatement( - "INSERT INTO target_customers SELECT a.customer_id, b.first_name, a.last_name FROM " - + "source_customers_1 a JOIN source_customers_2 b ON a.last_name = b.last_name"); - - HiveIcebergTestUtils.validateData( - table, HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS, 0); - } - - @TestTemplate - public void testWriteArrayOfPrimitivesInTable() throws IOException { - assumeThat(executionEngine).as("Tez write is not implemented yet").isEqualTo("mr"); - Schema schema = - new Schema( - required(1, "id", Types.LongType.get()), - required(2, "arrayofprimitives", Types.ListType.ofRequired(3, Types.StringType.get()))); - List records = TestHelper.generateRandomRecords(schema, 5, 0L); - testComplexTypeWrite(schema, records); - } - - @TestTemplate - public void testWriteArrayOfArraysInTable() throws IOException { - assumeThat(executionEngine).as("Tez write is not implemented yet").isEqualTo("mr"); - Schema schema = - new Schema( - required(1, "id", Types.LongType.get()), - required( - 2, - "arrayofarrays", - Types.ListType.ofRequired( - 3, Types.ListType.ofRequired(4, Types.StringType.get())))); - List records = TestHelper.generateRandomRecords(schema, 3, 1L); - testComplexTypeWrite(schema, records); - } - - @TestTemplate - public void testWriteArrayOfMapsInTable() throws IOException { - assumeThat(executionEngine).as("Tez write is not implemented yet").isEqualTo("mr"); - Schema schema = - new Schema( - required(1, "id", Types.LongType.get()), - required( - 2, - "arrayofmaps", - Types.ListType.ofRequired( - 3, - Types.MapType.ofRequired( - 4, 5, Types.StringType.get(), Types.StringType.get())))); - List records = TestHelper.generateRandomRecords(schema, 5, 1L); - testComplexTypeWrite(schema, records); - } - - @TestTemplate - public void testWriteArrayOfStructsInTable() throws IOException { - assumeThat(executionEngine).as("Tez write is not implemented yet").isEqualTo("mr"); - Schema schema = - new Schema( - required(1, "id", Types.LongType.get()), - required( - 2, - "arrayofstructs", - Types.ListType.ofRequired( - 3, - Types.StructType.of( - required(4, "something", Types.StringType.get()), - required(5, "someone", Types.StringType.get()), - required(6, "somewhere", Types.StringType.get()))))); - List records = TestHelper.generateRandomRecords(schema, 5, 0L); - testComplexTypeWrite(schema, records); - } - - @TestTemplate - public void testWriteMapOfPrimitivesInTable() throws IOException { - assumeThat(executionEngine).as("Tez write is not implemented yet").isEqualTo("mr"); - Schema schema = - new Schema( - required(1, "id", Types.LongType.get()), - required( - 2, - "mapofprimitives", - Types.MapType.ofRequired(3, 4, Types.StringType.get(), Types.StringType.get()))); - List records = TestHelper.generateRandomRecords(schema, 5, 0L); - testComplexTypeWrite(schema, records); - } - - @TestTemplate - public void testWriteMapOfArraysInTable() throws IOException { - assumeThat(executionEngine).as("Tez write is not implemented yet").isEqualTo("mr"); - Schema schema = - new Schema( - required(1, "id", Types.LongType.get()), - required( - 2, - "mapofarrays", - Types.MapType.ofRequired( - 3, - 4, - Types.StringType.get(), - Types.ListType.ofRequired(5, Types.StringType.get())))); - List records = TestHelper.generateRandomRecords(schema, 5, 0L); - testComplexTypeWrite(schema, records); - } - - @TestTemplate - public void testWriteMapOfMapsInTable() throws IOException { - assumeThat(executionEngine).as("Tez write is not implemented yet").isEqualTo("mr"); - Schema schema = - new Schema( - required(1, "id", Types.LongType.get()), - required( - 2, - "mapofmaps", - Types.MapType.ofRequired( - 3, - 4, - Types.StringType.get(), - Types.MapType.ofRequired( - 5, 6, Types.StringType.get(), Types.StringType.get())))); - List records = TestHelper.generateRandomRecords(schema, 5, 0L); - testComplexTypeWrite(schema, records); - } - - @TestTemplate - public void testWriteMapOfStructsInTable() throws IOException { - assumeThat(executionEngine).as("Tez write is not implemented yet").isEqualTo("mr"); - Schema schema = - new Schema( - required(1, "id", Types.LongType.get()), - required( - 2, - "mapofstructs", - Types.MapType.ofRequired( - 3, - 4, - Types.StringType.get(), - Types.StructType.of( - required(5, "something", Types.StringType.get()), - required(6, "someone", Types.StringType.get()), - required(7, "somewhere", Types.StringType.get()))))); - List records = TestHelper.generateRandomRecords(schema, 5, 0L); - testComplexTypeWrite(schema, records); - } - - @TestTemplate - public void testWriteStructOfPrimitivesInTable() throws IOException { - assumeThat(executionEngine).as("Tez write is not implemented yet").isEqualTo("mr"); - Schema schema = - new Schema( - required(1, "id", Types.LongType.get()), - required( - 2, - "structofprimitives", - Types.StructType.of( - required(3, "key", Types.StringType.get()), - required(4, "value", Types.StringType.get())))); - List records = TestHelper.generateRandomRecords(schema, 5, 0L); - testComplexTypeWrite(schema, records); - } - - @TestTemplate - public void testWriteStructOfArraysInTable() throws IOException { - assumeThat(executionEngine).as("Tez write is not implemented yet").isEqualTo("mr"); - Schema schema = - new Schema( - required(1, "id", Types.LongType.get()), - required( - 2, - "structofarrays", - Types.StructType.of( - required(3, "names", Types.ListType.ofRequired(4, Types.StringType.get())), - required( - 5, "birthdays", Types.ListType.ofRequired(6, Types.StringType.get()))))); - List records = TestHelper.generateRandomRecords(schema, 5, 1L); - testComplexTypeWrite(schema, records); - } - - @TestTemplate - public void testWriteStructOfMapsInTable() throws IOException { - assumeThat(executionEngine).as("Tez write is not implemented yet").isEqualTo("mr"); - Schema schema = - new Schema( - required(1, "id", Types.LongType.get()), - required( - 2, - "structofmaps", - Types.StructType.of( - required( - 3, - "map1", - Types.MapType.ofRequired( - 4, 5, Types.StringType.get(), Types.StringType.get())), - required( - 6, - "map2", - Types.MapType.ofRequired( - 7, 8, Types.StringType.get(), Types.StringType.get()))))); - List records = TestHelper.generateRandomRecords(schema, 5, 0L); - testComplexTypeWrite(schema, records); - } - - @TestTemplate - public void testWriteStructOfStructsInTable() throws IOException { - assumeThat(executionEngine).as("Tez write is not implemented yet").isEqualTo("mr"); - Schema schema = - new Schema( - required(1, "id", Types.LongType.get()), - required( - 2, - "structofstructs", - Types.StructType.of( - required( - 3, - "struct1", - Types.StructType.of( - required(4, "key", Types.StringType.get()), - required(5, "value", Types.StringType.get())))))); - List records = TestHelper.generateRandomRecords(schema, 5, 0L); - testComplexTypeWrite(schema, records); - } - - @TestTemplate - public void testPartitionedWrite() throws IOException { - assumeThat(executionEngine).as("Tez write is not implemented yet").isEqualTo("mr"); - - PartitionSpec spec = - PartitionSpec.builderFor(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA) - .bucket("customer_id", 3) - .build(); - - List records = - TestHelper.generateRandomRecords(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, 4, 0L); - - Table table = - testTables.createTable( - shell, - "partitioned_customers", - HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, - spec, - fileFormat, - records); - - HiveIcebergTestUtils.validateData(table, records, 0); - } - - @TestTemplate - public void testIdentityPartitionedWrite() throws IOException { - assumeThat(executionEngine).as("Tez write is not implemented yet").isEqualTo("mr"); - - PartitionSpec spec = - PartitionSpec.builderFor(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA) - .identity("customer_id") - .build(); - - List records = - TestHelper.generateRandomRecords(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, 4, 0L); - - Table table = - testTables.createTable( - shell, - "partitioned_customers", - HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, - spec, - fileFormat, - records); - - HiveIcebergTestUtils.validateData(table, records, 0); - } - - @TestTemplate - public void testMultilevelIdentityPartitionedWrite() throws IOException { - assumeThat(executionEngine).as("Tez write is not implemented yet").isEqualTo("mr"); - - PartitionSpec spec = - PartitionSpec.builderFor(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA) - .identity("customer_id") - .identity("last_name") - .build(); - - List records = - TestHelper.generateRandomRecords(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, 4, 0L); - - Table table = - testTables.createTable( - shell, - "partitioned_customers", - HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, - spec, - fileFormat, - records); - - HiveIcebergTestUtils.validateData(table, records, 0); - } - - @TestTemplate - public void testMultiTableInsert() throws IOException { - assumeThat(executionEngine).as("Tez write is not implemented yet").isEqualTo("mr"); - - testTables.createTable( - shell, - "customers", - HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, - fileFormat, - HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS); - - Schema target1Schema = - new Schema( - optional(1, "customer_id", Types.LongType.get()), - optional(2, "first_name", Types.StringType.get())); - - Schema target2Schema = - new Schema( - optional(1, "last_name", Types.StringType.get()), - optional(2, "customer_id", Types.LongType.get())); - - List target1Records = - TestHelper.RecordsBuilder.newInstance(target1Schema) - .add(0L, "Alice") - .add(1L, "Bob") - .add(2L, "Trudy") - .build(); - - List target2Records = - TestHelper.RecordsBuilder.newInstance(target2Schema) - .add("Brown", 0L) - .add("Green", 1L) - .add("Pink", 2L) - .build(); - - Table target1 = - testTables.createTable(shell, "target1", target1Schema, fileFormat, ImmutableList.of()); - Table target2 = - testTables.createTable(shell, "target2", target2Schema, fileFormat, ImmutableList.of()); - - // simple insert: should create a single vertex writing to both target tables - shell.executeStatement( - "FROM customers " - + "INSERT INTO target1 SELECT customer_id, first_name " - + "INSERT INTO target2 SELECT last_name, customer_id"); - - // Check that everything is as expected - HiveIcebergTestUtils.validateData(target1, target1Records, 0); - HiveIcebergTestUtils.validateData(target2, target2Records, 1); - - // truncate the target tables - testTables.truncateIcebergTable(target1); - testTables.truncateIcebergTable(target2); - - // complex insert: should use a different vertex for each target table - shell.executeStatement( - "FROM customers " - + "INSERT INTO target1 SELECT customer_id, first_name ORDER BY first_name " - + "INSERT INTO target2 SELECT last_name, customer_id ORDER BY last_name"); - - // Check that everything is as expected - HiveIcebergTestUtils.validateData(target1, target1Records, 0); - HiveIcebergTestUtils.validateData(target2, target2Records, 1); - } - - /** - * Fix vectorized parquet issue-4403. - */ - @TestTemplate - public void testStructMapWithNull() throws IOException { - assumeThat(!("PARQUET".equals(fileFormat.name()) && isVectorized)) - .as("Vectorized parquet throw class cast exception see : issue 4403") - .isTrue(); - Schema schema = - new Schema( - required(1, "id", Types.LongType.get()), - required( - 2, - "mapofstructs", - Types.MapType.ofRequired( - 3, - 4, - Types.StringType.get(), - Types.StructType.of( - required(5, "something", Types.StringType.get()), - required(6, "someone", Types.StringType.get()), - required(7, "somewhere", Types.StringType.get()))))); - - List records = - TestHelper.RecordsBuilder.newInstance(schema).add(0L, ImmutableMap.of()).build(); - - testTables.createTable(shell, "mapwithnull", schema, fileFormat, records); - - List results = - shell.executeStatement("select mapofstructs['context'].someone FROM mapwithnull"); - assertThat(results).hasSize(1); - assertThat(results.get(0)[0]).isNull(); - } - - @TestTemplate - public void testWriteWithDefaultWriteFormat() { - assumeThat( - executionEngine.equals("mr") - && testTableType == TestTables.TestTableType.HIVE_CATALOG - && fileFormat == FileFormat.ORC) - .as("Testing the default file format is enough for a single scenario.") - .isTrue(); - - TableIdentifier identifier = TableIdentifier.of("default", "customers"); - - // create Iceberg table without specifying a write format in the tbl properties - // it should fall back to using the default file format - shell.executeStatement( - String.format( - "CREATE EXTERNAL TABLE %s (id bigint, name string) STORED BY '%s' %s", - identifier, - HiveIcebergStorageHandler.class.getName(), - testTables.locationForCreateTableSQL(identifier))); - - shell.executeStatement(String.format("INSERT INTO %s VALUES (10, 'Linda')", identifier)); - List results = shell.executeStatement(String.format("SELECT * FROM %s", identifier)); - - assertThat(results).hasSize(1); - assertThat(results.get(0)).containsExactly(10L, "Linda"); - } - - @TestTemplate - public void testInsertEmptyResultSet() throws IOException { - Table source = - testTables.createTable( - shell, - "source", - HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, - fileFormat, - ImmutableList.of()); - Table target = - testTables.createTable( - shell, - "target", - HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, - fileFormat, - ImmutableList.of()); - - shell.executeStatement("INSERT INTO target SELECT * FROM source"); - HiveIcebergTestUtils.validateData(target, ImmutableList.of(), 0); - - testTables.appendIcebergTable( - shell.getHiveConf(), - source, - fileFormat, - null, - HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS); - shell.executeStatement("INSERT INTO target SELECT * FROM source WHERE first_name = 'Nobody'"); - HiveIcebergTestUtils.validateData(target, ImmutableList.of(), 0); - } - - @TestTemplate - public void testStatsPopulation() throws Exception { - assumeThat(executionEngine).as("Tez write is not implemented yet").isEqualTo("mr"); - assumeThat(testTableType) - .as("Only HiveCatalog can remove stats which become obsolete") - .isEqualTo(TestTables.TestTableType.HIVE_CATALOG); - shell.setHiveSessionValue(HiveConf.ConfVars.HIVESTATSAUTOGATHER.varname, true); - - // create the table using a catalog which supports updating Hive stats (KEEP_HIVE_STATS is true) - shell.setHiveSessionValue(ConfigProperties.KEEP_HIVE_STATS, true); - TableIdentifier identifier = TableIdentifier.of("default", "customers"); - testTables.createTable( - shell, - identifier.name(), - HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, - PartitionSpec.unpartitioned(), - fileFormat, - ImmutableList.of()); - - // insert some data and check the stats are up-to-date - String insert = - testTables.getInsertQuery( - HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS, identifier, false); - shell.executeStatement(insert); - String stats = - shell - .metastore() - .getTable(identifier) - .getParameters() - .get(StatsSetupConst.COLUMN_STATS_ACCURATE); - assertThat(stats) - .startsWith("{\"BASIC_STATS\":\"true\""); // it's followed by column stats in Hive3 - - // Create a Catalog where the KEEP_HIVE_STATS is false - shell.metastore().hiveConf().set(ConfigProperties.KEEP_HIVE_STATS, StatsSetupConst.FALSE); - TestTables nonHiveTestTables = - HiveIcebergStorageHandlerTestUtils.testTables(shell, testTableType, temp); - Table nonHiveTable = nonHiveTestTables.loadTable(identifier); - - // Append data to the table through a non-Hive engine (in this case, via the java API) -> should - // remove stats - nonHiveTestTables.appendIcebergTable( - shell.getHiveConf(), - nonHiveTable, - fileFormat, - null, - HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS); - stats = - shell - .metastore() - .getTable(identifier) - .getParameters() - .get(StatsSetupConst.COLUMN_STATS_ACCURATE); - assertThat(stats).isNull(); - - // insert some data again using Hive catalog, and check the stats are back - shell.executeStatement(insert); - stats = - shell - .metastore() - .getTable(identifier) - .getParameters() - .get(StatsSetupConst.COLUMN_STATS_ACCURATE); - assertThat(stats) - .startsWith("{\"BASIC_STATS\":\"true\""); // it's followed by column stats in Hive3 - } - - /** - * Tests that vectorized ORC reading code path correctly handles when the same ORC file is split - * into multiple parts. Although the split offsets and length will not always include the file - * tail that contains the metadata, the vectorized reader needs to make sure to handle the tail - * reading regardless of the offsets. If this is not done correctly, the last SELECT query will - * fail. - * - * @throws Exception - any test error - */ - @TestTemplate - public void testVectorizedOrcMultipleSplits() throws Exception { - assumeThat(isVectorized && FileFormat.ORC.equals(fileFormat)).isTrue(); - - // This data will be held by a ~870kB ORC file - List records = - TestHelper.generateRandomRecords( - HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, 20000, 0L); - - // To support splitting the ORC file, we need to specify the stripe size to a small value. It - // looks like the min - // value is about 220kB, no smaller stripes are written by ORC. Anyway, this setting will - // produce 4 stripes. - shell.setHiveSessionValue("orc.stripe.size", "210000"); - - testTables.createTable( - shell, - "targettab", - HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, - fileFormat, - records); - - // Will request 4 splits, separated on the exact stripe boundaries within the ORC file. - // (Would request 5 if ORC split generation wouldn't be split (aka stripe) offset aware). - shell.setHiveSessionValue(InputFormatConfig.SPLIT_SIZE, "210000"); - List result = shell.executeStatement("SELECT * FROM targettab ORDER BY last_name"); - - assertThat(result).hasSize(20000); - } - - @TestTemplate - public void testRemoveAndAddBackColumnFromIcebergTable() throws IOException { - assumeThat(isVectorized && FileFormat.PARQUET.equals(fileFormat)).isTrue(); - // Create an Iceberg table with the columns customer_id, first_name and last_name with some - // initial data. - Table icebergTable = - testTables.createTable( - shell, - "customers", - HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, - fileFormat, - HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS); - - // Remove the first_name column - icebergTable.updateSchema().deleteColumn("first_name").commit(); - // Add a new column with the name first_name - icebergTable - .updateSchema() - .addColumn("first_name", Types.StringType.get(), "This is new first name") - .commit(); - - // Add new data to the table with the new first_name column filled. - icebergTable = testTables.loadTable(TableIdentifier.of("default", "customers")); - Schema customerSchemaWithNewFirstName = - new Schema( - optional(1, "customer_id", Types.LongType.get()), - optional(2, "last_name", Types.StringType.get(), "This is last name"), - optional( - 3, "first_name", Types.StringType.get(), "This is the newly added first name")); - List newCustomersWithNewFirstName = - TestHelper.RecordsBuilder.newInstance(customerSchemaWithNewFirstName) - .add(3L, "Red", "James") - .build(); - testTables.appendIcebergTable( - shell.getHiveConf(), icebergTable, fileFormat, null, newCustomersWithNewFirstName); - - TestHelper.RecordsBuilder customersWithNewFirstNameBuilder = - TestHelper.RecordsBuilder.newInstance(customerSchemaWithNewFirstName) - .add(0L, "Brown", null) - .add(1L, "Green", null) - .add(2L, "Pink", null) - .add(3L, "Red", "James"); - List customersWithNewFirstName = customersWithNewFirstNameBuilder.build(); - - // Run a 'select *' from Hive and check if the first_name column is returned. - // It should be null for the old data and should be filled in the entry added after the column - // addition. - List rows = shell.executeStatement("SELECT * FROM default.customers"); - HiveIcebergTestUtils.validateData( - customersWithNewFirstName, - HiveIcebergTestUtils.valueForRow(customerSchemaWithNewFirstName, rows), - 0); - - Schema customerSchemaWithNewFirstNameOnly = - new Schema( - optional(1, "customer_id", Types.LongType.get()), - optional( - 3, "first_name", Types.StringType.get(), "This is the newly added first name")); - - TestHelper.RecordsBuilder customersWithNewFirstNameOnlyBuilder = - TestHelper.RecordsBuilder.newInstance(customerSchemaWithNewFirstNameOnly) - .add(0L, null) - .add(1L, null) - .add(2L, null) - .add(3L, "James"); - List customersWithNewFirstNameOnly = customersWithNewFirstNameOnlyBuilder.build(); - - // Run a 'select first_name' from Hive to check if the new first-name column can be queried. - rows = shell.executeStatement("SELECT customer_id, first_name FROM default.customers"); - HiveIcebergTestUtils.validateData( - customersWithNewFirstNameOnly, - HiveIcebergTestUtils.valueForRow(customerSchemaWithNewFirstNameOnly, rows), - 0); - } - - @TestTemplate - public void testWriteWithDatePartition() { - assumeThat(executionEngine).as("Tez write is not implemented yet").isEqualTo("mr"); - - Schema dateSchema = - new Schema( - optional(1, "id", Types.LongType.get()), - optional(2, "part_field", Types.DateType.get())); - - PartitionSpec spec = PartitionSpec.builderFor(dateSchema).identity("part_field").build(); - List records = - TestHelper.RecordsBuilder.newInstance(dateSchema) - .add(1L, LocalDate.of(2023, 1, 21)) - .add(2L, LocalDate.of(2023, 1, 22)) - .add(3L, LocalDate.of(2022, 1, 21)) - .build(); - testTables.createTable(shell, "part_test", dateSchema, spec, FileFormat.PARQUET, records); - List result = shell.executeStatement("SELECT * from part_test order by id"); - - assertThat(result).hasSameSizeAs(records); - assertThat(result.get(0)[1]).isEqualTo("2023-01-21"); - assertThat(result.get(1)[1]).isEqualTo("2023-01-22"); - assertThat(result.get(2)[1]).isEqualTo("2022-01-21"); - } - - @TestTemplate - public void testWriteWithTimestampPartition() throws IOException { - assumeThat(executionEngine).as("Tez write is not implemented yet").isEqualTo("mr"); - - Schema dateSchema = - new Schema( - optional(1, "id", Types.LongType.get()), - optional(2, "part_field", Types.TimestampType.withoutZone())); - PartitionSpec spec = PartitionSpec.builderFor(dateSchema).identity("part_field").build(); - List records = - TestHelper.RecordsBuilder.newInstance(dateSchema) - .add(1L, LocalDateTime.of(2023, 1, 21, 21, 10, 10, 100000000)) - .add(2L, LocalDateTime.of(2023, 1, 21, 22, 10, 10, 200000000)) - .add(3L, LocalDateTime.of(2023, 1, 22, 21, 10, 10, 300000000)) - .build(); - testTables.createTable(shell, "part_test", dateSchema, spec, FileFormat.PARQUET, records); - List result = shell.executeStatement("SELECT * from part_test order by id"); - - assertThat(result).hasSameSizeAs(records); - assertThat(result.get(0)[1]).isEqualTo("2023-01-21 21:10:10.1"); - assertThat(result.get(1)[1]).isEqualTo("2023-01-21 22:10:10.2"); - assertThat(result.get(2)[1]).isEqualTo("2023-01-22 21:10:10.3"); - } - - /** - * Checks if the certain type is an unsupported vectorized types in Hive 3.1.2 - * - * @param type - data type - * @return - true if unsupported - */ - private boolean isUnsupportedVectorizedTypeForHive(Type type) { - if (!isVectorized) { - return false; - } - switch (fileFormat) { - case PARQUET: - return Types.DecimalType.of(3, 1).equals(type) - || type == Types.TimestampType.withoutZone() - || type == Types.TimeType.get(); - case ORC: - return type == Types.TimestampType.withZone() || type == Types.TimeType.get(); - default: - return false; - } - } - - private void testComplexTypeWrite(Schema schema, List records) throws IOException { - String tableName = "complex_table"; - Table table = - testTables.createTable(shell, "complex_table", schema, fileFormat, ImmutableList.of()); - - String dummyTableName = "dummy"; - shell.executeStatement("CREATE TABLE default." + dummyTableName + "(a int)"); - shell.executeStatement("INSERT INTO TABLE default." + dummyTableName + " VALUES(1)"); - records.forEach( - r -> - shell.executeStatement( - insertQueryForComplexType(tableName, dummyTableName, schema, r))); - HiveIcebergTestUtils.validateData(table, records, 0); - } - - private String insertQueryForComplexType( - String tableName, String dummyTableName, Schema schema, Record record) { - StringBuilder query = - new StringBuilder("INSERT INTO TABLE ") - .append(tableName) - .append(" SELECT ") - .append(record.get(0)) - .append(", "); - Type type = schema.asStruct().fields().get(1).type(); - query.append(buildComplexTypeInnerQuery(record.get(1), type)); - query.setLength(query.length() - 1); - query.append(" FROM ").append(dummyTableName).append(" LIMIT 1"); - return query.toString(); - } - - private StringBuilder buildComplexTypeInnerQuery(Object field, Type type) { - StringBuilder query = new StringBuilder(); - if (type instanceof Types.ListType) { - query.append("array("); - List elements = (List) field; - assertThat(elements).as("Hive can not handle empty array() inserts").isNotEmpty(); - Type innerType = ((Types.ListType) type).fields().get(0).type(); - if (!elements.isEmpty()) { - elements.forEach(e -> query.append(buildComplexTypeInnerQuery(e, innerType))); - query.setLength(query.length() - 1); - } - query.append("),"); - } else if (type instanceof Types.MapType) { - query.append("map("); - Map entries = (Map) field; - Type keyType = ((Types.MapType) type).fields().get(0).type(); - Type valueType = ((Types.MapType) type).fields().get(1).type(); - if (!entries.isEmpty()) { - entries - .entrySet() - .forEach( - e -> - query.append( - buildComplexTypeInnerQuery(e.getKey(), keyType) - .append(buildComplexTypeInnerQuery(e.getValue(), valueType)))); - query.setLength(query.length() - 1); - } - query.append("),"); - } else if (type instanceof Types.StructType) { - query.append("named_struct("); - ((GenericRecord) field) - .struct() - .fields() - .forEach( - f -> - query - .append(buildComplexTypeInnerQuery(f.name(), Types.StringType.get())) - .append( - buildComplexTypeInnerQuery( - ((GenericRecord) field).getField(f.name()), f.type()))); - query.setLength(query.length() - 1); - query.append("),"); - } else if (type instanceof Types.StringType) { - if (field != null) { - query.append("'").append(field).append("',"); - } - } else { - throw new RuntimeException("Unsupported type in complex query build."); - } - return query; - } -} diff --git a/mr/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergStorageHandlerWithMultipleCatalogs.java b/mr/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergStorageHandlerWithMultipleCatalogs.java deleted file mode 100644 index c2cf8f675007..000000000000 --- a/mr/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergStorageHandlerWithMultipleCatalogs.java +++ /dev/null @@ -1,189 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.mr.hive; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.io.IOException; -import java.nio.file.Path; -import java.util.Collection; -import java.util.List; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.Parameter; -import org.apache.iceberg.ParameterizedTestExtension; -import org.apache.iceberg.Parameters; -import org.apache.iceberg.Table; -import org.apache.iceberg.catalog.TableIdentifier; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.mr.InputFormatConfig; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.junit.jupiter.api.AfterAll; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeAll; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.TestTemplate; -import org.junit.jupiter.api.extension.ExtendWith; -import org.junit.jupiter.api.io.TempDir; - -@ExtendWith(ParameterizedTestExtension.class) -public class TestHiveIcebergStorageHandlerWithMultipleCatalogs { - - private static final String[] EXECUTION_ENGINES = new String[] {"tez", "mr"}; - private static final String HIVECATALOGNAME = "table1_catalog"; - private static final String OTHERCATALOGNAME = "table2_catalog"; - private static TestHiveShell shell; - - @Parameter(index = 0) - private FileFormat fileFormat1; - - @Parameter(index = 1) - private FileFormat fileFormat2; - - @Parameter(index = 2) - private String executionEngine; - - @Parameter(index = 3) - private TestTables.TestTableType testTableType1; - - @Parameter(index = 4) - private String table1CatalogName; - - @Parameter(index = 5) - private TestTables.TestTableType testTableType2; - - @Parameter(index = 6) - private String table2CatalogName; - - @TempDir private Path temp; - private TestTables testTables1; - private TestTables testTables2; - - @Parameters( - name = - "fileFormat1={0}, fileFormat2={1}, engine={2}, tableType1={3}, catalogName1={4}, " - + "tableType2={5}, catalogName2={6}") - public static Collection parameters() { - Collection testParams = Lists.newArrayList(); - String javaVersion = System.getProperty("java.specification.version"); - - // Run tests with PARQUET and ORC file formats for a two Catalogs - for (String engine : EXECUTION_ENGINES) { - // include Tez tests only for Java 8 - if (javaVersion.equals("1.8") || "mr".equals(engine)) { - for (TestTables.TestTableType testTableType : TestTables.ALL_TABLE_TYPES) { - if (!TestTables.TestTableType.HIVE_CATALOG.equals(testTableType)) { - testParams.add( - new Object[] { - FileFormat.PARQUET, - FileFormat.ORC, - engine, - TestTables.TestTableType.HIVE_CATALOG, - HIVECATALOGNAME, - testTableType, - OTHERCATALOGNAME - }); - } - } - } - } - return testParams; - } - - @BeforeAll - public static void beforeClass() { - shell = HiveIcebergStorageHandlerTestUtils.shell(); - } - - @AfterAll - public static void afterClass() throws Exception { - shell.stop(); - } - - @BeforeEach - public void before() throws IOException { - testTables1 = - HiveIcebergStorageHandlerTestUtils.testTables( - shell, testTableType1, temp, table1CatalogName); - HiveIcebergStorageHandlerTestUtils.init(shell, testTables1, temp, executionEngine); - testTables1 - .properties() - .entrySet() - .forEach(e -> shell.setHiveSessionValue(e.getKey(), e.getValue())); - - testTables2 = - HiveIcebergStorageHandlerTestUtils.testTables( - shell, testTableType2, temp, table2CatalogName); - testTables2 - .properties() - .entrySet() - .forEach(e -> shell.setHiveSessionValue(e.getKey(), e.getValue())); - } - - @AfterEach - public void after() throws Exception { - HiveIcebergStorageHandlerTestUtils.close(shell); - } - - @TestTemplate - public void testJoinTablesFromDifferentCatalogs() throws IOException { - createAndAddRecords( - testTables1, - fileFormat1, - TableIdentifier.of("default", "customers1"), - HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS); - createAndAddRecords( - testTables2, - fileFormat2, - TableIdentifier.of("default", "customers2"), - HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS); - - List rows = - shell.executeStatement( - "SELECT c2.customer_id, c2.first_name, c2.last_name " - + "FROM default.customers2 c2 JOIN default.customers1 c1 ON c2.customer_id = c1.customer_id " - + "ORDER BY c2.customer_id"); - assertThat(rows).hasSameSizeAs(HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS); - HiveIcebergTestUtils.validateData( - Lists.newArrayList(HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS), - HiveIcebergTestUtils.valueForRow(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, rows), - 0); - } - - private void createAndAddRecords( - TestTables testTables, - FileFormat fileFormat, - TableIdentifier identifier, - List records) - throws IOException { - String createSql = - "CREATE EXTERNAL TABLE " - + identifier - + " (customer_id BIGINT, first_name STRING, last_name STRING)" - + " STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler' " - + testTables.locationForCreateTableSQL(identifier) - + " TBLPROPERTIES ('" - + InputFormatConfig.CATALOG_NAME - + "'='" - + testTables.catalogName() - + "')"; - shell.executeStatement(createSql); - Table icebergTable = testTables.loadTable(identifier); - testTables.appendIcebergTable(shell.getHiveConf(), icebergTable, fileFormat, null, records); - } -} diff --git a/mr/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergWithHiveAutogatherEnable.java b/mr/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergWithHiveAutogatherEnable.java deleted file mode 100644 index 6b3bddd637c2..000000000000 --- a/mr/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergWithHiveAutogatherEnable.java +++ /dev/null @@ -1,185 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.mr.hive; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.io.IOException; -import java.nio.file.Path; -import java.util.Collection; -import org.apache.hadoop.hive.common.StatsSetupConst; -import org.apache.hadoop.hive.conf.HiveConf; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.Parameter; -import org.apache.iceberg.ParameterizedTestExtension; -import org.apache.iceberg.Parameters; -import org.apache.iceberg.catalog.TableIdentifier; -import org.apache.iceberg.hadoop.ConfigProperties; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.junit.jupiter.api.AfterAll; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeAll; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.TestTemplate; -import org.junit.jupiter.api.extension.ExtendWith; -import org.junit.jupiter.api.io.TempDir; - -@ExtendWith(ParameterizedTestExtension.class) -public class TestHiveIcebergWithHiveAutogatherEnable { - - @Parameters(name = "fileFormat={0}, catalog={1}") - public static Collection parameters() { - Collection testParams = Lists.newArrayList(); - // Run tests with every FileFormat for a single Catalog (HiveCatalog) - for (FileFormat fileFormat : HiveIcebergStorageHandlerTestUtils.FILE_FORMATS) { - testParams.add(new Object[] {fileFormat, TestTables.TestTableType.HIVE_CATALOG}); - } - return testParams; - } - - private static TestHiveShell shell; - - private TestTables testTables; - - @Parameter(index = 0) - private FileFormat fileFormat; - - @Parameter(index = 1) - private TestTables.TestTableType testTableType; - - @TempDir private Path temp; - - @BeforeAll - public static void beforeClass() { - // The hive configuration HIVESTATSAUTOGATHER must be set to true from hive engine - shell = - HiveIcebergStorageHandlerTestUtils.shell( - ImmutableMap.of(HiveConf.ConfVars.HIVESTATSAUTOGATHER.varname, "true")); - } - - @AfterAll - public static void afterClass() throws Exception { - shell.stop(); - } - - @BeforeEach - public void before() throws IOException { - testTables = HiveIcebergStorageHandlerTestUtils.testTables(shell, testTableType, temp); - HiveIcebergStorageHandlerTestUtils.init(shell, testTables, temp, "mr"); - } - - @AfterEach - public void after() throws Exception { - HiveIcebergStorageHandlerTestUtils.close(shell); - } - - @TestTemplate - public void testHiveStatsAutogatherWhenCreateNewTable() throws Exception { - // Create a Catalog where the KEEP_HIVE_STATS is false - shell.metastore().hiveConf().set(ConfigProperties.KEEP_HIVE_STATS, StatsSetupConst.FALSE); - TestTables hiveStatsDisabledTestTables = - HiveIcebergStorageHandlerTestUtils.testTables(shell, testTableType, temp); - - TableIdentifier identifierWithoutStats = - TableIdentifier.of("default", "customers_without_stats"); - - // To validate the stats augother is disabled from Hive engine, the creation of iceberg table - // cannot have any records. Otherwise, the table parameters TOTAL_SIZE and NUM_FILES are - // added by Iceberg when inserting records. - hiveStatsDisabledTestTables.createTable( - shell, - identifierWithoutStats.name(), - HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, - fileFormat, - ImmutableList.of()); - - // The table parameter TOTAL_SIZE is removed from hive engine - String totalSize = - shell - .metastore() - .getTable(identifierWithoutStats) - .getParameters() - .get(StatsSetupConst.TOTAL_SIZE); - assertThat(totalSize).isNull(); - - // The table parameter NUM_FILES is removed from hive engine - String numFiles = - shell - .metastore() - .getTable(identifierWithoutStats) - .getParameters() - .get(StatsSetupConst.NUM_FILES); - assertThat(numFiles).isNull(); - - // The table parameter DO_NOT_UPDATE_STATS is removed from hive engine - String stats = - shell - .metastore() - .getTable(identifierWithoutStats) - .getParameters() - .get(StatsSetupConst.DO_NOT_UPDATE_STATS); - assertThat(stats).isNull(); - - // Create a Catalog where the KEEP_HIVE_STATS is true - shell.metastore().hiveConf().set(ConfigProperties.KEEP_HIVE_STATS, StatsSetupConst.TRUE); - TestTables keepHiveStatsTestTables = - HiveIcebergStorageHandlerTestUtils.testTables(shell, testTableType, temp); - - TableIdentifier identifierWithStats = TableIdentifier.of("default", "customers_with_stats"); - - // To validate the stats augother is enabled from Hive engine, the creation of iceberg table - // cannot have any records. Otherwise, the table parameters TOTAL_SIZE and NUM_FILES are - // added by Iceberg when inserting records. - keepHiveStatsTestTables.createTable( - shell, - identifierWithStats.name(), - HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, - fileFormat, - ImmutableList.of()); - - // The table parameter DO_NOT_UPDATE_STATS doesn't exist - stats = - shell - .metastore() - .getTable(identifierWithStats) - .getParameters() - .get(StatsSetupConst.DO_NOT_UPDATE_STATS); - assertThat(stats).isNull(); - - // The table parameter NUM_FILES is gathered from hive engine - numFiles = - shell - .metastore() - .getTable(identifierWithStats) - .getParameters() - .get(StatsSetupConst.NUM_FILES); - assertThat(numFiles).isEqualTo("1"); - - // The table parameter TOTAL_SIZE is gathered from hive engine - numFiles = - shell - .metastore() - .getTable(identifierWithStats) - .getParameters() - .get(StatsSetupConst.TOTAL_SIZE); - assertThat(numFiles).isNotNull(); - } -} diff --git a/mr/src/test/java/org/apache/iceberg/mr/hive/TestHiveShell.java b/mr/src/test/java/org/apache/iceberg/mr/hive/TestHiveShell.java deleted file mode 100644 index 7396350613da..000000000000 --- a/mr/src/test/java/org/apache/iceberg/mr/hive/TestHiveShell.java +++ /dev/null @@ -1,210 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.mr.hive; - -import java.util.Collections; -import java.util.List; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.hive.conf.HiveConf; -import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem; -import org.apache.hive.service.cli.CLIService; -import org.apache.hive.service.cli.HiveSQLException; -import org.apache.hive.service.cli.OperationHandle; -import org.apache.hive.service.cli.RowSet; -import org.apache.hive.service.cli.SessionHandle; -import org.apache.hive.service.cli.session.HiveSession; -import org.apache.hive.service.server.HiveServer2; -import org.apache.iceberg.hive.TestHiveMetastore; -import org.apache.iceberg.mr.InputFormatConfig; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; - -/** - * Test class for running HiveQL queries, essentially acting like a Beeline shell in tests. - * - *

It spins up both an HS2 and a Metastore instance to work with. The shell will only accept - * queries if it has been previously initialized via {@link #start()}, and a session has been opened - * via {@link #openSession()}. Prior to calling {@link #start()}, the shell should first be - * configured with props that apply across all test cases by calling {@link - * #setHiveConfValue(String, String)} ()}. On the other hand, session-level conf can be applied - * anytime via {@link #setHiveSessionValue(String, String)} ()}, once we've opened an active - * session. - */ -public class TestHiveShell { - - private final TestHiveMetastore metastore; - private final HiveServer2 hs2; - private final HiveConf hs2Conf; - private CLIService client; - private HiveSession session; - private boolean started; - - public TestHiveShell() { - metastore = new TestHiveMetastore(); - hs2Conf = initializeConf(); - hs2 = new HiveServer2(); - } - - public void setHiveConfValue(String key, String value) { - Preconditions.checkState( - !started, "TestHiveShell has already been started. Cannot set Hive conf anymore."); - hs2Conf.verifyAndSet(key, value); - } - - public void setHiveSessionValue(String key, String value) { - Preconditions.checkState(session != null, "There is no open session for setting variables."); - try { - session.getSessionConf().set(key, value); - } catch (Exception e) { - throw new RuntimeException("Unable to set Hive session variable: ", e); - } - } - - public void setHiveSessionValue(String key, boolean value) { - setHiveSessionValue(key, Boolean.toString(value)); - } - - public void start() { - // Create a copy of the HiveConf for the metastore - metastore.start(new HiveConf(hs2Conf), 10); - hs2Conf.setVar( - HiveConf.ConfVars.METASTOREURIS, - metastore.hiveConf().getVar(HiveConf.ConfVars.METASTOREURIS)); - hs2Conf.setVar( - HiveConf.ConfVars.METASTOREWAREHOUSE, - metastore.hiveConf().getVar(HiveConf.ConfVars.METASTOREWAREHOUSE)); - - // Initializing RpcMetrics in a single JVM multiple times can cause issues - DefaultMetricsSystem.setMiniClusterMode(true); - - hs2.init(hs2Conf); - hs2.start(); - client = - hs2.getServices().stream() - .filter(CLIService.class::isInstance) - .findFirst() - .map(CLIService.class::cast) - .get(); - started = true; - } - - public void stop() throws Exception { - if (client != null) { - client.stop(); - } - hs2.stop(); - metastore.stop(); - started = false; - } - - public TestHiveMetastore metastore() { - return metastore; - } - - public void openSession() { - Preconditions.checkState( - started, "You have to start TestHiveShell first, before opening a session."); - try { - SessionHandle sessionHandle = - client - .getSessionManager() - .openSession(CLIService.SERVER_VERSION, "", "", "127.0.0.1", Collections.emptyMap()); - session = client.getSessionManager().getSession(sessionHandle); - } catch (Exception e) { - throw new RuntimeException("Unable to open new Hive session: ", e); - } - } - - public void closeSession() { - Preconditions.checkState(session != null, "There is no open session to be closed."); - try { - session.close(); - session = null; - } catch (Exception e) { - throw new RuntimeException("Unable to close Hive session: ", e); - } - } - - public List executeStatement(String statement) { - Preconditions.checkState( - session != null, - "You have to start TestHiveShell and open a session first, before running a query."); - try { - OperationHandle handle = - client.executeStatement(session.getSessionHandle(), statement, Collections.emptyMap()); - List resultSet = Lists.newArrayList(); - if (handle.hasResultSet()) { - RowSet rowSet; - // keep fetching results until we can - while ((rowSet = client.fetchResults(handle)) != null && rowSet.numRows() > 0) { - for (Object[] row : rowSet) { - resultSet.add(row.clone()); - } - } - } - return resultSet; - } catch (HiveSQLException e) { - throw new IllegalArgumentException( - "Failed to execute Hive query '" + statement + "': " + e.getMessage(), e); - } - } - - public Configuration getHiveConf() { - if (session != null) { - return session.getHiveConf(); - } else { - return hs2Conf; - } - } - - private HiveConf initializeConf() { - HiveConf hiveConf = new HiveConf(); - - // Use ephemeral port to enable running tests in parallel - hiveConf.setIntVar(HiveConf.ConfVars.HIVE_SERVER2_THRIFT_PORT, 0); - // Disable the web UI - hiveConf.setIntVar(HiveConf.ConfVars.HIVE_SERVER2_WEBUI_PORT, -1); - - // Switch off optimizers in order to contain the map reduction within this JVM - hiveConf.setBoolVar(HiveConf.ConfVars.HIVE_CBO_ENABLED, false); - hiveConf.setBoolVar(HiveConf.ConfVars.HIVE_INFER_BUCKET_SORT, false); - hiveConf.setBoolVar(HiveConf.ConfVars.HIVEMETADATAONLYQUERIES, false); - hiveConf.setBoolVar(HiveConf.ConfVars.HIVEOPTINDEXFILTER, false); - hiveConf.setBoolVar(HiveConf.ConfVars.HIVECONVERTJOIN, false); - hiveConf.setBoolVar(HiveConf.ConfVars.HIVESKEWJOIN, false); - - // Speed up test execution - hiveConf.setLongVar(HiveConf.ConfVars.HIVECOUNTERSPULLINTERVAL, 1L); - hiveConf.setBoolVar(HiveConf.ConfVars.HIVESTATSAUTOGATHER, false); - - // Resource configuration - hiveConf.setInt("mapreduce.map.memory.mb", 1024); - - // Tez configuration - hiveConf.setBoolean("tez.local.mode", true); - - // Disable vectorization for HiveIcebergInputFormat - hiveConf.setBoolVar(HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED, false); - - // do not serialize the FileIO config - hiveConf.set(InputFormatConfig.CONFIG_SERIALIZATION_DISABLED, "true"); - - return hiveConf; - } -} diff --git a/mr/src/test/java/org/apache/iceberg/mr/hive/TestTables.java b/mr/src/test/java/org/apache/iceberg/mr/hive/TestTables.java deleted file mode 100644 index f2710290d5c2..000000000000 --- a/mr/src/test/java/org/apache/iceberg/mr/hive/TestTables.java +++ /dev/null @@ -1,598 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.mr.hive; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.io.File; -import java.io.IOException; -import java.nio.file.Path; -import java.nio.file.Paths; -import java.sql.Timestamp; -import java.time.LocalDateTime; -import java.time.OffsetDateTime; -import java.util.Collections; -import java.util.List; -import java.util.Map; -import java.util.stream.Collectors; -import org.apache.hadoop.conf.Configuration; -import org.apache.iceberg.CatalogProperties; -import org.apache.iceberg.CatalogUtil; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.PartitionSpecParser; -import org.apache.iceberg.Schema; -import org.apache.iceberg.SchemaParser; -import org.apache.iceberg.SortOrder; -import org.apache.iceberg.StructLike; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.Tables; -import org.apache.iceberg.catalog.Catalog; -import org.apache.iceberg.catalog.TableIdentifier; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.expressions.Expressions; -import org.apache.iceberg.hadoop.HadoopCatalog; -import org.apache.iceberg.hadoop.HadoopTables; -import org.apache.iceberg.hive.HiveCatalog; -import org.apache.iceberg.hive.HiveVersion; -import org.apache.iceberg.mr.Catalogs; -import org.apache.iceberg.mr.InputFormatConfig; -import org.apache.iceberg.mr.TestCatalogs; -import org.apache.iceberg.mr.TestHelper; -import org.apache.iceberg.relocated.com.google.common.base.Joiner; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.types.Type; -import org.apache.iceberg.types.Types; - -// Helper class for setting up and testing various catalog implementations -abstract class TestTables { - public static final TestTableType[] ALL_TABLE_TYPES = - new TestTableType[] { - TestTableType.HADOOP_TABLE, - TestTableType.HADOOP_CATALOG, - TestTableType.CUSTOM_CATALOG, - TestTableType.HIVE_CATALOG - }; - - private final Tables tables; - protected final Path temp; - protected final String catalog; - - protected TestTables(Tables tables, Path temp, String catalogName) { - this.tables = tables; - this.temp = temp; - this.catalog = catalogName; - } - - protected TestTables(Catalog catalog, Path temp, String catalogName) { - this(new CatalogToTables(catalog), temp, catalogName); - } - - public Map properties() { - return Collections.emptyMap(); - } - - // For HadoopTables this method will return a temporary location - public String identifier(String tableIdentifier) { - return tableIdentifier; - } - - public Tables tables() { - return tables; - } - - public String catalogName() { - return catalog; - } - - /** - * The location string needed to be provided for CREATE TABLE ... commands, like "LOCATION - * 'file:///tmp/warehouse/default/tablename'. Empty ("") if LOCATION is not needed. - * - * @param identifier The table identifier - * @return The location string for create table operation - */ - public abstract String locationForCreateTableSQL(TableIdentifier identifier); - - /** - * The table properties string needed for the CREATE TABLE ... commands, like {@code - * TBLPROPERTIES('iceberg.catalog'='mycatalog')} - * - * @return the tables properties string, such as {@code - * TBLPROPERTIES('iceberg.catalog'='mycatalog')} - */ - public String propertiesForCreateTableSQL(Map tableProperties) { - Map properties = Maps.newHashMap(tableProperties); - properties.putIfAbsent(InputFormatConfig.CATALOG_NAME, catalog); - String props = - properties.entrySet().stream() - .map(entry -> String.format("'%s'='%s'", entry.getKey(), entry.getValue())) - .collect(Collectors.joining(",")); - return " TBLPROPERTIES (" + props + ")"; - } - - /** - * If an independent Hive table creation is needed for the given Catalog then this should return - * the Hive SQL string which we have to execute. Overridden for HiveCatalog where the Hive table - * is immediately created during the Iceberg table creation so no extra sql execution is required. - * - * @param identifier The table identifier (the namespace should be non-empty and single level) - * @param tableProps Optional map of table properties - * @return The SQL string - which should be executed, null - if it is not needed. - */ - public String createHiveTableSQL(TableIdentifier identifier, Map tableProps) { - Preconditions.checkArgument(!identifier.namespace().isEmpty(), "Namespace should not be empty"); - Preconditions.checkArgument( - identifier.namespace().levels().length == 1, "Namespace should be single level"); - return String.format( - "CREATE TABLE %s.%s STORED BY '%s' %s %s", - identifier.namespace(), - identifier.name(), - HiveIcebergStorageHandler.class.getName(), - locationForCreateTableSQL(identifier), - propertiesForCreateTableSQL(tableProps)); - } - - /** - * Loads the given table from the actual catalog. Overridden by HadoopTables, since the parameter - * of the {@link Tables#load(String)} should be the full path of the table metadata directory - * - * @param identifier The table we want to load - * @return The Table loaded from the Catalog - */ - public Table loadTable(TableIdentifier identifier) { - return tables.load(identifier.toString()); - } - - /** - * Creates an non partitioned Hive test table. Creates the Iceberg table/data and creates the - * corresponding Hive table as well when needed. The table will be in the 'default' database. The - * table will be populated with the provided List of {@link Record}s. - * - * @param shell The HiveShell used for Hive table creation - * @param tableName The name of the test table - * @param schema The schema used for the table creation - * @param fileFormat The file format used for writing the data - * @param records The records with which the table is populated - * @return The created table - * @throws IOException If there is an error writing data - */ - public Table createTable( - TestHiveShell shell, - String tableName, - Schema schema, - FileFormat fileFormat, - List records) - throws IOException { - Table table = createIcebergTable(shell.getHiveConf(), tableName, schema, fileFormat, records); - String createHiveSQL = - createHiveTableSQL(TableIdentifier.of("default", tableName), ImmutableMap.of()); - if (createHiveSQL != null) { - shell.executeStatement(createHiveSQL); - } - - return table; - } - - /** - * Creates a partitioned Hive test table using Hive SQL. The table will be in the 'default' - * database. The table will be populated with the provided List of {@link Record}s using a Hive - * insert statement. - * - * @param shell The HiveShell used for Hive table creation - * @param tableName The name of the test table - * @param schema The schema used for the table creation - * @param spec The partition specification for the table - * @param fileFormat The file format used for writing the data - * @param records The records with which the table is populated - * @return The created table - * @throws IOException If there is an error writing data - */ - public Table createTable( - TestHiveShell shell, - String tableName, - Schema schema, - PartitionSpec spec, - FileFormat fileFormat, - List records) { - TableIdentifier identifier = TableIdentifier.of("default", tableName); - shell.executeStatement( - "CREATE EXTERNAL TABLE " - + identifier - + " STORED BY '" - + HiveIcebergStorageHandler.class.getName() - + "' " - + locationForCreateTableSQL(identifier) - + "TBLPROPERTIES ('" - + InputFormatConfig.TABLE_SCHEMA - + "'='" - + SchemaParser.toJson(schema) - + "', " - + "'" - + InputFormatConfig.PARTITION_SPEC - + "'='" - + PartitionSpecParser.toJson(spec) - + "', " - + "'" - + TableProperties.DEFAULT_FILE_FORMAT - + "'='" - + fileFormat - + "', " - + "'" - + InputFormatConfig.CATALOG_NAME - + "'='" - + catalogName() - + "')"); - - if (records != null && !records.isEmpty()) { - StringBuilder query = new StringBuilder().append("INSERT INTO " + identifier + " VALUES "); - - records.forEach( - record -> { - query.append("("); - query.append( - record.struct().fields().stream() - .map( - field -> - getStringValueForInsert(record.getField(field.name()), field.type())) - .collect(Collectors.joining(","))); - query.append("),"); - }); - query.setLength(query.length() - 1); - - shell.executeStatement(query.toString()); - } - - return loadTable(identifier); - } - - public String getInsertQuery( - List records, TableIdentifier identifier, boolean isOverwrite) { - StringBuilder query = - new StringBuilder( - String.format( - "INSERT %s %s VALUES ", isOverwrite ? "OVERWRITE TABLE" : "INTO", identifier)); - - records.forEach( - record -> { - query.append("("); - query.append( - record.struct().fields().stream() - .map( - field -> getStringValueForInsert(record.getField(field.name()), field.type())) - .collect(Collectors.joining(","))); - query.append("),"); - }); - query.setLength(query.length() - 1); - return query.toString(); - } - - /** - * Creates a Hive test table. Creates the Iceberg table/data and creates the corresponding Hive - * table as well when needed. The table will be in the 'default' database. The table will be - * populated with the provided with randomly generated {@link Record}s. - * - * @param shell The HiveShell used for Hive table creation - * @param tableName The name of the test table - * @param schema The schema used for the table creation - * @param fileFormat The file format used for writing the data - * @param numRecords The number of records should be generated and stored in the table - * @throws IOException If there is an error writing data - */ - public List createTableWithGeneratedRecords( - TestHiveShell shell, String tableName, Schema schema, FileFormat fileFormat, int numRecords) - throws IOException { - List records = TestHelper.generateRandomRecords(schema, numRecords, 0L); - createTable(shell, tableName, schema, fileFormat, records); - return records; - } - - /** - * Creates an Iceberg table/data without creating the corresponding Hive table. The table will be - * in the 'default' namespace. - * - * @param configuration The configuration used during the table creation - * @param tableName The name of the test table - * @param schema The schema used for the table creation - * @param fileFormat The file format used for writing the data - * @param records The records with which the table is populated - * @return The create table - * @throws IOException If there is an error writing data - */ - public Table createIcebergTable( - Configuration configuration, - String tableName, - Schema schema, - FileFormat fileFormat, - List records) - throws IOException { - String identifier = identifier("default." + tableName); - TestHelper helper = - new TestHelper( - new Configuration(configuration), - tables(), - identifier, - schema, - PartitionSpec.unpartitioned(), - fileFormat, - temp); - Table table = helper.createTable(); - - if (records != null && !records.isEmpty()) { - helper.appendToTable(helper.writeFile(null, records)); - } - - return table; - } - - /** - * Append more data to the table. - * - * @param configuration The configuration used during the table creation - * @param table The table to append - * @param format The file format used for writing the data - * @param partition The partition to write to - * @param records The records with which should be added to the table - * @throws IOException If there is an error writing data - */ - public void appendIcebergTable( - Configuration configuration, - Table table, - FileFormat format, - StructLike partition, - List records) - throws IOException { - TestHelper helper = new TestHelper(configuration, null, null, null, null, format, temp); - - helper.setTable(table); - if (!records.isEmpty()) { - helper.appendToTable(helper.writeFile(partition, records)); - } - } - - /** - * Truncates an Iceberg table. - * - * @param table The iceberg table to truncate - */ - public void truncateIcebergTable(Table table) { - table.newDelete().deleteFromRowFilter(Expressions.alwaysTrue()).commit(); - } - - private static class CatalogToTables implements Tables { - - private final Catalog catalog; - - private CatalogToTables(Catalog catalog) { - this.catalog = catalog; - } - - @Override - public Table create( - Schema schema, - PartitionSpec spec, - SortOrder sortOrder, - Map properties, - String tableIdentifier) { - TableIdentifier tableIdent = TableIdentifier.parse(tableIdentifier); - return catalog - .buildTable(tableIdent, schema) - .withPartitionSpec(spec) - .withSortOrder(sortOrder) - .withProperties(properties) - .create(); - } - - @Override - public Table load(String tableIdentifier) { - return catalog.loadTable(TableIdentifier.parse(tableIdentifier)); - } - - @Override - public boolean exists(String tableIdentifier) { - return catalog.tableExists(TableIdentifier.parse(tableIdentifier)); - } - } - - static class CustomCatalogTestTables extends TestTables { - - private final String warehouseLocation; - - CustomCatalogTestTables(Configuration conf, Path temp, String catalogName) throws IOException { - this( - conf, - temp, - (HiveVersion.min(HiveVersion.HIVE_3) ? "file:" : "") - + temp.resolve(Paths.get("custom", "warehouse")), - catalogName); - } - - CustomCatalogTestTables( - Configuration conf, Path temp, String warehouseLocation, String catalogName) { - super(new TestCatalogs.CustomHadoopCatalog(conf, warehouseLocation), temp, catalogName); - this.warehouseLocation = warehouseLocation; - } - - @Override - public Map properties() { - return ImmutableMap.of( - InputFormatConfig.catalogPropertyConfigKey(catalog, CatalogProperties.CATALOG_IMPL), - TestCatalogs.CustomHadoopCatalog.class.getName(), - InputFormatConfig.catalogPropertyConfigKey(catalog, CatalogProperties.WAREHOUSE_LOCATION), - warehouseLocation); - } - - @Override - public String locationForCreateTableSQL(TableIdentifier identifier) { - return "LOCATION '" + warehouseLocation + TestTables.tablePath(identifier) + "' "; - } - } - - static class HadoopCatalogTestTables extends TestTables { - - private final String warehouseLocation; - - HadoopCatalogTestTables(Configuration conf, Path temp, String catalogName) throws IOException { - this( - conf, - temp, - (HiveVersion.min(HiveVersion.HIVE_3) ? "file:" : "") - + temp.resolve(Paths.get("hadoop", "warehouse")), - catalogName); - } - - HadoopCatalogTestTables( - Configuration conf, Path temp, String warehouseLocation, String catalogName) { - super(new HadoopCatalog(conf, warehouseLocation), temp, catalogName); - this.warehouseLocation = warehouseLocation; - } - - @Override - public Map properties() { - return ImmutableMap.of( - InputFormatConfig.catalogPropertyConfigKey(catalog, CatalogUtil.ICEBERG_CATALOG_TYPE), - CatalogUtil.ICEBERG_CATALOG_TYPE_HADOOP, - InputFormatConfig.catalogPropertyConfigKey(catalog, CatalogProperties.WAREHOUSE_LOCATION), - warehouseLocation); - } - - @Override - public String locationForCreateTableSQL(TableIdentifier identifier) { - return "LOCATION '" + warehouseLocation + TestTables.tablePath(identifier) + "' "; - } - } - - static class HadoopTestTables extends TestTables { - - HadoopTestTables(Configuration conf, Path temp) { - super(new HadoopTables(conf), temp, Catalogs.ICEBERG_HADOOP_TABLE_NAME); - } - - @Override - public String identifier(String tableIdentifier) { - final File location; - - TableIdentifier identifier = TableIdentifier.parse(tableIdentifier); - location = - temp.resolve( - Joiner.on(File.separator).join(identifier.namespace().levels()) - + File.separator - + identifier.name()) - .toFile(); - - assertThat(location).doesNotExist(); - return location.toString(); - } - - @Override - public String locationForCreateTableSQL(TableIdentifier identifier) { - return "LOCATION '" + temp + tablePath(identifier) + "' "; - } - - @Override - public Table loadTable(TableIdentifier identifier) { - return tables().load(temp + TestTables.tablePath(identifier)); - } - } - - static class HiveTestTables extends TestTables { - - HiveTestTables(Configuration conf, Path temp, String catalogName) { - super( - CatalogUtil.loadCatalog( - HiveCatalog.class.getName(), - CatalogUtil.ICEBERG_CATALOG_TYPE_HIVE, - ImmutableMap.of(), - conf), - temp, - catalogName); - } - - @Override - public Map properties() { - return ImmutableMap.of( - InputFormatConfig.catalogPropertyConfigKey(catalog, CatalogUtil.ICEBERG_CATALOG_TYPE), - CatalogUtil.ICEBERG_CATALOG_TYPE_HIVE); - } - - @Override - public String locationForCreateTableSQL(TableIdentifier identifier) { - return ""; - } - - @Override - public String createHiveTableSQL(TableIdentifier identifier, Map tblProps) { - return null; - } - } - - private static String tablePath(TableIdentifier identifier) { - return "/" + Joiner.on("/").join(identifier.namespace().levels()) + "/" + identifier.name(); - } - - private String getStringValueForInsert(Object value, Type type) { - String template = "\'%s\'"; - if (type.equals(Types.TimestampType.withoutZone())) { - return String.format(template, Timestamp.valueOf((LocalDateTime) value)); - } else if (type.equals(Types.TimestampType.withZone())) { - return String.format(template, Timestamp.from(((OffsetDateTime) value).toInstant())); - } else if (type.equals(Types.BooleanType.get())) { - // in hive2 boolean type values must not be surrounded in apostrophes. Otherwise the value is - // translated to true. - return value.toString(); - } else { - return String.format(template, value.toString()); - } - } - - enum TestTableType { - HADOOP_TABLE { - @Override - public TestTables instance(Configuration conf, Path temporaryFolder, String catalogName) { - return new HadoopTestTables(conf, temporaryFolder); - } - }, - HADOOP_CATALOG { - @Override - public TestTables instance(Configuration conf, Path temporaryFolder, String catalogName) - throws IOException { - return new HadoopCatalogTestTables(conf, temporaryFolder, catalogName); - } - }, - CUSTOM_CATALOG { - @Override - public TestTables instance(Configuration conf, Path temporaryFolder, String catalogName) - throws IOException { - return new CustomCatalogTestTables(conf, temporaryFolder, catalogName); - } - }, - HIVE_CATALOG { - @Override - public TestTables instance(Configuration conf, Path temporaryFolder, String catalogName) { - return new HiveTestTables(conf, temporaryFolder, catalogName); - } - }; - - public abstract TestTables instance( - Configuration conf, Path temporaryFolder, String catalogName) throws IOException; - } -} diff --git a/mr/src/test/java/org/apache/iceberg/mr/hive/serde/objectinspector/TestIcebergBinaryObjectInspector.java b/mr/src/test/java/org/apache/iceberg/mr/hive/serde/objectinspector/TestIcebergBinaryObjectInspector.java deleted file mode 100644 index 87fe4208c964..000000000000 --- a/mr/src/test/java/org/apache/iceberg/mr/hive/serde/objectinspector/TestIcebergBinaryObjectInspector.java +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.mr.hive.serde.objectinspector; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.nio.ByteBuffer; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.primitive.BinaryObjectInspector; -import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; -import org.apache.hadoop.io.BytesWritable; -import org.junit.jupiter.api.Test; - -public class TestIcebergBinaryObjectInspector { - - @Test - public void testIcebergByteBufferObjectInspector() { - BinaryObjectInspector oi = IcebergBinaryObjectInspector.get(); - - assertThat(oi.getCategory()).isEqualTo(ObjectInspector.Category.PRIMITIVE); - assertThat(oi.getPrimitiveCategory()) - .isEqualTo(PrimitiveObjectInspector.PrimitiveCategory.BINARY); - - assertThat(oi.getTypeInfo()).isEqualTo(TypeInfoFactory.binaryTypeInfo); - assertThat(oi.getTypeName()).isEqualTo(TypeInfoFactory.binaryTypeInfo.getTypeName()); - - assertThat(oi.getJavaPrimitiveClass()).isEqualTo(byte[].class); - assertThat(oi.getPrimitiveWritableClass()).isEqualTo(BytesWritable.class); - - assertThat(oi.copyObject(null)).isNull(); - assertThat(oi.getPrimitiveJavaObject(null)).isNull(); - assertThat(oi.getPrimitiveWritableObject(null)).isNull(); - - byte[] bytes = new byte[] {0, 1, 2, 3}; - - ByteBuffer buffer = ByteBuffer.wrap(bytes); - assertThat(oi.getPrimitiveJavaObject(buffer)).isEqualTo(bytes); - assertThat(oi.getPrimitiveWritableObject(buffer)).isEqualTo(new BytesWritable(bytes)); - - ByteBuffer slice = ByteBuffer.wrap(bytes, 1, 2).slice(); - assertThat(oi.getPrimitiveJavaObject(slice)).isEqualTo(new byte[] {1, 2}); - assertThat(oi.getPrimitiveWritableObject(slice)) - .isEqualTo(new BytesWritable(new byte[] {1, 2})); - - slice.position(1); - assertThat(oi.getPrimitiveJavaObject(slice)).isEqualTo(new byte[] {2}); - assertThat(oi.getPrimitiveWritableObject(slice)).isEqualTo(new BytesWritable(new byte[] {2})); - - byte[] copy = (byte[]) oi.copyObject(bytes); - - assertThat(copy).isEqualTo(bytes); - assertThat(copy).isNotSameAs(bytes); - - assertThat(oi.preferWritable()).isFalse(); - } -} diff --git a/mr/src/test/java/org/apache/iceberg/mr/hive/serde/objectinspector/TestIcebergDateObjectInspector.java b/mr/src/test/java/org/apache/iceberg/mr/hive/serde/objectinspector/TestIcebergDateObjectInspector.java deleted file mode 100644 index 6e03fae861f9..000000000000 --- a/mr/src/test/java/org/apache/iceberg/mr/hive/serde/objectinspector/TestIcebergDateObjectInspector.java +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.mr.hive.serde.objectinspector; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.sql.Date; -import java.time.LocalDate; -import org.apache.hadoop.hive.serde2.io.DateWritable; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.primitive.DateObjectInspector; -import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; -import org.junit.jupiter.api.Test; - -public class TestIcebergDateObjectInspector { - - @Test - public void testIcebergDateObjectInspector() { - DateObjectInspector oi = IcebergDateObjectInspector.get(); - - assertThat(oi.getCategory()).isEqualTo(ObjectInspector.Category.PRIMITIVE); - assertThat(oi.getPrimitiveCategory()) - .isEqualTo(PrimitiveObjectInspector.PrimitiveCategory.DATE); - - assertThat(oi.getTypeInfo()).isEqualTo(TypeInfoFactory.dateTypeInfo); - assertThat(oi.getTypeName()).isEqualTo(TypeInfoFactory.dateTypeInfo.getTypeName()); - - assertThat(oi.getJavaPrimitiveClass()).isEqualTo(Date.class); - assertThat(oi.getPrimitiveWritableClass()).isEqualTo(DateWritable.class); - - assertThat(oi.copyObject(null)).isNull(); - assertThat(oi.getPrimitiveJavaObject(null)).isNull(); - assertThat(oi.getPrimitiveWritableObject(null)).isNull(); - - LocalDate local = LocalDate.of(2020, 1, 1); - Date date = Date.valueOf("2020-01-01"); - - assertThat(oi.getPrimitiveJavaObject(local)).isEqualTo(date); - assertThat(oi.getPrimitiveWritableObject(local)).isEqualTo(new DateWritable(date)); - - Date copy = (Date) oi.copyObject(date); - - assertThat(copy).isEqualTo(date); - assertThat(copy).isNotSameAs(date); - - assertThat(oi.preferWritable()).isFalse(); - } -} diff --git a/mr/src/test/java/org/apache/iceberg/mr/hive/serde/objectinspector/TestIcebergDecimalObjectInspector.java b/mr/src/test/java/org/apache/iceberg/mr/hive/serde/objectinspector/TestIcebergDecimalObjectInspector.java deleted file mode 100644 index 58d43e3d7047..000000000000 --- a/mr/src/test/java/org/apache/iceberg/mr/hive/serde/objectinspector/TestIcebergDecimalObjectInspector.java +++ /dev/null @@ -1,79 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.mr.hive.serde.objectinspector; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.math.BigDecimal; -import org.apache.hadoop.hive.common.type.HiveDecimal; -import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveDecimalObjectInspector; -import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo; -import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; -import org.junit.jupiter.api.Test; - -public class TestIcebergDecimalObjectInspector { - - @Test - public void testCache() { - HiveDecimalObjectInspector oi = IcebergDecimalObjectInspector.get(38, 18); - - assertThat(IcebergDecimalObjectInspector.get(38, 18)).isSameAs(oi); - assertThat(IcebergDecimalObjectInspector.get(28, 18)).isNotSameAs(oi); - assertThat(IcebergDecimalObjectInspector.get(38, 28)).isNotSameAs(oi); - } - - @Test - public void testIcebergDecimalObjectInspector() { - HiveDecimalObjectInspector oi = IcebergDecimalObjectInspector.get(38, 18); - - assertThat(oi.getCategory()).isEqualTo(ObjectInspector.Category.PRIMITIVE); - assertThat(oi.getPrimitiveCategory()) - .isEqualTo(PrimitiveObjectInspector.PrimitiveCategory.DECIMAL); - - assertThat(oi.getTypeInfo()).isEqualTo(new DecimalTypeInfo(38, 18)); - assertThat(oi.getTypeName()) - .isEqualTo(TypeInfoFactory.decimalTypeInfo.getTypeName(), oi.getTypeName()); - - assertThat(oi.precision()).isEqualTo(38); - assertThat(oi.scale()).isEqualTo(18); - - assertThat(oi.getJavaPrimitiveClass()).isEqualTo(HiveDecimal.class); - assertThat(oi.getPrimitiveWritableClass()).isEqualTo(HiveDecimalWritable.class); - - assertThat(oi.copyObject(null)).isNull(); - assertThat(oi.getPrimitiveJavaObject(null)).isNull(); - assertThat(oi.getPrimitiveWritableObject(null)).isNull(); - - HiveDecimal one = HiveDecimal.create(BigDecimal.ONE); - - assertThat(oi.getPrimitiveJavaObject(BigDecimal.ONE)).isEqualTo(one); - assertThat(oi.getPrimitiveWritableObject(BigDecimal.ONE)) - .isEqualTo(new HiveDecimalWritable(one)); - - HiveDecimal copy = (HiveDecimal) oi.copyObject(one); - - assertThat(copy).isEqualTo(one); - assertThat(copy).isNotSameAs(one); - - assertThat(oi.preferWritable()).isFalse(); - } -} diff --git a/mr/src/test/java/org/apache/iceberg/mr/hive/serde/objectinspector/TestIcebergFixedObjectInspector.java b/mr/src/test/java/org/apache/iceberg/mr/hive/serde/objectinspector/TestIcebergFixedObjectInspector.java deleted file mode 100644 index 7a8450f6852f..000000000000 --- a/mr/src/test/java/org/apache/iceberg/mr/hive/serde/objectinspector/TestIcebergFixedObjectInspector.java +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.mr.hive.serde.objectinspector; - -import static org.assertj.core.api.Assertions.assertThat; - -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; -import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; -import org.apache.hadoop.io.BytesWritable; -import org.junit.jupiter.api.Test; - -public class TestIcebergFixedObjectInspector { - - @Test - public void testIcebergFixedObjectInspector() { - IcebergFixedObjectInspector oi = IcebergFixedObjectInspector.get(); - - assertThat(oi.getCategory()).isEqualTo(ObjectInspector.Category.PRIMITIVE); - assertThat(oi.getPrimitiveCategory()) - .isEqualTo(PrimitiveObjectInspector.PrimitiveCategory.BINARY); - - assertThat(oi.getTypeInfo()).isEqualTo(TypeInfoFactory.binaryTypeInfo); - assertThat(oi.getTypeName()).isEqualTo(TypeInfoFactory.binaryTypeInfo.getTypeName()); - - assertThat(oi.getJavaPrimitiveClass()).isEqualTo(byte[].class); - assertThat(oi.getPrimitiveWritableClass()).isEqualTo(BytesWritable.class); - - assertThat(oi.copyObject(null)).isNull(); - assertThat(oi.getPrimitiveJavaObject(null)).isNull(); - assertThat(oi.getPrimitiveWritableObject(null)).isNull(); - assertThat(oi.convert(null)).isNull(); - - byte[] bytes = new byte[] {0, 1}; - BytesWritable bytesWritable = new BytesWritable(bytes); - - assertThat(oi.getPrimitiveJavaObject(bytes)).isEqualTo(bytes); - assertThat(oi.getPrimitiveWritableObject(bytes)).isEqualTo(bytesWritable); - assertThat(oi.convert(bytes)).isEqualTo(bytes); - - byte[] copy = (byte[]) oi.copyObject(bytes); - - assertThat(copy).isEqualTo(bytes); - assertThat(copy).isNotSameAs(bytes); - - assertThat(oi.preferWritable()).isFalse(); - } -} diff --git a/mr/src/test/java/org/apache/iceberg/mr/hive/serde/objectinspector/TestIcebergObjectInspector.java b/mr/src/test/java/org/apache/iceberg/mr/hive/serde/objectinspector/TestIcebergObjectInspector.java deleted file mode 100644 index c2646376890c..000000000000 --- a/mr/src/test/java/org/apache/iceberg/mr/hive/serde/objectinspector/TestIcebergObjectInspector.java +++ /dev/null @@ -1,249 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.mr.hive.serde.objectinspector; - -import static org.apache.iceberg.types.Types.NestedField.required; -import static org.assertj.core.api.Assertions.assertThat; - -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; -import org.apache.hadoop.hive.serde2.objectinspector.StructField; -import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; -import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; -import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; -import org.apache.iceberg.Schema; -import org.apache.iceberg.hive.HiveVersion; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.types.Types; -import org.junit.jupiter.api.Test; - -public class TestIcebergObjectInspector { - - private final Schema schema = - new Schema( - required(1, "binary_field", Types.BinaryType.get(), "binary comment"), - required(2, "boolean_field", Types.BooleanType.get(), "boolean comment"), - required(3, "date_field", Types.DateType.get(), "date comment"), - required(4, "decimal_field", Types.DecimalType.of(38, 18), "decimal comment"), - required(5, "double_field", Types.DoubleType.get(), "double comment"), - required(6, "fixed_field", Types.FixedType.ofLength(3), "fixed comment"), - required(7, "float_field", Types.FloatType.get(), "float comment"), - required(8, "integer_field", Types.IntegerType.get(), "integer comment"), - required(9, "long_field", Types.LongType.get(), "long comment"), - required(10, "string_field", Types.StringType.get(), "string comment"), - required(11, "timestamp_field", Types.TimestampType.withoutZone(), "timestamp comment"), - required(12, "timestamptz_field", Types.TimestampType.withZone(), "timestamptz comment"), - required(13, "uuid_field", Types.UUIDType.get(), "uuid comment"), - required( - 14, - "list_field", - Types.ListType.ofRequired(15, Types.StringType.get()), - "list comment"), - required( - 16, - "map_field", - Types.MapType.ofRequired(17, 18, Types.StringType.get(), Types.IntegerType.get()), - "map comment"), - required( - 19, - "struct_field", - Types.StructType.of( - Types.NestedField.required( - 20, "nested_field", Types.StringType.get(), "nested field comment")), - "struct comment"), - required(21, "time_field", Types.TimeType.get(), "time comment")); - - @SuppressWarnings("MethodLength") - @Test - public void testIcebergObjectInspector() { - ObjectInspector oi = IcebergObjectInspector.create(schema); - assertThat(oi).isNotNull(); - assertThat(oi.getCategory()).isEqualTo(ObjectInspector.Category.STRUCT); - - StructObjectInspector soi = (StructObjectInspector) oi; - - // binary - StructField binaryField = soi.getStructFieldRef("binary_field"); - assertThat(binaryField.getFieldID()).isEqualTo(1); - assertThat(binaryField.getFieldName()).isEqualTo("binary_field"); - assertThat(binaryField.getFieldComment()).isEqualTo("binary comment"); - assertThat(binaryField.getFieldObjectInspector()).isEqualTo(IcebergBinaryObjectInspector.get()); - - // boolean - StructField booleanField = soi.getStructFieldRef("boolean_field"); - assertThat(booleanField.getFieldID()).isEqualTo(2); - assertThat(booleanField.getFieldName()).isEqualTo("boolean_field"); - assertThat(booleanField.getFieldComment()).isEqualTo("boolean comment"); - assertThat(booleanField.getFieldObjectInspector()) - .isEqualTo(getPrimitiveObjectInspector(boolean.class)); - - // date - StructField dateField = soi.getStructFieldRef("date_field"); - assertThat(dateField.getFieldID()).isEqualTo(3); - assertThat(dateField.getFieldName()).isEqualTo("date_field"); - assertThat(dateField.getFieldComment()).isEqualTo("date comment"); - if (HiveVersion.min(HiveVersion.HIVE_3)) { - assertThat(dateField.getFieldObjectInspector().getClass().getName()) - .isEqualTo( - "org.apache.iceberg.mr.hive.serde.objectinspector.IcebergDateObjectInspectorHive3"); - } else { - assertThat(dateField.getFieldObjectInspector().getClass().getName()) - .isEqualTo("org.apache.iceberg.mr.hive.serde.objectinspector.IcebergDateObjectInspector"); - } - - // decimal - StructField decimalField = soi.getStructFieldRef("decimal_field"); - assertThat(decimalField.getFieldID()).isEqualTo(4); - assertThat(decimalField.getFieldName()).isEqualTo("decimal_field"); - assertThat(decimalField.getFieldComment()).isEqualTo("decimal comment"); - assertThat(decimalField.getFieldObjectInspector()) - .isEqualTo(IcebergDecimalObjectInspector.get(38, 18)); - - // double - StructField doubleField = soi.getStructFieldRef("double_field"); - assertThat(doubleField.getFieldID()).isEqualTo(5); - assertThat(doubleField.getFieldName()).isEqualTo("double_field"); - assertThat(doubleField.getFieldComment()).isEqualTo("double comment"); - assertThat(doubleField.getFieldObjectInspector()) - .isEqualTo(getPrimitiveObjectInspector(double.class)); - - // fixed - StructField fixedField = soi.getStructFieldRef("fixed_field"); - assertThat(fixedField.getFieldID()).isEqualTo(6); - assertThat(fixedField.getFieldName()).isEqualTo("fixed_field"); - assertThat(fixedField.getFieldComment()).isEqualTo("fixed comment"); - assertThat(fixedField.getFieldObjectInspector()).isEqualTo(IcebergFixedObjectInspector.get()); - - // float - StructField floatField = soi.getStructFieldRef("float_field"); - assertThat(floatField.getFieldID()).isEqualTo(7); - assertThat(floatField.getFieldName()).isEqualTo("float_field"); - assertThat(floatField.getFieldComment()).isEqualTo("float comment"); - assertThat(floatField.getFieldObjectInspector()) - .isEqualTo(getPrimitiveObjectInspector(float.class)); - - // integer - StructField integerField = soi.getStructFieldRef("integer_field"); - assertThat(integerField.getFieldID()).isEqualTo(8); - assertThat(integerField.getFieldName()).isEqualTo("integer_field"); - assertThat(integerField.getFieldComment()).isEqualTo("integer comment"); - assertThat(integerField.getFieldObjectInspector()) - .isEqualTo(getPrimitiveObjectInspector(int.class)); - - // long - StructField longField = soi.getStructFieldRef("long_field"); - assertThat(longField.getFieldID()).isEqualTo(9); - assertThat(longField.getFieldName()).isEqualTo("long_field"); - assertThat(longField.getFieldComment()).isEqualTo("long comment"); - assertThat(longField.getFieldObjectInspector()) - .isEqualTo(getPrimitiveObjectInspector(long.class)); - - // string - StructField stringField = soi.getStructFieldRef("string_field"); - assertThat(stringField.getFieldID()).isEqualTo(10); - assertThat(stringField.getFieldName()).isEqualTo("string_field"); - assertThat(stringField.getFieldComment()).isEqualTo("string comment"); - assertThat(stringField.getFieldObjectInspector()) - .isEqualTo(getPrimitiveObjectInspector(String.class)); - - // timestamp without tz - StructField timestampField = soi.getStructFieldRef("timestamp_field"); - assertThat(timestampField.getFieldID()).isEqualTo(11); - assertThat(timestampField.getFieldName()).isEqualTo("timestamp_field"); - assertThat(timestampField.getFieldComment()).isEqualTo("timestamp comment"); - if (HiveVersion.min(HiveVersion.HIVE_3)) { - assertThat(timestampField.getFieldObjectInspector().getClass().getSimpleName()) - .isEqualTo("IcebergTimestampObjectInspectorHive3"); - } else { - assertThat(timestampField.getFieldObjectInspector()) - .isEqualTo(IcebergTimestampObjectInspector.get()); - } - - // timestamp with tz - StructField timestampTzField = soi.getStructFieldRef("timestamptz_field"); - assertThat(timestampTzField.getFieldID()).isEqualTo(12); - assertThat(timestampTzField.getFieldName()).isEqualTo("timestamptz_field"); - assertThat(timestampTzField.getFieldComment()).isEqualTo("timestamptz comment"); - if (HiveVersion.min(HiveVersion.HIVE_3)) { - assertThat(timestampTzField.getFieldObjectInspector().getClass().getSimpleName()) - .isEqualTo("IcebergTimestampWithZoneObjectInspectorHive3"); - } else { - assertThat(timestampTzField.getFieldObjectInspector()) - .isEqualTo(IcebergTimestampWithZoneObjectInspector.get()); - } - - // UUID - StructField uuidField = soi.getStructFieldRef("uuid_field"); - assertThat(uuidField.getFieldID()).isEqualTo(13); - assertThat(uuidField.getFieldName()).isEqualTo("uuid_field"); - assertThat(uuidField.getFieldComment()).isEqualTo("uuid comment"); - assertThat(uuidField.getFieldObjectInspector()).isEqualTo(IcebergUUIDObjectInspector.get()); - - // list - StructField listField = soi.getStructFieldRef("list_field"); - assertThat(listField.getFieldID()).isEqualTo(14); - assertThat(listField.getFieldName()).isEqualTo("list_field"); - assertThat(listField.getFieldComment()).isEqualTo("list comment"); - assertThat(listField.getFieldObjectInspector()).isEqualTo(getListObjectInspector(String.class)); - - // map - StructField mapField = soi.getStructFieldRef("map_field"); - assertThat(mapField.getFieldID()).isEqualTo(16); - assertThat(mapField.getFieldName()).isEqualTo("map_field"); - assertThat(mapField.getFieldComment()).isEqualTo("map comment"); - assertThat(mapField.getFieldObjectInspector()) - .isEqualTo(getMapObjectInspector(String.class, int.class)); - - // struct - StructField structField = soi.getStructFieldRef("struct_field"); - assertThat(structField.getFieldID()).isEqualTo(19); - assertThat(structField.getFieldName()).isEqualTo("struct_field"); - assertThat(structField.getFieldComment()).isEqualTo("struct comment"); - - ObjectInspector expectedObjectInspector = - new IcebergRecordObjectInspector( - (Types.StructType) schema.findType(19), - ImmutableList.of(getPrimitiveObjectInspector(String.class))); - assertThat(structField.getFieldObjectInspector()).isEqualTo(expectedObjectInspector); - - // time - StructField timeField = soi.getStructFieldRef("time_field"); - assertThat(timeField.getFieldID()).isEqualTo(21); - assertThat(timeField.getFieldName()).isEqualTo("time_field"); - assertThat(timeField.getFieldComment()).isEqualTo("time comment"); - assertThat(timeField.getFieldObjectInspector()).isEqualTo(IcebergTimeObjectInspector.get()); - } - - private static ObjectInspector getPrimitiveObjectInspector(Class clazz) { - PrimitiveTypeInfo typeInfo = - (PrimitiveTypeInfo) TypeInfoFactory.getPrimitiveTypeInfoFromJavaPrimitive(clazz); - return PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector(typeInfo); - } - - private static ObjectInspector getListObjectInspector(Class clazz) { - return ObjectInspectorFactory.getStandardListObjectInspector( - getPrimitiveObjectInspector(clazz)); - } - - private static ObjectInspector getMapObjectInspector(Class keyClazz, Class valueClazz) { - return ObjectInspectorFactory.getStandardMapObjectInspector( - getPrimitiveObjectInspector(keyClazz), getPrimitiveObjectInspector(valueClazz)); - } -} diff --git a/mr/src/test/java/org/apache/iceberg/mr/hive/serde/objectinspector/TestIcebergRecordObjectInspector.java b/mr/src/test/java/org/apache/iceberg/mr/hive/serde/objectinspector/TestIcebergRecordObjectInspector.java deleted file mode 100644 index d5824f8bd7d8..000000000000 --- a/mr/src/test/java/org/apache/iceberg/mr/hive/serde/objectinspector/TestIcebergRecordObjectInspector.java +++ /dev/null @@ -1,83 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.mr.hive.serde.objectinspector; - -import static org.apache.iceberg.types.Types.NestedField.required; -import static org.assertj.core.api.Assertions.assertThat; - -import org.apache.hadoop.hive.serde2.objectinspector.StructField; -import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; -import org.apache.iceberg.Schema; -import org.apache.iceberg.data.RandomGenericData; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.types.Types; -import org.junit.jupiter.api.Test; - -public class TestIcebergRecordObjectInspector { - - @Test - public void testIcebergRecordObjectInspector() { - Schema schema = - new Schema( - required(1, "integer_field", Types.IntegerType.get()), - required( - 2, - "struct_field", - Types.StructType.of( - Types.NestedField.required(3, "string_field", Types.StringType.get())))); - - Record record = RandomGenericData.generate(schema, 1, 0L).get(0); - Record innerRecord = record.get(1, Record.class); - - StructObjectInspector soi = (StructObjectInspector) IcebergObjectInspector.create(schema); - assertThat(soi.getStructFieldsDataAsList(record)) - .isEqualTo(ImmutableList.of(record.get(0), record.get(1))); - - StructField integerField = soi.getStructFieldRef("integer_field"); - assertThat(soi.getStructFieldData(record, integerField)).isEqualTo(record.get(0)); - - StructField structField = soi.getStructFieldRef("struct_field"); - Object innerData = soi.getStructFieldData(record, structField); - assertThat(innerData).isEqualTo(innerRecord); - - StructObjectInspector innerSoi = (StructObjectInspector) structField.getFieldObjectInspector(); - StructField stringField = innerSoi.getStructFieldRef("string_field"); - - assertThat(innerSoi.getStructFieldsDataAsList(innerRecord)) - .isEqualTo(ImmutableList.of(innerRecord.get(0))); - assertThat(innerSoi.getStructFieldData(innerData, stringField)).isEqualTo(innerRecord.get(0)); - } - - @Test - public void testIcebergRecordObjectInspectorWithRowNull() { - Schema schema = - new Schema( - required(1, "integer_field", Types.IntegerType.get()), - required( - 2, - "struct_field", - Types.StructType.of( - Types.NestedField.required(3, "string_field", Types.StringType.get())))); - StructObjectInspector soi = (StructObjectInspector) IcebergObjectInspector.create(schema); - assertThat(soi.getStructFieldsDataAsList(null)).isNull(); - StructField integerField = soi.getStructFieldRef("integer_field"); - assertThat(soi.getStructFieldData(null, integerField)).isNull(); - } -} diff --git a/mr/src/test/java/org/apache/iceberg/mr/hive/serde/objectinspector/TestIcebergTimeObjectInspector.java b/mr/src/test/java/org/apache/iceberg/mr/hive/serde/objectinspector/TestIcebergTimeObjectInspector.java deleted file mode 100644 index 5af9ba341ebd..000000000000 --- a/mr/src/test/java/org/apache/iceberg/mr/hive/serde/objectinspector/TestIcebergTimeObjectInspector.java +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.mr.hive.serde.objectinspector; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.time.LocalTime; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; -import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; -import org.apache.hadoop.io.Text; -import org.junit.jupiter.api.Test; - -public class TestIcebergTimeObjectInspector { - - @Test - public void testIcebergTimeObjectInspector() { - - IcebergTimeObjectInspector oi = IcebergTimeObjectInspector.get(); - - assertThat(oi.getCategory()).isEqualTo(ObjectInspector.Category.PRIMITIVE); - assertThat(oi.getPrimitiveCategory()) - .isEqualTo(PrimitiveObjectInspector.PrimitiveCategory.STRING); - - assertThat(oi.getTypeInfo()).isEqualTo(TypeInfoFactory.stringTypeInfo); - assertThat(oi.getTypeName()).isEqualTo(TypeInfoFactory.stringTypeInfo.getTypeName()); - - assertThat(oi.getJavaPrimitiveClass()).isEqualTo(String.class); - assertThat(oi.getPrimitiveWritableClass()).isEqualTo(Text.class); - - assertThat(oi.copyObject(null)).isNull(); - assertThat(oi.getPrimitiveJavaObject(null)).isNull(); - assertThat(oi.getPrimitiveWritableObject(null)).isNull(); - assertThat(oi.convert(null)).isNull(); - - LocalTime localTime = LocalTime.now(); - String time = localTime.toString(); - Text text = new Text(time); - - assertThat(oi.getPrimitiveJavaObject(text)).isEqualTo(time); - assertThat(oi.getPrimitiveWritableObject(time)).isEqualTo(text); - assertThat(oi.convert(time)).isEqualTo(localTime); - - Text copy = (Text) oi.copyObject(text); - - assertThat(copy).isEqualTo(text); - assertThat(copy).isNotSameAs(text); - - assertThat(oi.preferWritable()).isFalse(); - } -} diff --git a/mr/src/test/java/org/apache/iceberg/mr/hive/serde/objectinspector/TestIcebergTimestampObjectInspector.java b/mr/src/test/java/org/apache/iceberg/mr/hive/serde/objectinspector/TestIcebergTimestampObjectInspector.java deleted file mode 100644 index ea40cc20420a..000000000000 --- a/mr/src/test/java/org/apache/iceberg/mr/hive/serde/objectinspector/TestIcebergTimestampObjectInspector.java +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.mr.hive.serde.objectinspector; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.sql.Timestamp; -import java.time.LocalDateTime; -import org.apache.hadoop.hive.serde2.io.TimestampWritable; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; -import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; -import org.junit.jupiter.api.Test; - -public class TestIcebergTimestampObjectInspector { - - @Test - public void testIcebergTimestampObjectInspector() { - IcebergTimestampObjectInspector oi = IcebergTimestampObjectInspector.get(); - - assertThat(oi.getCategory()).isEqualTo(ObjectInspector.Category.PRIMITIVE); - assertThat(oi.getPrimitiveCategory()) - .isEqualTo(PrimitiveObjectInspector.PrimitiveCategory.TIMESTAMP); - - assertThat(oi.getTypeInfo()).isEqualTo(TypeInfoFactory.timestampTypeInfo); - assertThat(oi.getTypeName()).isEqualTo(TypeInfoFactory.timestampTypeInfo.getTypeName()); - - assertThat(oi.getJavaPrimitiveClass()).isEqualTo(Timestamp.class); - assertThat(oi.getPrimitiveWritableClass()).isEqualTo(TimestampWritable.class); - - assertThat(oi.copyObject(null)).isNull(); - assertThat(oi.getPrimitiveJavaObject(null)).isNull(); - assertThat(oi.getPrimitiveWritableObject(null)).isNull(); - assertThat(oi.convert(null)).isNull(); - - LocalDateTime local = LocalDateTime.of(2020, 1, 1, 12, 55, 30, 5560000); - Timestamp ts = Timestamp.valueOf(local); - - assertThat(oi.getPrimitiveJavaObject(local)).isEqualTo(ts); - assertThat(oi.getPrimitiveWritableObject(local)).isEqualTo(new TimestampWritable(ts)); - - Timestamp copy = (Timestamp) oi.copyObject(ts); - - assertThat(copy).isEqualTo(ts); - assertThat(copy).isNotSameAs(ts); - - assertThat(oi.preferWritable()).isFalse(); - - assertThat(oi.convert(ts)).isEqualTo(local); - } -} diff --git a/mr/src/test/java/org/apache/iceberg/mr/hive/serde/objectinspector/TestIcebergTimestampWithZoneObjectInspector.java b/mr/src/test/java/org/apache/iceberg/mr/hive/serde/objectinspector/TestIcebergTimestampWithZoneObjectInspector.java deleted file mode 100644 index 1b16e6e02c0e..000000000000 --- a/mr/src/test/java/org/apache/iceberg/mr/hive/serde/objectinspector/TestIcebergTimestampWithZoneObjectInspector.java +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.mr.hive.serde.objectinspector; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.sql.Timestamp; -import java.time.LocalDateTime; -import java.time.OffsetDateTime; -import java.time.ZoneOffset; -import org.apache.hadoop.hive.serde2.io.TimestampWritable; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; -import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; -import org.junit.jupiter.api.Test; - -public class TestIcebergTimestampWithZoneObjectInspector { - - @Test - public void testIcebergTimestampObjectInspectorWithUTCAdjustment() { - IcebergTimestampWithZoneObjectInspector oi = IcebergTimestampWithZoneObjectInspector.get(); - - assertThat(oi.getCategory()).isEqualTo(ObjectInspector.Category.PRIMITIVE); - assertThat(oi.getPrimitiveCategory()) - .isEqualTo(PrimitiveObjectInspector.PrimitiveCategory.TIMESTAMP); - - assertThat(oi.getTypeInfo()).isEqualTo(TypeInfoFactory.timestampTypeInfo); - assertThat(oi.getTypeName()).isEqualTo(TypeInfoFactory.timestampTypeInfo.getTypeName()); - - assertThat(oi.getJavaPrimitiveClass()).isEqualTo(Timestamp.class); - assertThat(oi.getPrimitiveWritableClass()).isEqualTo(TimestampWritable.class); - - assertThat(oi.copyObject(null)).isNull(); - assertThat(oi.getPrimitiveJavaObject(null)).isNull(); - assertThat(oi.getPrimitiveWritableObject(null)).isNull(); - assertThat(oi.convert(null)).isNull(); - - LocalDateTime local = LocalDateTime.of(2020, 1, 1, 16, 45, 33, 456000); - OffsetDateTime offsetDateTime = OffsetDateTime.of(local, ZoneOffset.ofHours(-5)); - Timestamp ts = Timestamp.from(offsetDateTime.toInstant()); - - assertThat(oi.getPrimitiveJavaObject(offsetDateTime)).isEqualTo(ts); - assertThat(oi.getPrimitiveWritableObject(offsetDateTime)).isEqualTo(new TimestampWritable(ts)); - - Timestamp copy = (Timestamp) oi.copyObject(ts); - - assertThat(copy).isEqualTo(ts); - assertThat(copy).isNotSameAs(ts); - - assertThat(oi.preferWritable()).isFalse(); - - assertThat(oi.convert(ts)) - .isEqualTo( - OffsetDateTime.ofInstant(local.toInstant(ZoneOffset.ofHours(-5)), ZoneOffset.UTC)); - - assertThat(offsetDateTime.withOffsetSameInstant(ZoneOffset.UTC)) - .isEqualTo(oi.convert(Timestamp.from(offsetDateTime.toInstant()))); - } -} diff --git a/mr/src/test/java/org/apache/iceberg/mr/hive/serde/objectinspector/TestIcebergUUIDObjectInspector.java b/mr/src/test/java/org/apache/iceberg/mr/hive/serde/objectinspector/TestIcebergUUIDObjectInspector.java deleted file mode 100644 index abc0c01ed8d1..000000000000 --- a/mr/src/test/java/org/apache/iceberg/mr/hive/serde/objectinspector/TestIcebergUUIDObjectInspector.java +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.mr.hive.serde.objectinspector; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.util.UUID; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; -import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; -import org.apache.hadoop.io.Text; -import org.junit.jupiter.api.Test; - -public class TestIcebergUUIDObjectInspector { - - @Test - public void testIcebergUUIDObjectInspector() { - IcebergUUIDObjectInspector oi = IcebergUUIDObjectInspector.get(); - - assertThat(oi.getCategory()).isEqualTo(ObjectInspector.Category.PRIMITIVE); - assertThat(oi.getPrimitiveCategory()) - .isEqualTo(PrimitiveObjectInspector.PrimitiveCategory.STRING); - - assertThat(oi.getTypeInfo()).isEqualTo(TypeInfoFactory.stringTypeInfo); - assertThat(oi.getTypeName()).isEqualTo(TypeInfoFactory.stringTypeInfo.getTypeName()); - - assertThat(oi.getJavaPrimitiveClass()).isEqualTo(String.class); - assertThat(oi.getPrimitiveWritableClass()).isEqualTo(Text.class); - - assertThat(oi.copyObject(null)).isNull(); - assertThat(oi.getPrimitiveJavaObject(null)).isNull(); - assertThat(oi.getPrimitiveWritableObject(null)).isNull(); - assertThat(oi.convert(null)).isNull(); - - UUID uuid = UUID.randomUUID(); - String uuidStr = uuid.toString(); - Text text = new Text(uuidStr); - - assertThat(oi.getPrimitiveJavaObject(text)).isEqualTo(uuidStr); - assertThat(oi.getPrimitiveWritableObject(uuidStr)).isEqualTo(text); - assertThat(oi.convert(uuidStr)).isEqualTo(uuid); - - Text copy = (Text) oi.copyObject(text); - - assertThat(copy).isEqualTo(text); - assertThat(copy).isNotSameAs(text); - - assertThat(oi.preferWritable()).isFalse(); - } -} diff --git a/settings.gradle b/settings.gradle index 8c9ab55116aa..f40794833201 100644 --- a/settings.gradle +++ b/settings.gradle @@ -41,6 +41,7 @@ include 'dell' include 'snowflake' include 'delta-lake' include 'open-api' +include 'mr' project(':bom').name = 'iceberg-bom' project(':api').name = 'iceberg-api' @@ -65,6 +66,7 @@ project(':dell').name = 'iceberg-dell' project(':snowflake').name = 'iceberg-snowflake' project(':delta-lake').name = 'iceberg-delta-lake' project(':open-api').name = 'iceberg-open-api' +project(':mr').name = 'iceberg-mr' if (null != System.getProperty("allModules")) { System.setProperty("flinkVersions", System.getProperty("knownFlinkVersions")) @@ -81,14 +83,6 @@ if (!knownFlinkVersions.containsAll(flinkVersions)) { throw new GradleException("Found unsupported Flink versions: " + (flinkVersions - knownFlinkVersions)) } -List knownHiveVersions = System.getProperty("knownHiveVersions").split(",") -String hiveVersionsString = System.getProperty("hiveVersions") != null ? System.getProperty("hiveVersions") : System.getProperty("defaultHiveVersions") -List hiveVersions = hiveVersionsString != null && !hiveVersionsString.isEmpty() ? hiveVersionsString.split(",") : [] - -if (!knownHiveVersions.containsAll(hiveVersions)) { - throw new GradleException("Found unsupported Hive versions: " + (hiveVersions - knownHiveVersions)) -} - List knownSparkVersions = System.getProperty("knownSparkVersions").split(",") String sparkVersionsString = System.getProperty("sparkVersions") != null ? System.getProperty("sparkVersions") : System.getProperty("defaultSparkVersions") List sparkVersions = sparkVersionsString != null && !sparkVersionsString.isEmpty() ? sparkVersionsString.split(",") : [] @@ -180,12 +174,6 @@ if (sparkVersions.contains("3.5")) { project(":iceberg-spark:spark-runtime-3.5_${scalaVersion}").name = "iceberg-spark-runtime-3.5_${scalaVersion}" } -if (hiveVersions.contains("2")) { - include 'mr' - - project(':mr').name = 'iceberg-mr' -} - if (kafkaVersions.contains("3")) { include 'kafka-connect' project(':kafka-connect').name = 'iceberg-kafka-connect'