diff --git a/be/src/formats/parquet/column_reader.cpp b/be/src/formats/parquet/column_reader.cpp index 4e65b09f10361..4c3199698c63c 100644 --- a/be/src/formats/parquet/column_reader.cpp +++ b/be/src/formats/parquet/column_reader.cpp @@ -16,6 +16,7 @@ #include +<<<<<<< HEAD #include "column/array_column.h" #include "column/map_column.h" #include "column/struct_column.h" @@ -25,6 +26,26 @@ #include "formats/parquet/column_converter.h" #include "formats/parquet/stored_column_reader.h" #include "gutil/strings/substitute.h" +======= +#include +#include +#include +#include +#include +#include + +#include "column/chunk.h" +#include "column/column_helper.h" +#include "column/nullable_column.h" +#include "common/compiler_util.h" +#include "exec/exec_node.h" +#include "exec/hdfs_scanner.h" +#include "formats/parquet/complex_column_reader.h" +#include "formats/parquet/scalar_column_reader.h" +#include "formats/utils.h" +#include "gen_cpp/Descriptors_types.h" +#include "gen_cpp/parquet_types.h" +>>>>>>> 1028b6ac2c ([Enhancement] Use more clear type description in ParquetField (#52575)) #include "simd/batch_run_counter.h" #include "storage/column_or_predicate.h" #include "util/runtime_profile.h" @@ -1035,7 +1056,16 @@ class StructColumnReader : public ColumnReader { void ColumnReader::get_subfield_pos_with_pruned_type(const ParquetField& field, const TypeDescriptor& col_type, bool case_sensitive, std::vector& pos) { +<<<<<<< HEAD DCHECK(field.type.type == LogicalType::TYPE_STRUCT); +======= + DCHECK(field.type == ColumnType::STRUCT); + if (!col_type.field_ids.empty()) { + std::unordered_map field_id_2_pos; + for (size_t i = 0; i < field.children.size(); i++) { + field_id_2_pos.emplace(field.children[i].field_id, i); + } +>>>>>>> 1028b6ac2c ([Enhancement] Use more clear type description in ParquetField (#52575)) // build tmp mapping for ParquetField std::unordered_map field_name_2_pos; @@ -1118,11 +1148,18 @@ bool ColumnReader::_has_valid_subfield_column_reader( Status ColumnReader::create(const ColumnReaderOptions& opts, const ParquetField* field, const TypeDescriptor& col_type, std::unique_ptr* output) { // We will only set a complex type in ParquetField +<<<<<<< HEAD if ((field->type.is_complex_type() || col_type.is_complex_type()) && (field->type.type != col_type.type)) { return Status::InternalError(strings::Substitute("ParquetField's type $0 is different from table's type $1", field->type.type, col_type.type)); +======= + if ((field->is_complex_type() || col_type.is_complex_type()) && !field->has_same_complex_type(col_type)) { + return Status::InternalError( + strings::Substitute("ParquetField '$0' file's type $1 is different from table's type $2", field->name, + column_type_to_string(field->type), logical_type_to_string(col_type.type))); +>>>>>>> 1028b6ac2c ([Enhancement] Use more clear type description in ParquetField (#52575)) } - if (field->type.type == LogicalType::TYPE_ARRAY) { + if (field->type == ColumnType::ARRAY) { std::unique_ptr child_reader; RETURN_IF_ERROR(ColumnReader::create(opts, &field->children[0], col_type.children[0], &child_reader)); if (child_reader != nullptr) { @@ -1132,7 +1169,7 @@ Status ColumnReader::create(const ColumnReaderOptions& opts, const ParquetField* } else { *output = nullptr; } - } else if (field->type.type == LogicalType::TYPE_MAP) { + } else if (field->type == ColumnType::MAP) { std::unique_ptr key_reader = nullptr; std::unique_ptr value_reader = nullptr; @@ -1150,7 +1187,7 @@ Status ColumnReader::create(const ColumnReaderOptions& opts, const ParquetField* } else { *output = nullptr; } - } else if (field->type.type == LogicalType::TYPE_STRUCT) { + } else if (field->type == ColumnType::STRUCT) { std::vector subfield_pos(col_type.children.size()); get_subfield_pos_with_pruned_type(*field, col_type, opts.case_sensitive, subfield_pos); @@ -1186,12 +1223,13 @@ Status ColumnReader::create(const ColumnReaderOptions& opts, const ParquetField* Status ColumnReader::create(const ColumnReaderOptions& opts, const ParquetField* field, const TypeDescriptor& col_type, const TIcebergSchemaField* iceberg_schema_field, std::unique_ptr* output) { // We will only set a complex type in ParquetField - if ((field->type.is_complex_type() || col_type.is_complex_type()) && (field->type.type != col_type.type)) { - return Status::InternalError(strings::Substitute("ParquetField's type $0 is different from table's type $1", - field->type.type, col_type.type)); + if ((field->is_complex_type() || col_type.is_complex_type()) && !field->has_same_complex_type(col_type)) { + return Status::InternalError( + strings::Substitute("ParquetField '$0' file's type $1 is different from table's type $2", field->name, + column_type_to_string(field->type), logical_type_to_string(col_type.type))); } DCHECK(iceberg_schema_field != nullptr); - if (field->type.type == LogicalType::TYPE_ARRAY) { + if (field->type == ColumnType::ARRAY) { std::unique_ptr child_reader; const TIcebergSchemaField* element_schema = &iceberg_schema_field->children[0]; RETURN_IF_ERROR( @@ -1203,7 +1241,7 @@ Status ColumnReader::create(const ColumnReaderOptions& opts, const ParquetField* } else { *output = nullptr; } - } else if (field->type.type == LogicalType::TYPE_MAP) { + } else if (field->type == ColumnType::MAP) { std::unique_ptr key_reader = nullptr; std::unique_ptr value_reader = nullptr; @@ -1226,7 +1264,7 @@ Status ColumnReader::create(const ColumnReaderOptions& opts, const ParquetField* } else { *output = nullptr; } - } else if (field->type.type == LogicalType::TYPE_STRUCT) { + } else if (field->type == ColumnType::STRUCT) { std::vector subfield_pos(col_type.children.size()); std::vector iceberg_schema_subfield(col_type.children.size()); get_subfield_pos_with_pruned_type(*field, col_type, opts.case_sensitive, iceberg_schema_field, subfield_pos, diff --git a/be/src/formats/parquet/group_reader.cpp b/be/src/formats/parquet/group_reader.cpp index 9dc7b323ce758..b91219f688767 100644 --- a/be/src/formats/parquet/group_reader.cpp +++ b/be/src/formats/parquet/group_reader.cpp @@ -23,6 +23,12 @@ #include "exec/hdfs_scanner.h" #include "exprs/expr.h" #include "exprs/expr_context.h" +<<<<<<< HEAD +======= +#include "formats/parquet/metadata.h" +#include "formats/parquet/page_index_reader.h" +#include "formats/parquet/schema.h" +>>>>>>> 1028b6ac2c ([Enhancement] Use more clear type description in ParquetField (#52575)) #include "gutil/strings/substitute.h" #include "runtime/types.h" #include "simd/simd.h" diff --git a/be/src/formats/parquet/meta_helper.cpp b/be/src/formats/parquet/meta_helper.cpp index 1365b21e5c234..446748853c9ff 100644 --- a/be/src/formats/parquet/meta_helper.cpp +++ b/be/src/formats/parquet/meta_helper.cpp @@ -36,6 +36,34 @@ void ParquetMetaHelper::build_column_name_2_pos_in_meta( break; } } +<<<<<<< HEAD +======= + + // After the column is added, there is no new column when querying the previously + // imported parquet file. It is skipped here, and this column will be set to NULL + // in the FileReader::_read_min_max_chunk. + if (field == nullptr) continue; + // For field which type is complex, the filed physical_column_index in file meta is not same with the column index + // in row_group's column metas + // For example: + // table schema : + // -- col_tinyint tinyint + // -- col_struct struct + // ----- name string + // ----- age int + // file metadata schema : + // -- ParquetField(name=col_tinyint, physical_column_index=0) + // -- ParquetField(name=col_struct,physical_column_index=0, + // children=[ParquetField(name=name, physical_column_index=1), + // ParquetField(name=age, physical_column_index=2)]) + // row group column metas: + // -- ColumnMetaData(path_in_schema=[col_tinyint]) + // -- ColumnMetaData(path_in_schema=[col_struct, name]) + // -- ColumnMetaData(path_in_schema=[col_struct, age]) + if (field->is_complex_type()) continue; + // Put SlotDescriptor's origin column name here! + column_name_2_pos_in_meta.emplace(slot->col_name(), field->physical_column_index); +>>>>>>> 1028b6ac2c ([Enhancement] Use more clear type description in ParquetField (#52575)) } } @@ -66,24 +94,25 @@ bool ParquetMetaHelper::_is_valid_type(const ParquetField* parquet_field, const } // only check for complex type now // if complex type has none valid subfield, we will treat this struct type as invalid type. - if (!parquet_field->type.is_complex_type()) { + if (!parquet_field->is_complex_type()) { return true; } - if (parquet_field->type.type != type_descriptor->type) { - // complex type mismatched + // check the complex type is matched + if (!parquet_field->has_same_complex_type(*type_descriptor)) { return false; } bool has_valid_child = false; - if (parquet_field->type.is_array_type() || parquet_field->type.is_map_type()) { + if (parquet_field->type == ColumnType::ARRAY || parquet_field->type == ColumnType::MAP) { for (size_t idx = 0; idx < parquet_field->children.size(); idx++) { if (_is_valid_type(&parquet_field->children[idx], &type_descriptor->children[idx])) { has_valid_child = true; break; } } +<<<<<<< HEAD } else if (parquet_field->type.is_struct_type()) { std::unordered_map field_name_2_type{}; for (size_t idx = 0; idx < type_descriptor->children.size(); idx++) { @@ -96,6 +125,13 @@ bool ParquetMetaHelper::_is_valid_type(const ParquetField* parquet_field, const auto it = field_name_2_type.find(Utils::format_name(child_parquet_field.name, _case_sensitive)); if (it == field_name_2_type.end()) { continue; +======= + } else if (parquet_field->type == ColumnType::STRUCT) { + if (!type_descriptor->field_ids.empty()) { + std::unordered_map field_id_2_type; + for (size_t idx = 0; idx < type_descriptor->children.size(); idx++) { + field_id_2_type.emplace(type_descriptor->field_ids[idx], &type_descriptor->children[idx]); +>>>>>>> 1028b6ac2c ([Enhancement] Use more clear type description in ParquetField (#52575)) } if (_is_valid_type(&child_parquet_field, it->second)) { @@ -122,18 +158,17 @@ bool IcebergMetaHelper::_is_valid_type(const ParquetField* parquet_field, const const TypeDescriptor* type_descriptor) const { // only check for complex type now // if complex type has none valid subfield, we will treat this struct type as invalid type. - if (!parquet_field->type.is_complex_type()) { + if (!parquet_field->is_complex_type()) { return true; } - if (parquet_field->type.type != type_descriptor->type) { - // complex type mismatched + if (!parquet_field->has_same_complex_type(*type_descriptor)) { return false; } bool has_valid_child = false; - if (parquet_field->type.is_array_type() || parquet_field->type.is_map_type()) { + if (parquet_field->type == ColumnType::ARRAY || parquet_field->type == ColumnType::MAP) { for (size_t idx = 0; idx < parquet_field->children.size(); idx++) { if (_is_valid_type(&parquet_field->children[idx], &field_schema->children[idx], &type_descriptor->children[idx])) { @@ -141,7 +176,7 @@ bool IcebergMetaHelper::_is_valid_type(const ParquetField* parquet_field, const break; } } - } else if (parquet_field->type.is_struct_type()) { + } else if (parquet_field->type == ColumnType::STRUCT) { std::unordered_map field_id_2_iceberg_schema{}; std::unordered_map field_id_2_type{}; for (const auto& field : field_schema->children) { diff --git a/be/src/formats/parquet/schema.cpp b/be/src/formats/parquet/schema.cpp index 861e3866a2b67..5d39ae6a4f148 100644 --- a/be/src/formats/parquet/schema.cpp +++ b/be/src/formats/parquet/schema.cpp @@ -14,13 +14,35 @@ #include "formats/parquet/schema.h" +<<<<<<< HEAD #include +======= +#include +#include +#include +#include +>>>>>>> 1028b6ac2c ([Enhancement] Use more clear type description in ParquetField (#52575)) #include "gutil/casts.h" #include "gutil/strings/substitute.h" namespace starrocks::parquet { +std::string column_type_to_string(const ColumnType& column_type) { + switch (column_type) { + case SCALAR: + return "scalar"; + case ARRAY: + return "array"; + case MAP: + return "map"; + case STRUCT: + return "struct"; + default: + return "unknown"; + } +} + std::string LevelInfo::debug_string() const { std::stringstream ss; ss << "LevelInfo(max_def_level=" << max_def_level << ",max_rep_level=" << max_rep_level @@ -30,7 +52,7 @@ std::string LevelInfo::debug_string() const { std::string ParquetField::debug_string() const { std::stringstream ss; - ss << "ParquetField(name=" << name << ",type=" << type.type << ",physical_type=" << physical_type + ss << "ParquetField(name=" << name << ",type=" << column_type_to_string(type) << ",physical_type=" << physical_type << ",physical_column_index=" << physical_column_index << ",levels_info=" << level_info.debug_string(); if (children.size() > 0) { ss << ",children=["; @@ -46,6 +68,23 @@ std::string ParquetField::debug_string() const { return ss.str(); } +bool ParquetField::is_complex_type() const { + return type == ARRAY || type == MAP || type == STRUCT; +} + +bool ParquetField::has_same_complex_type(const TypeDescriptor& type_descriptor) const { + // check the complex type is matched + if (type == ColumnType::ARRAY && type_descriptor.type == LogicalType::TYPE_ARRAY) { + return true; + } else if (type == ColumnType::MAP && type_descriptor.type == LogicalType::TYPE_MAP) { + return true; + } else if (type == ColumnType::STRUCT && type_descriptor.type == LogicalType::TYPE_STRUCT) { + return true; + } else { + return false; + } +} + static bool is_group(const tparquet::SchemaElement* schema) { return schema->num_children > 0; } @@ -75,6 +114,7 @@ Status SchemaDescriptor::leaf_to_field(const tparquet::SchemaElement* t_schema, bool is_nullable, ParquetField* field) { field->name = t_schema->name; field->schema_element = *t_schema; + field->type = ColumnType::SCALAR; field->is_nullable = is_nullable; field->physical_type = t_schema->type; field->type_length = t_schema->type_length; @@ -159,8 +199,7 @@ Status SchemaDescriptor::list_to_field(const std::vectorname = group_schema->name; field->field_id = group_schema->field_id; - field->type.type = TYPE_ARRAY; - field->type.children.push_back(field->children[0].type); + field->type = ColumnType::ARRAY; field->is_nullable = is_optional(group_schema); field->level_info = cur_level_info; field->level_info.immediate_repeated_ancestor_def_level = last_immediate_repeated_ancestor_def_level; @@ -232,9 +271,7 @@ Status SchemaDescriptor::map_to_field(const std::vector field->name = group_schema->name; // Actually, we don't need to put field_id here field->field_id = group_schema->field_id; - field->type.type = TYPE_MAP; - field->type.children.emplace_back(key_field->type); - field->type.children.emplace_back(value_field->type); + field->type = ColumnType::MAP; field->is_nullable = is_optional(group_schema); field->level_info = cur_level_info; field->level_info.immediate_repeated_ancestor_def_level = last_immediate_repeated_ancestor_def_level; @@ -257,13 +294,7 @@ Status SchemaDescriptor::group_to_struct_field(const std::vectorname = group_schema->name; field->is_nullable = is_optional(group_schema); field->level_info = cur_level_info; - field->type.type = TYPE_STRUCT; - for (size_t i = 0; i < num_children; i++) { - field->type.children.emplace_back(field->children[i].type); - } - for (size_t i = 0; i < num_children; i++) { - field->type.field_names.emplace_back(field->children[i].name); - } + field->type = ColumnType::STRUCT; field->field_id = group_schema->field_id; return Status::OK(); } @@ -290,7 +321,7 @@ Status SchemaDescriptor::group_to_field(const std::vectorchildren[0], next_pos)); field->name = group_schema->name; - field->type.type = TYPE_ARRAY; + field->type = ColumnType::ARRAY; field->is_nullable = false; field->level_info = cur_level_info; field->level_info.immediate_repeated_ancestor_def_level = last_immediate_repeated_ancestor_def_level; @@ -329,7 +360,7 @@ Status SchemaDescriptor::node_to_field(const std::vectorname = node_schema->name; - field->type.type = TYPE_ARRAY; + field->type = ColumnType::ARRAY; field->is_nullable = false; field->field_id = node_schema->field_id; field->level_info = cur_level_info; diff --git a/be/src/formats/parquet/schema.h b/be/src/formats/parquet/schema.h index 005c580d58cd1..abbd19b35549c 100644 --- a/be/src/formats/parquet/schema.h +++ b/be/src/formats/parquet/schema.h @@ -85,12 +85,16 @@ struct LevelInfo { std::string debug_string() const; }; +enum ColumnType { SCALAR = 0, ARRAY, MAP, STRUCT }; + +std::string column_type_to_string(const ColumnType& column_type); + struct ParquetField { std::string name; tparquet::SchemaElement schema_element; // Used to identify if this field is a nested field. - TypeDescriptor type; + ColumnType type; bool is_nullable; // Only valid when this field is a leaf node @@ -114,6 +118,8 @@ struct ParquetField { int16_t max_def_level() const { return level_info.max_def_level; } int16_t max_rep_level() const { return level_info.max_rep_level; } std::string debug_string() const; + bool is_complex_type() const; + bool has_same_complex_type(const TypeDescriptor& type_descriptor) const; }; class SchemaDescriptor { diff --git a/be/src/util/system_metrics.cpp b/be/src/util/system_metrics.cpp index 0eaf2b3a6dc49..c3b66e681baf8 100644 --- a/be/src/util/system_metrics.cpp +++ b/be/src/util/system_metrics.cpp @@ -280,10 +280,10 @@ void SystemMetrics::_install_memory_metrics(MetricRegistry* registry) { } void SystemMetrics::_update_memory_metrics() { - size_t value = 0; #if defined(ADDRESS_SANITIZER) || defined(LEAK_SANITIZER) || defined(THREAD_SANITIZER) LOG(INFO) << "Memory tracking is not available with address sanitizer builds."; #else + size_t value = 0; // Update the statistics cached by mallctl. uint64_t epoch = 1; size_t sz = sizeof(epoch); diff --git a/be/test/formats/parquet/group_reader_test.cpp b/be/test/formats/parquet/group_reader_test.cpp index fb5bf2193cb18..758f26ce10bae 100644 --- a/be/test/formats/parquet/group_reader_test.cpp +++ b/be/test/formats/parquet/group_reader_test.cpp @@ -443,4 +443,21 @@ TEST_F(GroupReaderTest, TestGetNext) { _check_chunk(param, chunk, 8, 4); } +<<<<<<< HEAD +======= +TEST_F(GroupReaderTest, ColumnReaderCreateTypeMismatch) { + ParquetField field; + field.name = "col0"; + field.type = ColumnType::ARRAY; + + TypeDescriptor col_type; + col_type.type = LogicalType::TYPE_VARCHAR; + + ColumnReaderOptions options; + Status st = ColumnReader::create(options, &field, col_type, nullptr); + ASSERT_FALSE(st.ok()) << st; + std::cout << st.message() << "\n"; +} + +>>>>>>> 1028b6ac2c ([Enhancement] Use more clear type description in ParquetField (#52575)) } // namespace starrocks::parquet diff --git a/be/test/formats/parquet/parquet_cli_reader.h b/be/test/formats/parquet/parquet_cli_reader.h index 3fda666bcf1ed..59eb9e3376c5b 100644 --- a/be/test/formats/parquet/parquet_cli_reader.h +++ b/be/test/formats/parquet/parquet_cli_reader.h @@ -113,20 +113,20 @@ class ParquetCLIReader { StatusOr _build_type(const ParquetField& field) { TypeDescriptor type; - if (field.type.type == TYPE_STRUCT) { + if (field.type == ColumnType::STRUCT) { type.type = TYPE_STRUCT; for (const auto& i : field.children) { ASSIGN_OR_RETURN(auto child_type, _build_type(i)); type.children.emplace_back(child_type); type.field_names.emplace_back(i.name); } - } else if (field.type.type == TYPE_MAP) { + } else if (field.type == ColumnType::MAP) { type.type = TYPE_MAP; for (const auto& i : field.children) { ASSIGN_OR_RETURN(auto child_type, _build_type(i)); type.children.emplace_back(child_type); } - } else if (field.type.type == TYPE_ARRAY) { + } else if (field.type == ColumnType::ARRAY) { type.type = TYPE_ARRAY; ASSIGN_OR_RETURN(auto child_type, _build_type(field.children[0])); type.children.emplace_back(child_type); diff --git a/be/test/formats/parquet/parquet_schema_test.cpp b/be/test/formats/parquet/parquet_schema_test.cpp index 38acd16f16c98..aa5080f1ae44b 100644 --- a/be/test/formats/parquet/parquet_schema_test.cpp +++ b/be/test/formats/parquet/parquet_schema_test.cpp @@ -75,12 +75,12 @@ class GroupNode { element.__set_num_children(num_children); return element; } - static ParquetField make_field(const std::string& name, bool is_nullable, LogicalType type, + static ParquetField make_field(const std::string& name, bool is_nullable, ColumnType type, std::vector children) { ParquetField field; field.name = name; field.is_nullable = is_nullable; - field.type.type = type; + field.type = type; field.children = std::move(children); return field; } @@ -148,7 +148,7 @@ class ParquetSchemaTest : public testing::Test { // Is group node ASSERT_EQ(expected[i].name, actual[i].name); ASSERT_EQ(expected[i].is_nullable, actual[i].is_nullable); - ASSERT_EQ(expected[i].type.type, actual[i].type.type); + ASSERT_EQ(expected[i].type, actual[i].type); } else { // is primitive node ASSERT_EQ(expected[i].name, actual[i].name); @@ -324,7 +324,7 @@ TEST_F(ParquetSchemaTest, NestedType) { // Check col2 { auto field = desc.get_stored_column_by_column_name("col2"); - ASSERT_EQ(TYPE_ARRAY, field->type.type); + ASSERT_EQ(ColumnType::ARRAY, field->type); ASSERT_EQ(2, field->max_def_level()); ASSERT_EQ(1, field->max_rep_level()); ASSERT_EQ(0, field->level_info.immediate_repeated_ancestor_def_level); @@ -341,7 +341,7 @@ TEST_F(ParquetSchemaTest, NestedType) { // Check col3 { auto field = desc.get_stored_column_by_column_name("col3"); - ASSERT_EQ(TYPE_STRUCT, field->type.type); + ASSERT_EQ(ColumnType::STRUCT, field->type); ASSERT_EQ(1, field->max_def_level()); ASSERT_EQ(0, field->max_rep_level()); ASSERT_EQ(true, field->is_nullable); @@ -579,7 +579,7 @@ TEST_F(ParquetSchemaTest, SimpleArray) { ASSERT_TRUE(st.ok()); { auto field = desc.get_stored_column_by_column_name("col2"); - ASSERT_EQ(TYPE_ARRAY, field->type.type); + ASSERT_EQ(ColumnType::ARRAY, field->type); ASSERT_EQ(1, field->max_def_level()); ASSERT_EQ(1, field->max_rep_level()); ASSERT_EQ(false, field->is_nullable); @@ -632,7 +632,7 @@ TEST_F(ParquetSchemaTest, TwoLevelArray) { ASSERT_TRUE(st.ok()); { auto field = desc.get_stored_column_by_column_name("col2"); - ASSERT_EQ(TYPE_ARRAY, field->type.type); + ASSERT_EQ(ColumnType::ARRAY, field->type); ASSERT_EQ(2, field->max_def_level()); ASSERT_EQ(1, field->max_rep_level()); ASSERT_EQ(true, field->is_nullable); @@ -701,7 +701,7 @@ TEST_F(ParquetSchemaTest, MapNormal) { ASSERT_TRUE(st.ok()); { auto field = desc.get_stored_column_by_column_name("col2"); - ASSERT_EQ(TYPE_MAP, field->type.type); + ASSERT_EQ(ColumnType::MAP, field->type); ASSERT_EQ(2, field->max_def_level()); ASSERT_EQ(1, field->max_rep_level()); ASSERT_EQ(true, field->is_nullable); @@ -1045,7 +1045,7 @@ TEST_F(ParquetSchemaTest, ParquetMaps) { ConvertedType::type::UTF8)); expected_fields.emplace_back( - GroupNode::make_field("my_map", false, LogicalType::TYPE_MAP, + GroupNode::make_field("my_map", false, ColumnType::MAP, {PrimitiveNode::make_field("key", false, Type::type::BYTE_ARRAY), PrimitiveNode::make_field("value", true, Type::type::BYTE_ARRAY)})); } @@ -1057,9 +1057,8 @@ TEST_F(ParquetSchemaTest, ParquetMaps) { t_schemas.emplace_back(PrimitiveNode::make("key", FieldRepetitionType::type::REQUIRED, Type::type::BYTE_ARRAY, ConvertedType::type::UTF8)); - expected_fields.emplace_back( - GroupNode::make_field("my_set", false, LogicalType::TYPE_ARRAY, - {PrimitiveNode::make_field("key", false, Type::type::BYTE_ARRAY)})); + expected_fields.emplace_back(GroupNode::make_field( + "my_set", false, ColumnType::ARRAY, {PrimitiveNode::make_field("key", false, Type::type::BYTE_ARRAY)})); } // Two column map with non-standard field names. { @@ -1072,7 +1071,7 @@ TEST_F(ParquetSchemaTest, ParquetMaps) { Type::type::BYTE_ARRAY, ConvertedType::type::UTF8)); expected_fields.emplace_back( - GroupNode::make_field("items", false, LogicalType::TYPE_MAP, + GroupNode::make_field("items", false, ColumnType::MAP, {PrimitiveNode::make_field("int_key", false, Type::type::INT32), PrimitiveNode::make_field("str_value", true, Type::type::BYTE_ARRAY)})); } @@ -1104,7 +1103,7 @@ TEST_F(ParquetSchemaTest, ParquetLists) { Type::type::BYTE_ARRAY, ConvertedType::type::UTF8)); expected_fields.emplace_back( - GroupNode::make_field("my_list_1", false, LogicalType::TYPE_ARRAY, + GroupNode::make_field("my_list_1", false, ColumnType::ARRAY, {PrimitiveNode::make_field("string", true, Type::type::BYTE_ARRAY)})); } @@ -1122,7 +1121,7 @@ TEST_F(ParquetSchemaTest, ParquetLists) { Type::type::BYTE_ARRAY, ConvertedType::type::UTF8)); expected_fields.emplace_back( - GroupNode::make_field("my_list_2", true, LogicalType::TYPE_ARRAY, + GroupNode::make_field("my_list_2", true, ColumnType::ARRAY, {PrimitiveNode::make_field("string", false, Type::type::BYTE_ARRAY)})); } @@ -1148,8 +1147,8 @@ TEST_F(ParquetSchemaTest, ParquetLists) { ConvertedType::type::INT_32)); expected_fields.emplace_back(GroupNode::make_field( - "array_of_arrays", true, LogicalType::TYPE_ARRAY, - {GroupNode::make_field("element", false, LogicalType::TYPE_ARRAY, + "array_of_arrays", true, ColumnType::ARRAY, + {GroupNode::make_field("element", false, ColumnType::ARRAY, {PrimitiveNode::make_field("int32", false, Type::type::INT32)})})); } @@ -1167,7 +1166,7 @@ TEST_F(ParquetSchemaTest, ParquetLists) { ConvertedType::type::UTF8)); expected_fields.emplace_back( - GroupNode::make_field("my_list_3", true, LogicalType::TYPE_ARRAY, + GroupNode::make_field("my_list_3", true, ColumnType::ARRAY, {PrimitiveNode::make_field("str", false, Type::type::BYTE_ARRAY)})); } @@ -1182,7 +1181,7 @@ TEST_F(ParquetSchemaTest, ParquetLists) { ConvertedType::type::INT_32)); expected_fields.emplace_back( - GroupNode::make_field("my_list_4", true, LogicalType::TYPE_ARRAY, + GroupNode::make_field("my_list_4", true, ColumnType::ARRAY, {PrimitiveNode::make_field("element", false, Type::type::INT32)})); } @@ -1203,8 +1202,8 @@ TEST_F(ParquetSchemaTest, ParquetLists) { ConvertedType::type::INT_32)); expected_fields.emplace_back(GroupNode::make_field( - "my_list_5", true, LogicalType::TYPE_ARRAY, - {GroupNode::make_field("element", false, LogicalType::TYPE_STRUCT, + "my_list_5", true, ColumnType::ARRAY, + {GroupNode::make_field("element", false, ColumnType::STRUCT, {PrimitiveNode::make_field("str", false, Type::type::BYTE_ARRAY), PrimitiveNode::make_field("num", false, Type::type::INT32)})})); } @@ -1224,8 +1223,8 @@ TEST_F(ParquetSchemaTest, ParquetLists) { ConvertedType::type::UTF8)); expected_fields.emplace_back(GroupNode::make_field( - "my_list_6", true, LogicalType::TYPE_ARRAY, - {GroupNode::make_field("array", false, LogicalType::TYPE_STRUCT, + "my_list_6", true, ColumnType::ARRAY, + {GroupNode::make_field("array", false, ColumnType::STRUCT, {PrimitiveNode::make_field("str", false, Type::type::BYTE_ARRAY)})})); } @@ -1244,8 +1243,8 @@ TEST_F(ParquetSchemaTest, ParquetLists) { ConvertedType::type::UTF8)); expected_fields.emplace_back(GroupNode::make_field( - "my_list_7", true, LogicalType::TYPE_ARRAY, - {GroupNode::make_field("my_list_tuple", false, LogicalType::TYPE_STRUCT, + "my_list_7", true, ColumnType::ARRAY, + {GroupNode::make_field("my_list_tuple", false, ColumnType::STRUCT, {PrimitiveNode::make_field("str", false, Type::type::BYTE_ARRAY)})})); } @@ -1254,7 +1253,7 @@ TEST_F(ParquetSchemaTest, ParquetLists) { { t_schemas.emplace_back(PrimitiveNode::make("name", FieldRepetitionType::REPEATED, Type::type::INT32)); expected_fields.emplace_back(GroupNode::make_field( - "name", false, LogicalType::TYPE_ARRAY, {PrimitiveNode::make_field("name", false, Type::type::INT32)})); + "name", false, ColumnType::ARRAY, {PrimitiveNode::make_field("name", false, Type::type::INT32)})); } SchemaDescriptor desc; @@ -1281,7 +1280,7 @@ TEST_F(ParquetSchemaTest, ParquetNestedSchema) { t_schemas.emplace_back(PrimitiveNode::make("leaf3", FieldRepetitionType::type::REQUIRED, Type::type::INT64)); expected_fields.emplace_back( - GroupNode::make_field("group1", false, LogicalType::TYPE_STRUCT, + GroupNode::make_field("group1", false, ColumnType::STRUCT, {PrimitiveNode::make_field("leaf1", false, Type::type::BOOLEAN), PrimitiveNode::make_field("leaf2", false, Type::type::INT32)})); expected_fields.emplace_back(PrimitiveNode::make_field("leaf3", false, Type::type::INT64)); @@ -1320,11 +1319,11 @@ TEST_F(ParquetSchemaTest, ParquetNestedSchema2) { t_schemas.emplace_back(PrimitiveNode::make("leaf5", FieldRepetitionType::type::REQUIRED, Type::type::INT64)); expected_fields.emplace_back( - GroupNode::make_field("group1", false, LogicalType::TYPE_STRUCT, + GroupNode::make_field("group1", false, ColumnType::STRUCT, {PrimitiveNode::make_field("leaf1", false, Type::type::INT64), PrimitiveNode::make_field("leaf2", false, Type::type::INT64)})); expected_fields.emplace_back( - GroupNode::make_field("group2", false, LogicalType::TYPE_STRUCT, + GroupNode::make_field("group2", false, ColumnType::STRUCT, {PrimitiveNode::make_field("leaf3", false, Type::type::INT64), PrimitiveNode::make_field("leaf4", false, Type::type::INT64)})); expected_fields.emplace_back(PrimitiveNode::make_field("leaf5", false, Type::type::INT64)); @@ -1359,11 +1358,11 @@ TEST_F(ParquetSchemaTest, ParquetRepeatedNestedSchema) { auto leaf2_field = PrimitiveNode::make_field("leaf2", true, Type::type::INT32); auto leaf3_field = PrimitiveNode::make_field("leaf3", true, Type::type::INT32); - auto inner_group_struct = GroupNode::make_field("innerGroup", false, LogicalType::TYPE_STRUCT, {leaf3_field}); - auto inner_group = GroupNode::make_field("innerGroup", false, LogicalType::TYPE_ARRAY, {inner_group_struct}); + auto inner_group_struct = GroupNode::make_field("innerGroup", false, ColumnType::STRUCT, {leaf3_field}); + auto inner_group = GroupNode::make_field("innerGroup", false, ColumnType::ARRAY, {inner_group_struct}); auto outer_group_struct = - GroupNode::make_field("outerGroup", false, LogicalType::TYPE_STRUCT, {leaf2_field, inner_group}); - auto outer_group = GroupNode::make_field("outerGroup", false, LogicalType::TYPE_ARRAY, {outer_group_struct}); + GroupNode::make_field("outerGroup", false, ColumnType::STRUCT, {leaf2_field, inner_group}); + auto outer_group = GroupNode::make_field("outerGroup", false, ColumnType::ARRAY, {outer_group_struct}); expected_fields.emplace_back(outer_group); }