From a43090a7ff0b6a55bea96859d2b3551ce4252e95 Mon Sep 17 00:00:00 2001 From: Smith Cruise Date: Mon, 4 Nov 2024 11:05:30 +0800 Subject: [PATCH 1/4] improve complex type Signed-off-by: Smith Cruise --- be/src/formats/parquet/column_reader.cpp | 27 ++++++++++--------- be/src/formats/parquet/group_reader.cpp | 1 - be/src/formats/parquet/meta_helper.cpp | 21 +++++++-------- be/src/formats/parquet/schema.cpp | 22 +++++----------- be/src/formats/parquet/schema.h | 33 +++++++++++++++++++++++- be/src/util/system_metrics.cpp | 2 +- 6 files changed, 62 insertions(+), 44 deletions(-) diff --git a/be/src/formats/parquet/column_reader.cpp b/be/src/formats/parquet/column_reader.cpp index e823b48dd72ae..a95b57c8c705c 100644 --- a/be/src/formats/parquet/column_reader.cpp +++ b/be/src/formats/parquet/column_reader.cpp @@ -18,7 +18,6 @@ #include #include -#include #include #include #include @@ -30,7 +29,6 @@ #include "common/compiler_util.h" #include "exec/exec_node.h" #include "exec/hdfs_scanner.h" -#include "exprs/expr_context.h" #include "formats/parquet/complex_column_reader.h" #include "formats/parquet/scalar_column_reader.h" #include "formats/utils.h" @@ -137,7 +135,7 @@ Status ColumnDictFilterContext::rewrite_conjunct_ctxs_to_predicate(StoredColumnR void ColumnReader::get_subfield_pos_with_pruned_type(const ParquetField& field, const TypeDescriptor& col_type, bool case_sensitive, std::vector& pos) { - DCHECK(field.type.type == LogicalType::TYPE_STRUCT); + DCHECK(field.type == ColumnType::STRUCT); if (!col_type.field_ids.empty()) { std::unordered_map field_id_2_pos; for (size_t i = 0; i < field.children.size(); i++) { @@ -246,12 +244,12 @@ bool ColumnReader::_has_valid_subfield_column_reader( Status ColumnReader::create(const ColumnReaderOptions& opts, const ParquetField* field, const TypeDescriptor& col_type, std::unique_ptr* output) { // We will only set a complex type in ParquetField - if ((field->type.is_complex_type() || col_type.is_complex_type()) && (field->type.type != col_type.type)) { + if (!field->has_same_complex_type(col_type)) { return Status::InternalError( strings::Substitute("ParquetField '$0' file's type $1 is different from table's type $2", field->name, - logical_type_to_string(field->type.type), logical_type_to_string(col_type.type))); + column_type_to_string(field->type), logical_type_to_string(col_type.type))); } - if (field->type.type == LogicalType::TYPE_ARRAY) { + if (field->type == ColumnType::ARRAY) { std::unique_ptr child_reader; RETURN_IF_ERROR(ColumnReader::create(opts, &field->children[0], col_type.children[0], &child_reader)); if (child_reader != nullptr) { @@ -261,7 +259,7 @@ Status ColumnReader::create(const ColumnReaderOptions& opts, const ParquetField* } else { *output = nullptr; } - } else if (field->type.type == LogicalType::TYPE_MAP) { + } else if (field->type == ColumnType::MAP) { std::unique_ptr key_reader = nullptr; std::unique_ptr value_reader = nullptr; @@ -279,7 +277,7 @@ Status ColumnReader::create(const ColumnReaderOptions& opts, const ParquetField* } else { *output = nullptr; } - } else if (field->type.type == LogicalType::TYPE_STRUCT) { + } else if (field->type == ColumnType::STRUCT) { std::vector subfield_pos(col_type.children.size()); get_subfield_pos_with_pruned_type(*field, col_type, opts.case_sensitive, subfield_pos); @@ -316,12 +314,13 @@ Status ColumnReader::create(const ColumnReaderOptions& opts, const ParquetField* Status ColumnReader::create(const ColumnReaderOptions& opts, const ParquetField* field, const TypeDescriptor& col_type, const TIcebergSchemaField* iceberg_schema_field, std::unique_ptr* output) { // We will only set a complex type in ParquetField - if ((field->type.is_complex_type() || col_type.is_complex_type()) && (field->type.type != col_type.type)) { - return Status::InternalError(strings::Substitute("ParquetField's type $0 is different from table's type $1", - field->type.type, col_type.type)); + if (!field->has_same_complex_type(col_type)) { + return Status::InternalError( + strings::Substitute("ParquetField '$0' file's type $1 is different from table's type $2", field->name, + column_type_to_string(field->type), logical_type_to_string(col_type.type))); } DCHECK(iceberg_schema_field != nullptr); - if (field->type.type == LogicalType::TYPE_ARRAY) { + if (field->type == ColumnType::ARRAY) { std::unique_ptr child_reader; const TIcebergSchemaField* element_schema = &iceberg_schema_field->children[0]; RETURN_IF_ERROR( @@ -333,7 +332,7 @@ Status ColumnReader::create(const ColumnReaderOptions& opts, const ParquetField* } else { *output = nullptr; } - } else if (field->type.type == LogicalType::TYPE_MAP) { + } else if (field->type == ColumnType::MAP) { std::unique_ptr key_reader = nullptr; std::unique_ptr value_reader = nullptr; @@ -356,7 +355,7 @@ Status ColumnReader::create(const ColumnReaderOptions& opts, const ParquetField* } else { *output = nullptr; } - } else if (field->type.type == LogicalType::TYPE_STRUCT) { + } else if (field->type == ColumnType::STRUCT) { std::vector subfield_pos(col_type.children.size()); std::vector iceberg_schema_subfield(col_type.children.size()); get_subfield_pos_with_pruned_type(*field, col_type, opts.case_sensitive, iceberg_schema_field, subfield_pos, diff --git a/be/src/formats/parquet/group_reader.cpp b/be/src/formats/parquet/group_reader.cpp index cb0ef173bbba1..1140e12d7d552 100644 --- a/be/src/formats/parquet/group_reader.cpp +++ b/be/src/formats/parquet/group_reader.cpp @@ -27,7 +27,6 @@ #include "exec/hdfs_scanner.h" #include "exprs/expr.h" #include "exprs/expr_context.h" -#include "exprs/function_context.h" #include "formats/parquet/metadata.h" #include "formats/parquet/page_index_reader.h" #include "formats/parquet/schema.h" diff --git a/be/src/formats/parquet/meta_helper.cpp b/be/src/formats/parquet/meta_helper.cpp index e08a1c242aa12..3aafd57d44cff 100644 --- a/be/src/formats/parquet/meta_helper.cpp +++ b/be/src/formats/parquet/meta_helper.cpp @@ -58,7 +58,7 @@ void ParquetMetaHelper::build_column_name_2_pos_in_meta( // -- ColumnMetaData(path_in_schema=[col_tinyint]) // -- ColumnMetaData(path_in_schema=[col_struct, name]) // -- ColumnMetaData(path_in_schema=[col_struct, age]) - if (field->type.is_complex_type()) continue; + if (field->is_complex_type()) continue; // Put SlotDescriptor's origin column name here! column_name_2_pos_in_meta.emplace(slot->col_name(), field->physical_column_index); } @@ -102,25 +102,25 @@ bool ParquetMetaHelper::_is_valid_type(const ParquetField* parquet_field, const } // only check for complex type now // if complex type has none valid subfield, we will treat this struct type as invalid type. - if (!parquet_field->type.is_complex_type()) { + if (!parquet_field->is_complex_type()) { return true; } - if (parquet_field->type.type != type_descriptor->type) { - // complex type mismatched + // check the complex type is matched + if (!parquet_field->has_same_complex_type(*type_descriptor)) { return false; } bool has_valid_child = false; - if (parquet_field->type.is_array_type() || parquet_field->type.is_map_type()) { + if (parquet_field->type == ColumnType::ARRAY || parquet_field->type == ColumnType::MAP) { for (size_t idx = 0; idx < parquet_field->children.size(); idx++) { if (_is_valid_type(&parquet_field->children[idx], &type_descriptor->children[idx])) { has_valid_child = true; break; } } - } else if (parquet_field->type.is_struct_type()) { + } else if (parquet_field->type == ColumnType::STRUCT) { if (!type_descriptor->field_ids.empty()) { std::unordered_map field_id_2_type; for (size_t idx = 0; idx < type_descriptor->children.size(); idx++) { @@ -192,18 +192,17 @@ bool IcebergMetaHelper::_is_valid_type(const ParquetField* parquet_field, const const TypeDescriptor* type_descriptor) const { // only check for complex type now // if complex type has none valid subfield, we will treat this struct type as invalid type. - if (!parquet_field->type.is_complex_type()) { + if (!parquet_field->is_complex_type()) { return true; } - if (parquet_field->type.type != type_descriptor->type) { - // complex type mismatched + if (!_check_has_same_complex_type(parquet_field->type, type_descriptor->type)) { return false; } bool has_valid_child = false; - if (parquet_field->type.is_array_type() || parquet_field->type.is_map_type()) { + if (parquet_field->type == ColumnType::ARRAY || parquet_field->type == ColumnType::MAP) { for (size_t idx = 0; idx < parquet_field->children.size(); idx++) { if (_is_valid_type(&parquet_field->children[idx], &field_schema->children[idx], &type_descriptor->children[idx])) { @@ -211,7 +210,7 @@ bool IcebergMetaHelper::_is_valid_type(const ParquetField* parquet_field, const break; } } - } else if (parquet_field->type.is_struct_type()) { + } else if (parquet_field->type == ColumnType::STRUCT) { std::unordered_map field_id_2_iceberg_schema{}; std::unordered_map field_id_2_type{}; for (const auto& field : field_schema->children) { diff --git a/be/src/formats/parquet/schema.cpp b/be/src/formats/parquet/schema.cpp index 8311891ed23ab..d5b62b9ccbbd9 100644 --- a/be/src/formats/parquet/schema.cpp +++ b/be/src/formats/parquet/schema.cpp @@ -15,7 +15,6 @@ #include "formats/parquet/schema.h" #include -#include #include #include #include @@ -35,7 +34,7 @@ std::string LevelInfo::debug_string() const { std::string ParquetField::debug_string() const { std::stringstream ss; - ss << "ParquetField(name=" << name << ",type=" << type.type << ",physical_type=" << physical_type + ss << "ParquetField(name=" << name << ",type=" << type << ",physical_type=" << physical_type << ",physical_column_index=" << physical_column_index << ",levels_info=" << level_info.debug_string(); if (children.size() > 0) { ss << ",children=["; @@ -164,8 +163,7 @@ Status SchemaDescriptor::list_to_field(const std::vectorname = group_schema->name; field->field_id = group_schema->field_id; - field->type.type = TYPE_ARRAY; - field->type.children.push_back(field->children[0].type); + field->type = ColumnType::ARRAY; field->is_nullable = is_optional(group_schema); field->level_info = cur_level_info; field->level_info.immediate_repeated_ancestor_def_level = last_immediate_repeated_ancestor_def_level; @@ -237,9 +235,7 @@ Status SchemaDescriptor::map_to_field(const std::vector field->name = group_schema->name; // Actually, we don't need to put field_id here field->field_id = group_schema->field_id; - field->type.type = TYPE_MAP; - field->type.children.emplace_back(key_field->type); - field->type.children.emplace_back(value_field->type); + field->type = ColumnType::MAP; field->is_nullable = is_optional(group_schema); field->level_info = cur_level_info; field->level_info.immediate_repeated_ancestor_def_level = last_immediate_repeated_ancestor_def_level; @@ -262,13 +258,7 @@ Status SchemaDescriptor::group_to_struct_field(const std::vectorname = group_schema->name; field->is_nullable = is_optional(group_schema); field->level_info = cur_level_info; - field->type.type = TYPE_STRUCT; - for (size_t i = 0; i < num_children; i++) { - field->type.children.emplace_back(field->children[i].type); - } - for (size_t i = 0; i < num_children; i++) { - field->type.field_names.emplace_back(field->children[i].name); - } + field->type = ColumnType::STRUCT; field->field_id = group_schema->field_id; return Status::OK(); } @@ -295,7 +285,7 @@ Status SchemaDescriptor::group_to_field(const std::vectorchildren[0], next_pos)); field->name = group_schema->name; - field->type.type = TYPE_ARRAY; + field->type = ColumnType::ARRAY; field->is_nullable = false; field->level_info = cur_level_info; field->level_info.immediate_repeated_ancestor_def_level = last_immediate_repeated_ancestor_def_level; @@ -334,7 +324,7 @@ Status SchemaDescriptor::node_to_field(const std::vectorname = node_schema->name; - field->type.type = TYPE_ARRAY; + field->type = ColumnType::ARRAY; field->is_nullable = false; field->field_id = node_schema->field_id; field->level_info = cur_level_info; diff --git a/be/src/formats/parquet/schema.h b/be/src/formats/parquet/schema.h index 79f93f8075f52..77b53469a6eea 100644 --- a/be/src/formats/parquet/schema.h +++ b/be/src/formats/parquet/schema.h @@ -89,12 +89,29 @@ struct LevelInfo { std::string debug_string() const; }; +enum ColumnType { SCALAR = 0, ARRAY, MAP, STRUCT }; + +static std::string column_type_to_string(const ColumnType& column_type) { + switch (column_type) { + case SCALAR: + return "scalar"; + case ARRAY: + return "array"; + case MAP: + return "map"; + case STRUCT: + return "struct"; + default: + return "unknown"; + } +} + struct ParquetField { std::string name; tparquet::SchemaElement schema_element; // Used to identify if this field is a nested field. - TypeDescriptor type; + ColumnType type; bool is_nullable; // Only valid when this field is a leaf node @@ -118,6 +135,20 @@ struct ParquetField { int16_t max_def_level() const { return level_info.max_def_level; } int16_t max_rep_level() const { return level_info.max_rep_level; } std::string debug_string() const; + bool is_complex_type() const { return type == ARRAY || type == MAP || type == STRUCT; } + bool has_same_complex_type(const TypeDescriptor& type_descriptor) const { + // check the complex type is matched + if (type == ColumnType::ARRAY && type_descriptor.type == LogicalType::TYPE_ARRAY) { + return true; + } else if (type == ColumnType::MAP && type_descriptor.type == LogicalType::TYPE_MAP) { + return true; + } else if (type == ColumnType::STRUCT && type_descriptor.type == LogicalType::TYPE_STRUCT) { + return true; + } else { + // defense code + return false; + } + } }; class SchemaDescriptor { diff --git a/be/src/util/system_metrics.cpp b/be/src/util/system_metrics.cpp index ebbccac8beb39..556699410466a 100644 --- a/be/src/util/system_metrics.cpp +++ b/be/src/util/system_metrics.cpp @@ -288,10 +288,10 @@ void SystemMetrics::_install_memory_metrics(MetricRegistry* registry) { } void SystemMetrics::_update_memory_metrics() { - size_t value = 0; #if defined(ADDRESS_SANITIZER) || defined(LEAK_SANITIZER) || defined(THREAD_SANITIZER) LOG(INFO) << "Memory tracking is not available with address sanitizer builds."; #else + size_t value = 0; // Update the statistics cached by mallctl. uint64_t epoch = 1; size_t sz = sizeof(epoch); From a3b73b8520af6136e02334315db5c55f16fdfbaf Mon Sep 17 00:00:00 2001 From: Smith Cruise Date: Mon, 4 Nov 2024 13:18:51 +0800 Subject: [PATCH 2/4] improve complex type Signed-off-by: Smith Cruise --- be/src/formats/parquet/column_reader.cpp | 4 ++-- be/src/formats/parquet/meta_helper.cpp | 2 +- be/src/formats/parquet/schema.cpp | 3 ++- be/src/formats/parquet/schema.h | 3 +-- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/be/src/formats/parquet/column_reader.cpp b/be/src/formats/parquet/column_reader.cpp index a95b57c8c705c..75e4b9098457c 100644 --- a/be/src/formats/parquet/column_reader.cpp +++ b/be/src/formats/parquet/column_reader.cpp @@ -244,7 +244,7 @@ bool ColumnReader::_has_valid_subfield_column_reader( Status ColumnReader::create(const ColumnReaderOptions& opts, const ParquetField* field, const TypeDescriptor& col_type, std::unique_ptr* output) { // We will only set a complex type in ParquetField - if (!field->has_same_complex_type(col_type)) { + if ((field->is_complex_type() || col_type.is_complex_type()) && !field->has_same_complex_type(col_type)) { return Status::InternalError( strings::Substitute("ParquetField '$0' file's type $1 is different from table's type $2", field->name, column_type_to_string(field->type), logical_type_to_string(col_type.type))); @@ -314,7 +314,7 @@ Status ColumnReader::create(const ColumnReaderOptions& opts, const ParquetField* Status ColumnReader::create(const ColumnReaderOptions& opts, const ParquetField* field, const TypeDescriptor& col_type, const TIcebergSchemaField* iceberg_schema_field, std::unique_ptr* output) { // We will only set a complex type in ParquetField - if (!field->has_same_complex_type(col_type)) { + if ((field->is_complex_type() || col_type.is_complex_type()) && !field->has_same_complex_type(col_type)) { return Status::InternalError( strings::Substitute("ParquetField '$0' file's type $1 is different from table's type $2", field->name, column_type_to_string(field->type), logical_type_to_string(col_type.type))); diff --git a/be/src/formats/parquet/meta_helper.cpp b/be/src/formats/parquet/meta_helper.cpp index 3aafd57d44cff..c3c8f546695e9 100644 --- a/be/src/formats/parquet/meta_helper.cpp +++ b/be/src/formats/parquet/meta_helper.cpp @@ -196,7 +196,7 @@ bool IcebergMetaHelper::_is_valid_type(const ParquetField* parquet_field, const return true; } - if (!_check_has_same_complex_type(parquet_field->type, type_descriptor->type)) { + if (!parquet_field->has_same_complex_type(*type_descriptor)) { return false; } diff --git a/be/src/formats/parquet/schema.cpp b/be/src/formats/parquet/schema.cpp index d5b62b9ccbbd9..74eeb429bf085 100644 --- a/be/src/formats/parquet/schema.cpp +++ b/be/src/formats/parquet/schema.cpp @@ -34,7 +34,7 @@ std::string LevelInfo::debug_string() const { std::string ParquetField::debug_string() const { std::stringstream ss; - ss << "ParquetField(name=" << name << ",type=" << type << ",physical_type=" << physical_type + ss << "ParquetField(name=" << name << ",type=" << column_type_to_string(type) << ",physical_type=" << physical_type << ",physical_column_index=" << physical_column_index << ",levels_info=" << level_info.debug_string(); if (children.size() > 0) { ss << ",children=["; @@ -79,6 +79,7 @@ Status SchemaDescriptor::leaf_to_field(const tparquet::SchemaElement* t_schema, bool is_nullable, ParquetField* field) { field->name = t_schema->name; field->schema_element = *t_schema; + field->type = ColumnType::SCALAR; field->is_nullable = is_nullable; field->physical_type = t_schema->type; field->type_length = t_schema->type_length; diff --git a/be/src/formats/parquet/schema.h b/be/src/formats/parquet/schema.h index 77b53469a6eea..9571575dc7d56 100644 --- a/be/src/formats/parquet/schema.h +++ b/be/src/formats/parquet/schema.h @@ -91,7 +91,7 @@ struct LevelInfo { enum ColumnType { SCALAR = 0, ARRAY, MAP, STRUCT }; -static std::string column_type_to_string(const ColumnType& column_type) { +inline std::string column_type_to_string(const ColumnType& column_type) { switch (column_type) { case SCALAR: return "scalar"; @@ -145,7 +145,6 @@ struct ParquetField { } else if (type == ColumnType::STRUCT && type_descriptor.type == LogicalType::TYPE_STRUCT) { return true; } else { - // defense code return false; } } From 8be96cce210740439a1ba27210e15069fab17875 Mon Sep 17 00:00:00 2001 From: Smith Cruise Date: Mon, 4 Nov 2024 13:43:43 +0800 Subject: [PATCH 3/4] fix ut Signed-off-by: Smith Cruise --- be/test/formats/parquet/group_reader_test.cpp | 2 +- be/test/formats/parquet/parquet_cli_reader.h | 6 +- .../formats/parquet/parquet_schema_test.cpp | 65 +++++++++---------- 3 files changed, 36 insertions(+), 37 deletions(-) diff --git a/be/test/formats/parquet/group_reader_test.cpp b/be/test/formats/parquet/group_reader_test.cpp index 16c87ee6f18c9..6bacc5450900d 100644 --- a/be/test/formats/parquet/group_reader_test.cpp +++ b/be/test/formats/parquet/group_reader_test.cpp @@ -442,7 +442,7 @@ TEST_F(GroupReaderTest, TestGetNext) { TEST_F(GroupReaderTest, ColumnReaderCreateTypeMismatch) { ParquetField field; field.name = "col0"; - field.type.type = LogicalType::TYPE_ARRAY; + field.type = ColumnType::ARRAY; TypeDescriptor col_type; col_type.type = LogicalType::TYPE_VARCHAR; diff --git a/be/test/formats/parquet/parquet_cli_reader.h b/be/test/formats/parquet/parquet_cli_reader.h index be3a5ad51fad4..c744befb0a5f4 100644 --- a/be/test/formats/parquet/parquet_cli_reader.h +++ b/be/test/formats/parquet/parquet_cli_reader.h @@ -113,20 +113,20 @@ class ParquetCLIReader { StatusOr _build_type(const ParquetField& field) { TypeDescriptor type; - if (field.type.type == TYPE_STRUCT) { + if (field.type == ColumnType::STRUCT) { type.type = TYPE_STRUCT; for (const auto& i : field.children) { ASSIGN_OR_RETURN(auto child_type, _build_type(i)); type.children.emplace_back(child_type); type.field_names.emplace_back(i.name); } - } else if (field.type.type == TYPE_MAP) { + } else if (field.type == ColumnType::MAP) { type.type = TYPE_MAP; for (const auto& i : field.children) { ASSIGN_OR_RETURN(auto child_type, _build_type(i)); type.children.emplace_back(child_type); } - } else if (field.type.type == TYPE_ARRAY) { + } else if (field.type == ColumnType::ARRAY) { type.type = TYPE_ARRAY; ASSIGN_OR_RETURN(auto child_type, _build_type(field.children[0])); type.children.emplace_back(child_type); diff --git a/be/test/formats/parquet/parquet_schema_test.cpp b/be/test/formats/parquet/parquet_schema_test.cpp index 1c751680d33e0..529d3ebfafe4e 100644 --- a/be/test/formats/parquet/parquet_schema_test.cpp +++ b/be/test/formats/parquet/parquet_schema_test.cpp @@ -75,12 +75,12 @@ class GroupNode { element.__set_num_children(num_children); return element; } - static ParquetField make_field(const std::string& name, bool is_nullable, LogicalType type, + static ParquetField make_field(const std::string& name, bool is_nullable, ColumnType type, std::vector children) { ParquetField field; field.name = name; field.is_nullable = is_nullable; - field.type.type = type; + field.type = type; field.children = std::move(children); return field; } @@ -148,7 +148,7 @@ class ParquetSchemaTest : public testing::Test { // Is group node ASSERT_EQ(expected[i].name, actual[i].name); ASSERT_EQ(expected[i].is_nullable, actual[i].is_nullable); - ASSERT_EQ(expected[i].type.type, actual[i].type.type); + ASSERT_EQ(expected[i].type, actual[i].type); } else { // is primitive node ASSERT_EQ(expected[i].name, actual[i].name); @@ -324,7 +324,7 @@ TEST_F(ParquetSchemaTest, NestedType) { // Check col2 { auto field = desc.get_stored_column_by_column_name("col2"); - ASSERT_EQ(TYPE_ARRAY, field->type.type); + ASSERT_EQ(ColumnType::ARRAY, field->type); ASSERT_EQ(2, field->max_def_level()); ASSERT_EQ(1, field->max_rep_level()); ASSERT_EQ(0, field->level_info.immediate_repeated_ancestor_def_level); @@ -341,7 +341,7 @@ TEST_F(ParquetSchemaTest, NestedType) { // Check col3 { auto field = desc.get_stored_column_by_column_name("col3"); - ASSERT_EQ(TYPE_STRUCT, field->type.type); + ASSERT_EQ(ColumnType::STRUCT, field->type); ASSERT_EQ(1, field->max_def_level()); ASSERT_EQ(0, field->max_rep_level()); ASSERT_EQ(true, field->is_nullable); @@ -579,7 +579,7 @@ TEST_F(ParquetSchemaTest, SimpleArray) { ASSERT_TRUE(st.ok()); { auto field = desc.get_stored_column_by_column_name("col2"); - ASSERT_EQ(TYPE_ARRAY, field->type.type); + ASSERT_EQ(ColumnType::ARRAY, field->type); ASSERT_EQ(1, field->max_def_level()); ASSERT_EQ(1, field->max_rep_level()); ASSERT_EQ(false, field->is_nullable); @@ -632,7 +632,7 @@ TEST_F(ParquetSchemaTest, TwoLevelArray) { ASSERT_TRUE(st.ok()); { auto field = desc.get_stored_column_by_column_name("col2"); - ASSERT_EQ(TYPE_ARRAY, field->type.type); + ASSERT_EQ(ColumnType::ARRAY, field->type); ASSERT_EQ(2, field->max_def_level()); ASSERT_EQ(1, field->max_rep_level()); ASSERT_EQ(true, field->is_nullable); @@ -701,7 +701,7 @@ TEST_F(ParquetSchemaTest, MapNormal) { ASSERT_TRUE(st.ok()); { auto field = desc.get_stored_column_by_column_name("col2"); - ASSERT_EQ(TYPE_MAP, field->type.type); + ASSERT_EQ(ColumnType::MAP, field->type); ASSERT_EQ(2, field->max_def_level()); ASSERT_EQ(1, field->max_rep_level()); ASSERT_EQ(true, field->is_nullable); @@ -1045,7 +1045,7 @@ TEST_F(ParquetSchemaTest, ParquetMaps) { ConvertedType::type::UTF8)); expected_fields.emplace_back( - GroupNode::make_field("my_map", false, LogicalType::TYPE_MAP, + GroupNode::make_field("my_map", false, ColumnType::MAP, {PrimitiveNode::make_field("key", false, Type::type::BYTE_ARRAY), PrimitiveNode::make_field("value", true, Type::type::BYTE_ARRAY)})); } @@ -1057,9 +1057,8 @@ TEST_F(ParquetSchemaTest, ParquetMaps) { t_schemas.emplace_back(PrimitiveNode::make("key", FieldRepetitionType::type::REQUIRED, Type::type::BYTE_ARRAY, ConvertedType::type::UTF8)); - expected_fields.emplace_back( - GroupNode::make_field("my_set", false, LogicalType::TYPE_ARRAY, - {PrimitiveNode::make_field("key", false, Type::type::BYTE_ARRAY)})); + expected_fields.emplace_back(GroupNode::make_field( + "my_set", false, ColumnType::ARRAY, {PrimitiveNode::make_field("key", false, Type::type::BYTE_ARRAY)})); } // Two column map with non-standard field names. { @@ -1072,7 +1071,7 @@ TEST_F(ParquetSchemaTest, ParquetMaps) { Type::type::BYTE_ARRAY, ConvertedType::type::UTF8)); expected_fields.emplace_back( - GroupNode::make_field("items", false, LogicalType::TYPE_MAP, + GroupNode::make_field("items", false, ColumnType::MAP, {PrimitiveNode::make_field("int_key", false, Type::type::INT32), PrimitiveNode::make_field("str_value", true, Type::type::BYTE_ARRAY)})); } @@ -1104,7 +1103,7 @@ TEST_F(ParquetSchemaTest, ParquetLists) { Type::type::BYTE_ARRAY, ConvertedType::type::UTF8)); expected_fields.emplace_back( - GroupNode::make_field("my_list_1", false, LogicalType::TYPE_ARRAY, + GroupNode::make_field("my_list_1", false, ColumnType::ARRAY, {PrimitiveNode::make_field("string", true, Type::type::BYTE_ARRAY)})); } @@ -1122,7 +1121,7 @@ TEST_F(ParquetSchemaTest, ParquetLists) { Type::type::BYTE_ARRAY, ConvertedType::type::UTF8)); expected_fields.emplace_back( - GroupNode::make_field("my_list_2", true, LogicalType::TYPE_ARRAY, + GroupNode::make_field("my_list_2", true, ColumnType::ARRAY, {PrimitiveNode::make_field("string", false, Type::type::BYTE_ARRAY)})); } @@ -1148,8 +1147,8 @@ TEST_F(ParquetSchemaTest, ParquetLists) { ConvertedType::type::INT_32)); expected_fields.emplace_back(GroupNode::make_field( - "array_of_arrays", true, LogicalType::TYPE_ARRAY, - {GroupNode::make_field("element", false, LogicalType::TYPE_ARRAY, + "array_of_arrays", true, ColumnType::ARRAY, + {GroupNode::make_field("element", false, ColumnType::ARRAY, {PrimitiveNode::make_field("int32", false, Type::type::INT32)})})); } @@ -1167,7 +1166,7 @@ TEST_F(ParquetSchemaTest, ParquetLists) { ConvertedType::type::UTF8)); expected_fields.emplace_back( - GroupNode::make_field("my_list_3", true, LogicalType::TYPE_ARRAY, + GroupNode::make_field("my_list_3", true, ColumnType::ARRAY, {PrimitiveNode::make_field("str", false, Type::type::BYTE_ARRAY)})); } @@ -1182,7 +1181,7 @@ TEST_F(ParquetSchemaTest, ParquetLists) { ConvertedType::type::INT_32)); expected_fields.emplace_back( - GroupNode::make_field("my_list_4", true, LogicalType::TYPE_ARRAY, + GroupNode::make_field("my_list_4", true, ColumnType::ARRAY, {PrimitiveNode::make_field("element", false, Type::type::INT32)})); } @@ -1203,8 +1202,8 @@ TEST_F(ParquetSchemaTest, ParquetLists) { ConvertedType::type::INT_32)); expected_fields.emplace_back(GroupNode::make_field( - "my_list_5", true, LogicalType::TYPE_ARRAY, - {GroupNode::make_field("element", false, LogicalType::TYPE_STRUCT, + "my_list_5", true, ColumnType::ARRAY, + {GroupNode::make_field("element", false, ColumnType::STRUCT, {PrimitiveNode::make_field("str", false, Type::type::BYTE_ARRAY), PrimitiveNode::make_field("num", false, Type::type::INT32)})})); } @@ -1224,8 +1223,8 @@ TEST_F(ParquetSchemaTest, ParquetLists) { ConvertedType::type::UTF8)); expected_fields.emplace_back(GroupNode::make_field( - "my_list_6", true, LogicalType::TYPE_ARRAY, - {GroupNode::make_field("array", false, LogicalType::TYPE_STRUCT, + "my_list_6", true, ColumnType::ARRAY, + {GroupNode::make_field("array", false, ColumnType::STRUCT, {PrimitiveNode::make_field("str", false, Type::type::BYTE_ARRAY)})})); } @@ -1244,8 +1243,8 @@ TEST_F(ParquetSchemaTest, ParquetLists) { ConvertedType::type::UTF8)); expected_fields.emplace_back(GroupNode::make_field( - "my_list_7", true, LogicalType::TYPE_ARRAY, - {GroupNode::make_field("my_list_tuple", false, LogicalType::TYPE_STRUCT, + "my_list_7", true, ColumnType::ARRAY, + {GroupNode::make_field("my_list_tuple", false, ColumnType::STRUCT, {PrimitiveNode::make_field("str", false, Type::type::BYTE_ARRAY)})})); } @@ -1254,7 +1253,7 @@ TEST_F(ParquetSchemaTest, ParquetLists) { { t_schemas.emplace_back(PrimitiveNode::make("name", FieldRepetitionType::REPEATED, Type::type::INT32)); expected_fields.emplace_back(GroupNode::make_field( - "name", false, LogicalType::TYPE_ARRAY, {PrimitiveNode::make_field("name", false, Type::type::INT32)})); + "name", false, ColumnType::ARRAY, {PrimitiveNode::make_field("name", false, Type::type::INT32)})); } SchemaDescriptor desc; @@ -1281,7 +1280,7 @@ TEST_F(ParquetSchemaTest, ParquetNestedSchema) { t_schemas.emplace_back(PrimitiveNode::make("leaf3", FieldRepetitionType::type::REQUIRED, Type::type::INT64)); expected_fields.emplace_back( - GroupNode::make_field("group1", false, LogicalType::TYPE_STRUCT, + GroupNode::make_field("group1", false, ColumnType::STRUCT, {PrimitiveNode::make_field("leaf1", false, Type::type::BOOLEAN), PrimitiveNode::make_field("leaf2", false, Type::type::INT32)})); expected_fields.emplace_back(PrimitiveNode::make_field("leaf3", false, Type::type::INT64)); @@ -1320,11 +1319,11 @@ TEST_F(ParquetSchemaTest, ParquetNestedSchema2) { t_schemas.emplace_back(PrimitiveNode::make("leaf5", FieldRepetitionType::type::REQUIRED, Type::type::INT64)); expected_fields.emplace_back( - GroupNode::make_field("group1", false, LogicalType::TYPE_STRUCT, + GroupNode::make_field("group1", false, ColumnType::STRUCT, {PrimitiveNode::make_field("leaf1", false, Type::type::INT64), PrimitiveNode::make_field("leaf2", false, Type::type::INT64)})); expected_fields.emplace_back( - GroupNode::make_field("group2", false, LogicalType::TYPE_STRUCT, + GroupNode::make_field("group2", false, ColumnType::STRUCT, {PrimitiveNode::make_field("leaf3", false, Type::type::INT64), PrimitiveNode::make_field("leaf4", false, Type::type::INT64)})); expected_fields.emplace_back(PrimitiveNode::make_field("leaf5", false, Type::type::INT64)); @@ -1359,11 +1358,11 @@ TEST_F(ParquetSchemaTest, ParquetRepeatedNestedSchema) { auto leaf2_field = PrimitiveNode::make_field("leaf2", true, Type::type::INT32); auto leaf3_field = PrimitiveNode::make_field("leaf3", true, Type::type::INT32); - auto inner_group_struct = GroupNode::make_field("innerGroup", false, LogicalType::TYPE_STRUCT, {leaf3_field}); - auto inner_group = GroupNode::make_field("innerGroup", false, LogicalType::TYPE_ARRAY, {inner_group_struct}); + auto inner_group_struct = GroupNode::make_field("innerGroup", false, ColumnType::STRUCT, {leaf3_field}); + auto inner_group = GroupNode::make_field("innerGroup", false, ColumnType::ARRAY, {inner_group_struct}); auto outer_group_struct = - GroupNode::make_field("outerGroup", false, LogicalType::TYPE_STRUCT, {leaf2_field, inner_group}); - auto outer_group = GroupNode::make_field("outerGroup", false, LogicalType::TYPE_ARRAY, {outer_group_struct}); + GroupNode::make_field("outerGroup", false, ColumnType::STRUCT, {leaf2_field, inner_group}); + auto outer_group = GroupNode::make_field("outerGroup", false, ColumnType::ARRAY, {outer_group_struct}); expected_fields.emplace_back(outer_group); } From 63bf0bcb894715904da1aa4c11227e7c9abeeaea Mon Sep 17 00:00:00 2001 From: Smith Cruise Date: Mon, 4 Nov 2024 23:40:58 +0800 Subject: [PATCH 4/4] improve Signed-off-by: Smith Cruise --- be/src/formats/parquet/schema.cpp | 32 +++++++++++++++++++++++++++++++ be/src/formats/parquet/schema.h | 30 +++-------------------------- 2 files changed, 35 insertions(+), 27 deletions(-) diff --git a/be/src/formats/parquet/schema.cpp b/be/src/formats/parquet/schema.cpp index 74eeb429bf085..80c7a80450dc7 100644 --- a/be/src/formats/parquet/schema.cpp +++ b/be/src/formats/parquet/schema.cpp @@ -25,6 +25,21 @@ namespace starrocks::parquet { +std::string column_type_to_string(const ColumnType& column_type) { + switch (column_type) { + case SCALAR: + return "scalar"; + case ARRAY: + return "array"; + case MAP: + return "map"; + case STRUCT: + return "struct"; + default: + return "unknown"; + } +} + std::string LevelInfo::debug_string() const { std::stringstream ss; ss << "LevelInfo(max_def_level=" << max_def_level << ",max_rep_level=" << max_rep_level @@ -50,6 +65,23 @@ std::string ParquetField::debug_string() const { return ss.str(); } +bool ParquetField::is_complex_type() const { + return type == ARRAY || type == MAP || type == STRUCT; +} + +bool ParquetField::has_same_complex_type(const TypeDescriptor& type_descriptor) const { + // check the complex type is matched + if (type == ColumnType::ARRAY && type_descriptor.type == LogicalType::TYPE_ARRAY) { + return true; + } else if (type == ColumnType::MAP && type_descriptor.type == LogicalType::TYPE_MAP) { + return true; + } else if (type == ColumnType::STRUCT && type_descriptor.type == LogicalType::TYPE_STRUCT) { + return true; + } else { + return false; + } +} + static bool is_group(const tparquet::SchemaElement* schema) { return schema->num_children > 0; } diff --git a/be/src/formats/parquet/schema.h b/be/src/formats/parquet/schema.h index 9571575dc7d56..faa9ea07e30b6 100644 --- a/be/src/formats/parquet/schema.h +++ b/be/src/formats/parquet/schema.h @@ -91,20 +91,7 @@ struct LevelInfo { enum ColumnType { SCALAR = 0, ARRAY, MAP, STRUCT }; -inline std::string column_type_to_string(const ColumnType& column_type) { - switch (column_type) { - case SCALAR: - return "scalar"; - case ARRAY: - return "array"; - case MAP: - return "map"; - case STRUCT: - return "struct"; - default: - return "unknown"; - } -} +std::string column_type_to_string(const ColumnType& column_type); struct ParquetField { std::string name; @@ -135,19 +122,8 @@ struct ParquetField { int16_t max_def_level() const { return level_info.max_def_level; } int16_t max_rep_level() const { return level_info.max_rep_level; } std::string debug_string() const; - bool is_complex_type() const { return type == ARRAY || type == MAP || type == STRUCT; } - bool has_same_complex_type(const TypeDescriptor& type_descriptor) const { - // check the complex type is matched - if (type == ColumnType::ARRAY && type_descriptor.type == LogicalType::TYPE_ARRAY) { - return true; - } else if (type == ColumnType::MAP && type_descriptor.type == LogicalType::TYPE_MAP) { - return true; - } else if (type == ColumnType::STRUCT && type_descriptor.type == LogicalType::TYPE_STRUCT) { - return true; - } else { - return false; - } - } + bool is_complex_type() const; + bool has_same_complex_type(const TypeDescriptor& type_descriptor) const; }; class SchemaDescriptor {