diff --git a/be/src/formats/parquet/file_reader.cpp b/be/src/formats/parquet/file_reader.cpp index 2d45e0c6dcb5f..f618e6b93b186 100644 --- a/be/src/formats/parquet/file_reader.cpp +++ b/be/src/formats/parquet/file_reader.cpp @@ -311,23 +311,35 @@ bool FileReader::_filter_group_with_more_filter(const GroupReaderPtr& group_read return true; } } else if (filter_type == StatisticsHelper::StatSupportedFilter::FILTER_IN) { + if (!column_meta->statistics.__isset.null_count) continue; + std::vector min_values; std::vector max_values; std::vector null_counts; + std::vector null_pages; + int64_t num_rows = group_reader->get_row_group_metadata()->num_rows; const ParquetField* field = group_reader->get_column_parquet_field(slot->id()); if (field == nullptr) { LOG(WARNING) << "Can't get " + slot->col_name() + "'s ParquetField in _read_min_max_chunk."; continue; } - auto st = StatisticsHelper::get_min_max_value(_file_metadata.get(), slot->type(), column_meta, - field, min_values, max_values); - if (!st.ok()) continue; - st = StatisticsHelper::get_null_counts(column_meta, null_counts); - if (!st.ok()) continue; + Status st; + + null_counts.emplace_back(column_meta->statistics.null_count); + null_pages.emplace_back(num_rows == column_meta->statistics.null_count); + if (num_rows == column_meta->statistics.null_count) { + min_values.emplace_back(""); + max_values.emplace_back(""); + } else { + st = StatisticsHelper::get_min_max_value(_file_metadata.get(), slot->type(), column_meta, field, + min_values, max_values); + if (!st.ok()) continue; + } + Filter selected(min_values.size(), 1); - st = StatisticsHelper::in_filter_on_min_max_stat(min_values, max_values, null_counts, ctx, field, - _scanner_ctx->timezone, selected); + st = StatisticsHelper::in_filter_on_min_max_stat(min_values, max_values, null_pages, null_counts, + ctx, field, _scanner_ctx->timezone, selected); if (!st.ok()) continue; if (!selected[0]) { return true; @@ -403,11 +415,6 @@ Status FileReader::_read_has_nulls(const GroupReaderPtr& group_reader, const std // statistics not exist in parquet file return Status::Aborted("No exist statistics"); } else { - const ParquetField* field = group_reader->get_column_parquet_field(slot->id()); - if (field == nullptr) { - LOG(WARNING) << "Can't get " + slot->col_name() + "'s ParquetField in _read_has_nulls."; - return Status::InternalError(strings::Substitute("Can't get $0 field", slot->col_name())); - } RETURN_IF_ERROR(StatisticsHelper::get_has_nulls(column_meta, *has_nulls)); } } @@ -448,8 +455,18 @@ Status FileReader::_read_min_max_chunk(const GroupReaderPtr& group_reader, const // statistics not exist in parquet file return Status::Aborted("No exist statistics"); } else { + size_t num_rows = group_reader->get_row_group_metadata()->num_rows; std::vector min_values; std::vector max_values; + std::vector null_pages; + + // If all values of one group is null, the statistics is like this: + // max=, min=, null_count=3, distinct_count=, max_value=, min_value= + if (column_meta->statistics.__isset.null_count && column_meta->statistics.null_count == num_rows) { + (*min_chunk)->columns()[i]->append_nulls(1); + (*max_chunk)->columns()[i]->append_nulls(1); + continue; + } const ParquetField* field = group_reader->get_column_parquet_field(slot->id()); if (field == nullptr) { @@ -459,10 +476,11 @@ Status FileReader::_read_min_max_chunk(const GroupReaderPtr& group_reader, const RETURN_IF_ERROR(StatisticsHelper::get_min_max_value(_file_metadata.get(), slot->type(), column_meta, field, min_values, max_values)); + null_pages.emplace_back(false); RETURN_IF_ERROR(StatisticsHelper::decode_value_into_column((*min_chunk)->columns()[i], min_values, - slot->type(), field, ctx.timezone)); + null_pages, slot->type(), field, ctx.timezone)); RETURN_IF_ERROR(StatisticsHelper::decode_value_into_column((*max_chunk)->columns()[i], max_values, - slot->type(), field, ctx.timezone)); + null_pages, slot->type(), field, ctx.timezone)); } } diff --git a/be/src/formats/parquet/page_index_reader.cpp b/be/src/formats/parquet/page_index_reader.cpp index 705aea0c3c594..3e556d7a1100e 100644 --- a/be/src/formats/parquet/page_index_reader.cpp +++ b/be/src/formats/parquet/page_index_reader.cpp @@ -97,8 +97,8 @@ Status PageIndexReader::_deal_with_min_max_conjuncts(const std::vectorappend_column(max_column, id); // deal with min_values - auto st = StatisticsHelper::decode_value_into_column(min_column, column_index.min_values, type, - _column_readers.at(id)->get_column_parquet_field(), + auto st = StatisticsHelper::decode_value_into_column(min_column, column_index.min_values, column_index.null_pages, + type, _column_readers.at(id)->get_column_parquet_field(), _group_reader->_param.timezone); if (!st.ok()) { // swallow error status @@ -107,7 +107,7 @@ Status PageIndexReader::_deal_with_min_max_conjuncts(const std::vectorget_column_parquet_field(), _group_reader->_param.timezone); if (!st.ok()) { @@ -178,13 +178,17 @@ Status PageIndexReader::_deal_with_more_conjunct(const std::vector } } } else if (filter_type == StatisticsHelper::StatSupportedFilter::FILTER_IN) { - RETURN_IF_ERROR(StatisticsHelper::in_filter_on_min_max_stat( - column_index.min_values, column_index.max_values, column_index.null_counts, ctx, field, - timezone, page_filter)); + if (column_index.__isset.null_counts) { + RETURN_IF_ERROR(StatisticsHelper::in_filter_on_min_max_stat( + column_index.min_values, column_index.max_values, column_index.null_pages, + column_index.null_counts, ctx, field, timezone, page_filter)); + } } else if (filter_type == StatisticsHelper::StatSupportedFilter::RF_MIN_MAX) { - RETURN_IF_ERROR(StatisticsHelper::min_max_filter_on_min_max_stat( - column_index.min_values, column_index.max_values, column_index.null_counts, ctx, field, - timezone, page_filter)); + if (column_index.__isset.null_counts) { + RETURN_IF_ERROR(StatisticsHelper::min_max_filter_on_min_max_stat( + column_index.min_values, column_index.max_values, column_index.null_pages, + column_index.null_counts, ctx, field, timezone, page_filter)); + } } } } diff --git a/be/src/formats/parquet/scalar_column_reader.cpp b/be/src/formats/parquet/scalar_column_reader.cpp index e41a88bb670a3..47fb03677ed5b 100644 --- a/be/src/formats/parquet/scalar_column_reader.cpp +++ b/be/src/formats/parquet/scalar_column_reader.cpp @@ -292,6 +292,8 @@ StatusOr ScalarColumnReader::row_group_zone_map_filter(const std::vectormeta_data.statistics.__isset.null_count) { has_null = get_chunk_metadata()->meta_data.statistics.null_count > 0; is_all_null = get_chunk_metadata()->meta_data.statistics.null_count == rg_num_rows; + } else { + return true; } std::optional zone_map_detail = std::nullopt; @@ -306,13 +308,14 @@ StatusOr ScalarColumnReader::row_group_zone_map_filter(const std::vector min_values; std::vector max_values; + std::vector null_pages{false}; Status st = StatisticsHelper::get_min_max_value(_opts.file_meta_data, *_col_type, &get_chunk_metadata()->meta_data, get_column_parquet_field(), min_values, max_values); if (st.ok()) { - RETURN_IF_ERROR(StatisticsHelper::decode_value_into_column(min_column, min_values, *_col_type, + RETURN_IF_ERROR(StatisticsHelper::decode_value_into_column(min_column, min_values, null_pages, *_col_type, get_column_parquet_field(), _opts.timezone)); - RETURN_IF_ERROR(StatisticsHelper::decode_value_into_column(max_column, max_values, *_col_type, + RETURN_IF_ERROR(StatisticsHelper::decode_value_into_column(max_column, max_values, null_pages, *_col_type, get_column_parquet_field(), _opts.timezone)); zone_map_detail = ZoneMapDetail{min_column->get(0), max_column->get(0), has_null}; @@ -355,11 +358,12 @@ StatusOr ScalarColumnReader::page_index_zone_map_filter(const std::vector< ASSIGN_OR_RETURN(const tparquet::OffsetIndex* offset_index, get_offset_index(rg_first_row)); const size_t page_num = column_index.min_values.size(); + const std::vector null_pages = column_index.null_pages; ColumnPtr min_column = ColumnHelper::create_column(*_col_type, true); ColumnPtr max_column = ColumnHelper::create_column(*_col_type, true); // deal with min_values - auto st = StatisticsHelper::decode_value_into_column(min_column, column_index.min_values, *_col_type, + auto st = StatisticsHelper::decode_value_into_column(min_column, column_index.min_values, null_pages, *_col_type, get_column_parquet_field(), _opts.timezone); if (!st.ok()) { // swallow error status @@ -367,7 +371,7 @@ StatusOr ScalarColumnReader::page_index_zone_map_filter(const std::vector< return false; } // deal with max_values - st = StatisticsHelper::decode_value_into_column(max_column, column_index.max_values, *_col_type, + st = StatisticsHelper::decode_value_into_column(max_column, column_index.max_values, null_pages, *_col_type, get_column_parquet_field(), _opts.timezone); if (!st.ok()) { // swallow error status @@ -379,7 +383,6 @@ StatusOr ScalarColumnReader::page_index_zone_map_filter(const std::vector< DCHECK_EQ(page_num, max_column->size()); // fill ZoneMapDetail - const std::vector null_pages = column_index.null_pages; std::vector zone_map_details{}; for (size_t i = 0; i < page_num; i++) { if (null_pages[i]) { diff --git a/be/src/formats/parquet/statistics_helper.cpp b/be/src/formats/parquet/statistics_helper.cpp index 81d37c884a306..9831c86293aa4 100644 --- a/be/src/formats/parquet/statistics_helper.cpp +++ b/be/src/formats/parquet/statistics_helper.cpp @@ -39,8 +39,8 @@ namespace starrocks::parquet { Status StatisticsHelper::decode_value_into_column(const ColumnPtr& column, const std::vector& values, - const TypeDescriptor& type, const ParquetField* field, - const std::string& timezone) { + const std::vector& null_pages, const TypeDescriptor& type, + const ParquetField* field, const std::string& timezone) { std::unique_ptr converter; RETURN_IF_ERROR(ColumnConverterFactory::create_converter(*field, type, timezone, &converter)); bool ret = true; @@ -49,14 +49,22 @@ Status StatisticsHelper::decode_value_into_column(const ColumnPtr& column, const int32_t decode_value = 0; if (!converter->need_convert) { for (size_t i = 0; i < values.size(); i++) { - RETURN_IF_ERROR(PlainDecoder::decode(values[i], &decode_value)); - ret &= (column->append_numbers(&decode_value, sizeof(int32_t)) > 0); + if (null_pages[i]) { + ret &= column->append_nulls(1); + } else { + RETURN_IF_ERROR(PlainDecoder::decode(values[i], &decode_value)); + ret &= (column->append_numbers(&decode_value, sizeof(int32_t)) > 0); + } } } else { ColumnPtr src_column = converter->create_src_column(); for (size_t i = 0; i < values.size(); i++) { - RETURN_IF_ERROR(PlainDecoder::decode(values[i], &decode_value)); - ret &= (src_column->append_numbers(&decode_value, sizeof(int32_t)) > 0); + if (null_pages[i]) { + ret &= src_column->append_nulls(1); + } else { + RETURN_IF_ERROR(PlainDecoder::decode(values[i], &decode_value)); + ret &= (src_column->append_numbers(&decode_value, sizeof(int32_t)) > 0); + } } RETURN_IF_ERROR(converter->convert(src_column, column.get())); } @@ -66,14 +74,22 @@ Status StatisticsHelper::decode_value_into_column(const ColumnPtr& column, const int64_t decode_value = 0; if (!converter->need_convert) { for (size_t i = 0; i < values.size(); i++) { - RETURN_IF_ERROR(PlainDecoder::decode(values[i], &decode_value)); - ret &= (column->append_numbers(&decode_value, sizeof(int64_t)) > 0); + if (null_pages[i]) { + ret &= column->append_nulls(1); + } else { + RETURN_IF_ERROR(PlainDecoder::decode(values[i], &decode_value)); + ret &= (column->append_numbers(&decode_value, sizeof(int64_t)) > 0); + } } } else { ColumnPtr src_column = converter->create_src_column(); for (size_t i = 0; i < values.size(); i++) { - RETURN_IF_ERROR(PlainDecoder::decode(values[i], &decode_value)); - ret &= (src_column->append_numbers(&decode_value, sizeof(int64_t)) > 0); + if (null_pages[i]) { + ret &= src_column->append_nulls(1); + } else { + RETURN_IF_ERROR(PlainDecoder::decode(values[i], &decode_value)); + ret &= (src_column->append_numbers(&decode_value, sizeof(int64_t)) > 0); + } } RETURN_IF_ERROR(converter->convert(src_column, column.get())); } @@ -85,14 +101,22 @@ Status StatisticsHelper::decode_value_into_column(const ColumnPtr& column, const Slice decode_value; if (!converter->need_convert) { for (size_t i = 0; i < values.size(); i++) { - RETURN_IF_ERROR(PlainDecoder::decode(values[i], &decode_value)); - ret &= column->append_strings(std::vector{decode_value}); + if (null_pages[i]) { + ret &= column->append_nulls(1); + } else { + RETURN_IF_ERROR(PlainDecoder::decode(values[i], &decode_value)); + ret &= column->append_strings(std::vector{decode_value}); + } } } else { ColumnPtr src_column = converter->create_src_column(); for (size_t i = 0; i < values.size(); i++) { - RETURN_IF_ERROR(PlainDecoder::decode(values[i], &decode_value)); - ret &= src_column->append_strings(std::vector{decode_value}); + if (null_pages[i]) { + ret &= src_column->append_nulls(1); + } else { + RETURN_IF_ERROR(PlainDecoder::decode(values[i], &decode_value)); + ret &= src_column->append_strings(std::vector{decode_value}); + } } RETURN_IF_ERROR(converter->convert(src_column, column.get())); } @@ -197,6 +221,7 @@ void translate_to_string_value(const ColumnPtr& col, size_t i, std::string& valu Status StatisticsHelper::min_max_filter_on_min_max_stat(const std::vector& min_values, const std::vector& max_values, + const std::vector& null_pages, const std::vector& null_counts, ExprContext* ctx, const ParquetField* field, const std::string& timezone, Filter& selected) { @@ -205,8 +230,8 @@ Status StatisticsHelper::min_max_filter_on_min_max_stat(const std::vector(min_values, max_values, null_counts, ctx, field, \ - timezone, selected); \ + return min_max_filter_on_min_max_stat_t(min_values, max_values, null_pages, null_counts, \ + ctx, field, timezone, selected); \ } APPLY_FOR_ALL_SCALAR_TYPE(M); #undef M @@ -218,6 +243,7 @@ Status StatisticsHelper::min_max_filter_on_min_max_stat(const std::vector Status StatisticsHelper::min_max_filter_on_min_max_stat_t(const std::vector& min_values, const std::vector& max_values, + const std::vector& null_pages, const std::vector& null_counts, ExprContext* ctx, const ParquetField* field, const std::string& timezone, Filter& selected) { @@ -231,10 +257,10 @@ Status StatisticsHelper::min_max_filter_on_min_max_stat_t(const std::vectorget_min_value(); auto rf_max_value = min_max_filter->get_max_value(); - RETURN_IF_ERROR( - StatisticsHelper::decode_value_into_column(min_column, min_values, root_expr->type(), field, timezone)); - RETURN_IF_ERROR( - StatisticsHelper::decode_value_into_column(max_column, max_values, root_expr->type(), field, timezone)); + RETURN_IF_ERROR(StatisticsHelper::decode_value_into_column(min_column, min_values, null_pages, root_expr->type(), + field, timezone)); + RETURN_IF_ERROR(StatisticsHelper::decode_value_into_column(max_column, max_values, null_pages, root_expr->type(), + field, timezone)); for (size_t i = 0; i < min_values.size(); i++) { if (!selected[i]) { @@ -244,6 +270,10 @@ Status StatisticsHelper::min_max_filter_on_min_max_stat_t(const std::vector(min_column.get())->get_data()[i]; auto zonemap_max_v = ColumnHelper::get_data_column_by_type(max_column.get())->get_data()[i]; @@ -258,6 +288,7 @@ Status StatisticsHelper::min_max_filter_on_min_max_stat_t(const std::vector& min_values, const std::vector& max_values, + const std::vector& null_pages, const std::vector& null_counts, ExprContext* ctx, const ParquetField* field, const std::string& timezone, Filter& selected) { @@ -291,13 +322,11 @@ Status StatisticsHelper::in_filter_on_min_max_stat(const std::vector ColumnPtr min_col = ColumnHelper::create_column(c->type(), true); min_col->reserve(min_values.size()); - RETURN_IF_ERROR(decode_value_into_column(min_col, min_values, c->type(), field, timezone)); - DCHECK(!min_col->has_null()); + RETURN_IF_ERROR(decode_value_into_column(min_col, min_values, null_pages, c->type(), field, timezone)); min_col = down_cast(min_col.get())->data_column(); ColumnPtr max_col = ColumnHelper::create_column(c->type(), true); max_col->reserve(max_values.size()); - RETURN_IF_ERROR(decode_value_into_column(max_col, max_values, c->type(), field, timezone)); - DCHECK(!max_col->has_null()); + RETURN_IF_ERROR(decode_value_into_column(max_col, max_values, null_pages, c->type(), field, timezone)); max_col = down_cast(max_col.get())->data_column(); // logic and example: @@ -323,6 +352,10 @@ Status StatisticsHelper::in_filter_on_min_max_stat(const std::vector& values, - const TypeDescriptor& type, const ParquetField* field, - const std::string& timezone); + const std::vector& null_pages, const TypeDescriptor& type, + const ParquetField* field, const std::string& timezone); static bool can_be_used_for_statistics_filter(ExprContext* ctx, StatSupportedFilter& filter_type); static Status in_filter_on_min_max_stat(const std::vector& min_values, const std::vector& max_values, + const std::vector& null_pages, const std::vector& null_counts, ExprContext* ctx, const ParquetField* field, const std::string& timezone, Filter& selected); static Status min_max_filter_on_min_max_stat(const std::vector& min_values, const std::vector& max_values, + const std::vector& null_pages, const std::vector& null_counts, ExprContext* ctx, const ParquetField* field, const std::string& timezone, Filter& selected); @@ -47,6 +49,7 @@ class StatisticsHelper { template static Status min_max_filter_on_min_max_stat_t(const std::vector& min_values, const std::vector& max_values, + const std::vector& null_pages, const std::vector& null_counts, ExprContext* ctx, const ParquetField* field, const std::string& timezone, Filter& selected); diff --git a/be/src/runtime/types.h b/be/src/runtime/types.h index 59e993b0bf435..c73ef5c171dc5 100644 --- a/be/src/runtime/types.h +++ b/be/src/runtime/types.h @@ -364,6 +364,8 @@ struct TypeDescriptor { }; static const TypeDescriptor TYPE_UNKNOWN_DESC = TypeDescriptor(LogicalType::TYPE_UNKNOWN); +static const TypeDescriptor TYPE_BOOLEAN_DESC = TypeDescriptor{LogicalType::TYPE_BOOLEAN}; +static const TypeDescriptor TYPE_SMALLINT_DESC = TypeDescriptor{LogicalType::TYPE_SMALLINT}; static const TypeDescriptor TYPE_INT_DESC = TypeDescriptor(LogicalType::TYPE_INT); static const TypeDescriptor TYPE_BIGINT_DESC = TypeDescriptor(LogicalType::TYPE_BIGINT); static const TypeDescriptor TYPE_TIME_DESC = TypeDescriptor(LogicalType::TYPE_TIME); diff --git a/be/src/testutil/exprs_test_helper.h b/be/src/testutil/exprs_test_helper.h index 5337cbf54f482..dea0fa8182f6c 100644 --- a/be/src/testutil/exprs_test_helper.h +++ b/be/src/testutil/exprs_test_helper.h @@ -47,19 +47,48 @@ class ExprsTestHelper { return expr; } + static TTypeDesc create_decimal_type_desc(const TPrimitiveType::type type, int32_t precision, int32_t scale) { + TScalarType scalar_type; + scalar_type.__set_type(type); + scalar_type.__set_precision(precision); + scalar_type.__set_scale(scale); + + TTypeNode type_node; + type_node.__set_type(TTypeNodeType::SCALAR); + type_node.__set_scalar_type(scalar_type); + + TTypeDesc type_desc; + type_desc.types.push_back(type_node); + return type_desc; + } + + static TTypeDesc create_varchar_type_desc(int32_t length) { + TScalarType scalar_type; + scalar_type.__set_type(TPrimitiveType::VARCHAR); + scalar_type.len = length; + + TTypeNode type_node; + type_node.__set_type(TTypeNodeType::SCALAR); + type_node.__set_scalar_type(scalar_type); + + TTypeDesc type_desc; + type_desc.types.push_back(type_node); + + return type_desc; + } + static TTypeDesc create_scalar_type_desc(const TPrimitiveType::type t_type) { - TTypeDesc type; + TScalarType scalar_type; + scalar_type.__set_type(t_type); - { - TTypeNode node; - node.__set_type(TTypeNodeType::SCALAR); - TScalarType scalar_type; - scalar_type.__set_type(t_type); - node.__set_scalar_type(scalar_type); - type.types.push_back(node); - } + TTypeNode type_node; + type_node.__set_type(TTypeNodeType::SCALAR); + type_node.__set_scalar_type(scalar_type); + + TTypeDesc type_desc; + type_desc.types.push_back(type_node); - return type; + return type_desc; } static TSlotDescriptor create_slot_desc(const TTypeDesc& type, TupleId tuple_id, SlotId slot_id, @@ -128,6 +157,17 @@ class ExprsTestHelper { return std::unique_ptr(expr); } + static TExprNode create_binary_pred_node(TPrimitiveType::type type, TExprOpcode::type opcode) { + TExprNode node; + node.node_type = TExprNodeType::BINARY_PRED; + node.num_children = 2; + node.__set_opcode(opcode); + node.__set_child_type(type); + node.type = gen_type_desc(TPrimitiveType::BOOLEAN); + + return node; + } + static TExprNode create_slot_expr_node(TupleId tuple_id, SlotId slot_id, TTypeDesc t_type, bool is_nullable) { TExprNode slot_ref; slot_ref.node_type = TExprNodeType::SLOT_REF; @@ -140,6 +180,63 @@ class ExprsTestHelper { return slot_ref; } + template + static TExprNode create_int_literal(CppType value, TTypeDesc t_type, bool is_nullable) { + TIntLiteral int_literal; + int_literal.value = value; + + TExprNode node; + node.node_type = TExprNodeType::INT_LITERAL; + node.type = t_type; + node.num_children = 0; + node.__set_int_literal(int_literal); + node.is_nullable = is_nullable; + + return node; + } + + static TExprNode create_date_literal(const std::string& value, TTypeDesc t_type, bool is_nullable) { + TDateLiteral date_literal; + date_literal.value = value; + + TExprNode node; + node.node_type = TExprNodeType::DATE_LITERAL; + node.type = t_type; + node.num_children = 0; + node.__set_date_literal(date_literal); + node.is_nullable = is_nullable; + + return node; + } + + static TExprNode create_varchar_literal(const std::string& value, TTypeDesc t_type, bool is_nullable) { + TStringLiteral string_literal; + string_literal.value = value; + + TExprNode node; + node.node_type = TExprNodeType::STRING_LITERAL; + node.type = t_type; + node.num_children = 0; + node.__set_string_literal(string_literal); + node.is_nullable = is_nullable; + + return node; + } + + static TExprNode create_decimal_literal(const std::string& value, TTypeDesc t_type, bool is_nullable) { + TDecimalLiteral decimal_literal; + decimal_literal.value = value; + + TExprNode node; + node.node_type = TExprNodeType::DECIMAL_LITERAL; + node.type = t_type; + node.num_children = 0; + node.__set_decimal_literal(decimal_literal); + node.is_nullable = is_nullable; + + return node; + } + static TExpr create_slot_expr(TExprNode slot_ref) { TExpr expr; expr.nodes.push_back(slot_ref); diff --git a/be/test/formats/parquet/file_reader_test.cpp b/be/test/formats/parquet/file_reader_test.cpp index face8d4e85f92..17a06844e4c5e 100644 --- a/be/test/formats/parquet/file_reader_test.cpp +++ b/be/test/formats/parquet/file_reader_test.cpp @@ -27,6 +27,7 @@ #include "exec/hdfs_scanner.h" #include "exprs/binary_predicate.h" #include "exprs/expr_context.h" +#include "exprs/in_const_predicate.hpp" #include "formats/parquet/column_chunk_reader.h" #include "formats/parquet/metadata.h" #include "formats/parquet/page_reader.h" @@ -37,6 +38,7 @@ #include "runtime/descriptor_helper.h" #include "runtime/mem_tracker.h" #include "testutil/assert.h" +#include "testutil/column_test_helper.h" #include "testutil/exprs_test_helper.h" namespace starrocks::parquet { @@ -46,7 +48,10 @@ using starrocks::HdfsScannerContext; class FileReaderTest : public testing::Test { public: - void SetUp() override { _runtime_state = _pool.add(new RuntimeState(TQueryGlobals())); } + void SetUp() override { + _runtime_state = _pool.add(new RuntimeState(TQueryGlobals())); + _rf_probe_collector = _pool.add(new RuntimeFilterProbeCollector()); + } void TearDown() override {} protected: @@ -55,7 +60,13 @@ class FileReaderTest : public testing::Test { std::unique_ptr _create_file(const std::string& file_path); DataCacheOptions _mock_datacache_options(); + std::shared_ptr _create_file_reader(const std::string& file_path, int64_t chunk_size = 4096); + HdfsScannerContext* _create_scan_context(); + HdfsScannerContext* _create_scan_context(Utils::SlotDesc* slot_descs, const std::string& file_path, + int64_t scan_length = 0); + HdfsScannerContext* _create_scan_context(Utils::SlotDesc* slot_descs, Utils::SlotDesc* min_max_slot_descs, + const std::string& file_path, int64_t scan_length = 0); HdfsScannerContext* _create_file1_base_context(); HdfsScannerContext* _create_context_for_partition(); @@ -82,13 +93,27 @@ class FileReaderTest : public testing::Test { HdfsScannerContext* _create_file_map_base_context(); HdfsScannerContext* _create_file_map_partial_materialize_context(); + StatusOr _create_context_for_in_filter(SlotId slot_id); + StatusOr _create_context_for_in_filter_normal(SlotId slot_id); + StatusOr _create_context_for_min_max_all_null_group(SlotId slot_id); + StatusOr _create_context_for_has_null_page_bool(SlotId slot_id); + StatusOr _create_context_for_has_null_page_smallint(SlotId slot_id); + StatusOr _create_context_for_has_null_page_int32(SlotId slot_id); + StatusOr _create_context_for_has_null_page_int64(SlotId slot_id); + StatusOr _create_context_for_has_null_page_datetime(SlotId slot_id); + StatusOr _create_context_for_has_null_page_string(SlotId slot_id); + StatusOr _create_context_for_has_null_page_decimal(SlotId slot_id); + + StatusOr _create_in_const_pred(SlotId slot_id, const std::vector& values, bool has_null, + bool is_runtime_filter); + StatusOr _create_context_for_filter_row_group_1(SlotId slot_id, int32_t start, int32_t end, bool has_null); StatusOr _create_context_for_filter_page_index(SlotId slot_id, int32_t start, int32_t end, bool has_null); - HdfsScannerContext* _create_file_random_read_context(const std::string& file_path); + HdfsScannerContext* _create_file_random_read_context(const std::string& file_path, Utils::SlotDesc* slot_descs); HdfsScannerContext* _create_file_struct_in_struct_read_context(const std::string& file_path); @@ -116,8 +141,10 @@ class FileReaderTest : public testing::Test { THdfsScanRange* _create_scan_range(const std::string& file_path, size_t scan_length = 0); // Description: A simple parquet file that all columns are null + // one row group // - // c1 c2 c3 c4 + // col1 col2 col3 col4 + // int bigint varchar datetime // ------------------------------------------- // NULL NULL NULL NULL // NULL NULL NULL NULL @@ -266,10 +293,38 @@ class FileReaderTest : public testing::Test { std::shared_ptr _row_desc = nullptr; RuntimeState* _runtime_state = nullptr; ObjectPool _pool; + const size_t _chunk_size = 4096; std::string _filter_page_index_with_rf_has_null = "./be/test/formats/parquet/test_data/filter_page_index_with_rf_has_null.parquet"; + + // c1 c2 c3 + // (int32) (int64) (int32, no group stats) + // ====================== + // null null null + // null null null + // null null null + // null null null + // null null null + // null null null + std::string _all_null_parquet_file = "./be/test/formats/parquet/test_data/all_null.parquet"; + std::string _has_null_page_file = "./be/test/formats/parquet/test_data/has_null_page.parquet"; + + RuntimeFilterProbeCollector* _rf_probe_collector; + const TypeDescriptor TYPE_DECIMAL128_DESC = TypeDescriptor::create_decimalv3_type(TYPE_DECIMAL128, 27, 9); + const TypeDescriptor TYPE_INT_ARRAY_ARRAY_DESC = TypeDescriptor::create_array_type(TYPE_INT_ARRAY_DESC); + const TypeDescriptor TYPE_INT_INT_MAP_DESC = TypeDescriptor::create_map_type(TYPE_INT_DESC, TYPE_INT_DESC); + const TypeDescriptor TYPE_CHAR_INT_MAP_DESC = TypeDescriptor::create_map_type(TYPE_CHAR_DESC, TYPE_INT_DESC); + const TypeDescriptor TYPE_VARCHAR_INT_MAP_DESC = TypeDescriptor::create_map_type(TYPE_VARCHAR_DESC, TYPE_INT_DESC); + const TypeDescriptor TYPE_VARBINARY_INT_MAP_DESC = + TypeDescriptor::create_map_type(TYPE_VARBINARY_DESC, TYPE_INT_DESC); + const TypeDescriptor TYPE_VARCHAR_INTARRAY_MAP_DESC = + TypeDescriptor::create_map_type(TYPE_VARCHAR_DESC, TYPE_INT_ARRAY_DESC); + const TypeDescriptor TYPE_VARCHAR_UNKNOWN_MAP_DESC = + TypeDescriptor::create_map_type(TYPE_VARCHAR_DESC, TYPE_UNKNOWN_DESC); + const TypeDescriptor TYPE_UNKNOWN_INTARRAY_MAP_DESC = + TypeDescriptor::create_map_type(TYPE_UNKNOWN_DESC, TYPE_INT_ARRAY_DESC); }; StatusOr FileReaderTest::gen_runtime_filter_desc(SlotId slot_id) { @@ -312,12 +367,42 @@ HdfsScannerContext* FileReaderTest::_create_scan_context() { ctx->lazy_column_coalesce_counter = lazy_column_coalesce_counter; ctx->timezone = "Asia/Shanghai"; ctx->stats = &g_hdfs_scan_stats; + ctx->runtime_filter_collector = _rf_probe_collector; return ctx; } -HdfsScannerContext* FileReaderTest::_create_file1_base_context() { - auto ctx = _create_scan_context(); +std::shared_ptr FileReaderTest::_create_file_reader(const std::string& file_path, int64_t chunk_size) { + auto file = _create_file(file_path); + auto* file_ptr = _pool.add(file.release()); + uint64_t file_size = std::filesystem::file_size(file_path); + return std::make_shared(chunk_size, file_ptr, file_size, _mock_datacache_options()); +} + +HdfsScannerContext* FileReaderTest::_create_scan_context(Utils::SlotDesc* slot_descs, const std::string& file_path, + int64_t scan_length) { + auto* ctx = _pool.add(new HdfsScannerContext()); + auto* lazy_column_coalesce_counter = _pool.add(new std::atomic(0)); + ctx->lazy_column_coalesce_counter = lazy_column_coalesce_counter; + ctx->timezone = "Asia/Shanghai"; + ctx->stats = &g_hdfs_scan_stats; + ctx->runtime_filter_collector = _rf_probe_collector; + TupleDescriptor* tuple_desc = Utils::create_tuple_descriptor(_runtime_state, &_pool, slot_descs); + Utils::make_column_info_vector(tuple_desc, &ctx->materialized_columns); + ctx->slot_descs = tuple_desc->slots(); + ctx->scan_range = _create_scan_range(file_path, scan_length); + return ctx; +} + +HdfsScannerContext* FileReaderTest::_create_scan_context(Utils::SlotDesc* slot_descs, + Utils::SlotDesc* min_max_slot_descs, + const std::string& file_path, int64_t scan_length) { + auto* ctx = _create_scan_context(slot_descs, file_path, scan_length); + ctx->min_max_tuple_desc = Utils::create_tuple_descriptor(_runtime_state, &_pool, min_max_slot_descs); + return ctx; +} + +HdfsScannerContext* FileReaderTest::_create_file1_base_context() { Utils::SlotDesc slot_descs[] = { {"c1", TYPE_INT_DESC}, {"c2", TYPE_BIGINT_DESC}, @@ -325,50 +410,30 @@ HdfsScannerContext* FileReaderTest::_create_file1_base_context() { {"c4", TYPE_DATETIME_DESC}, {""}, }; - - TupleDescriptor* tuple_desc = Utils::create_tuple_descriptor(_runtime_state, &_pool, slot_descs); - Utils::make_column_info_vector(tuple_desc, &ctx->materialized_columns); - ctx->slot_descs = tuple_desc->slots(); - ctx->scan_range = (_create_scan_range(_file1_path, 1024)); - - return ctx; + return _create_scan_context(slot_descs, _file1_path, 1024); } HdfsScannerContext* FileReaderTest::_create_context_for_partition() { - auto ctx = _create_scan_context(); - Utils::SlotDesc slot_descs[] = { {"c5", TYPE_INT_DESC}, {""}, }; - TupleDescriptor* tuple_desc = Utils::create_tuple_descriptor(_runtime_state, &_pool, slot_descs); - Utils::make_column_info_vector(tuple_desc, &ctx->materialized_columns); - ctx->slot_descs = tuple_desc->slots(); - ctx->scan_range = (_create_scan_range(_file1_path, 1024)); + auto ctx = _create_scan_context(slot_descs, _file1_path, 1024); + auto column = ColumnHelper::create_const_column(1, 1); ctx->partition_values.emplace_back(column); - return ctx; } HdfsScannerContext* FileReaderTest::_create_context_for_not_exist() { - auto ctx = _create_scan_context(); - Utils::SlotDesc slot_descs[] = { {"c5", TYPE_INT_DESC}, {""}, }; - TupleDescriptor* tuple_desc = Utils::create_tuple_descriptor(_runtime_state, &_pool, slot_descs); - Utils::make_column_info_vector(tuple_desc, &ctx->materialized_columns); - ctx->slot_descs = tuple_desc->slots(); - ctx->scan_range = (_create_scan_range(_file1_path, 1024)); - - return ctx; + return _create_scan_context(slot_descs, _file1_path, 1024); } HdfsScannerContext* FileReaderTest::_create_file2_base_context() { - auto ctx = _create_scan_context(); - // tuple desc and conjuncts Utils::SlotDesc slot_descs[] = { {"c1", TYPE_INT_DESC}, @@ -377,22 +442,23 @@ HdfsScannerContext* FileReaderTest::_create_file2_base_context() { {"c4", TYPE_DATETIME_DESC}, {""}, }; - TupleDescriptor* tuple_desc = Utils::create_tuple_descriptor(_runtime_state, &_pool, slot_descs); - Utils::make_column_info_vector(tuple_desc, &ctx->materialized_columns); - ctx->slot_descs = tuple_desc->slots(); - ctx->scan_range = (_create_scan_range(_file2_path, 850)); - - return ctx; + return _create_scan_context(slot_descs, _file2_path, 850); } HdfsScannerContext* FileReaderTest::_create_context_for_min_max() { - auto* ctx = _create_file2_base_context(); - + Utils::SlotDesc slot_descs[] = { + {"c1", TYPE_INT_DESC}, + {"c2", TYPE_BIGINT_DESC}, + {"c3", TYPE_VARCHAR_DESC}, + {"c4", TYPE_DATETIME_DESC}, + {""}, + }; Utils::SlotDesc min_max_slots[] = { {"c1", TYPE_INT_DESC}, {""}, }; - ctx->min_max_tuple_desc = Utils::create_tuple_descriptor(_runtime_state, &_pool, min_max_slots); + + auto* ctx = _create_scan_context(slot_descs, min_max_slots, _file2_path, 850); // create min max conjuncts // c1 >= 1 @@ -401,15 +467,11 @@ HdfsScannerContext* FileReaderTest::_create_context_for_min_max() { } HdfsScannerContext* FileReaderTest::_create_context_for_filter_file() { - auto* ctx = _create_file2_base_context(); - Utils::SlotDesc slot_descs[] = { {"c1", TYPE_INT_DESC}, {"c2", TYPE_BIGINT_DESC}, {"c3", TYPE_VARCHAR_DESC}, {"c4", TYPE_DATETIME_DESC}, {"c5", TYPE_INT_DESC}, {""}, }; - TupleDescriptor* tuple_desc = Utils::create_tuple_descriptor(_runtime_state, &_pool, slot_descs); - Utils::make_column_info_vector(tuple_desc, &ctx->materialized_columns); - ctx->slot_descs = tuple_desc->slots(); + auto* ctx = _create_scan_context(slot_descs, _file2_path, 850); // create conjuncts // c5 >= 1 _create_int_conjunct_ctxs(TExprOpcode::GE, 4, 1, &ctx->conjunct_ctxs_by_slot[4]); @@ -441,19 +503,12 @@ HdfsScannerContext* FileReaderTest::_create_context_for_skip_group() { } HdfsScannerContext* FileReaderTest::_create_file3_base_context() { - auto ctx = _create_scan_context(); - // tuple desc and conjuncts Utils::SlotDesc slot_descs[] = { {"c1", TYPE_INT_DESC}, {"c2", TYPE_BIGINT_DESC}, {"c3", TYPE_VARCHAR_DESC}, {"c4", TYPE_DATETIME_DESC}, {"c5", TYPE_VARCHAR_DESC}, {""}, }; - TupleDescriptor* tuple_desc = Utils::create_tuple_descriptor(_runtime_state, &_pool, slot_descs); - Utils::make_column_info_vector(tuple_desc, &ctx->materialized_columns); - ctx->slot_descs = tuple_desc->slots(); - ctx->scan_range = (_create_scan_range(_file3_path)); - - return ctx; + return _create_scan_context(slot_descs, _file3_path); } HdfsScannerContext* FileReaderTest::_create_context_for_multi_filter() { @@ -470,8 +525,6 @@ HdfsScannerContext* FileReaderTest::_create_context_for_late_materialization() { } HdfsScannerContext* FileReaderTest::_create_file4_base_context() { - auto ctx = _create_scan_context(); - // tuple desc and conjuncts // struct columns are not supported now, so we skip reading them Utils::SlotDesc slot_descs[] = { @@ -480,32 +533,16 @@ HdfsScannerContext* FileReaderTest::_create_file4_base_context() { {"B1", TYPE_VARCHAR_DESC}, {""}, }; - TupleDescriptor* tuple_desc = Utils::create_tuple_descriptor(_runtime_state, &_pool, slot_descs); - Utils::make_column_info_vector(tuple_desc, &ctx->materialized_columns); - ctx->slot_descs = tuple_desc->slots(); - ctx->scan_range = (_create_scan_range(_file4_path)); - - return ctx; + return _create_scan_context(slot_descs, _file4_path); } HdfsScannerContext* FileReaderTest::_create_file5_base_context() { - auto ctx = _create_scan_context(); - - TypeDescriptor type_outer(LogicalType::TYPE_ARRAY); - type_outer.children.emplace_back(TYPE_INT_ARRAY_DESC); - - // tuple desc Utils::SlotDesc slot_descs[] = { {"c1", TYPE_INT_DESC}, - {"c2", type_outer}, + {"c2", TYPE_INT_ARRAY_ARRAY_DESC}, {""}, }; - TupleDescriptor* tuple_desc = Utils::create_tuple_descriptor(_runtime_state, &_pool, slot_descs); - Utils::make_column_info_vector(tuple_desc, &ctx->materialized_columns); - ctx->slot_descs = tuple_desc->slots(); - ctx->scan_range = (_create_scan_range(_file5_path)); - - return ctx; + return _create_scan_context(slot_descs, _file5_path); } HdfsScannerContext* FileReaderTest::_create_context_for_struct_column() { @@ -525,8 +562,6 @@ HdfsScannerContext* FileReaderTest::_create_context_for_upper_pred() { } HdfsScannerContext* FileReaderTest::_create_file6_base_context() { - auto ctx = _create_scan_context(); - // tuple desc and conjuncts // struct columns are not supported now, so we skip reading them Utils::SlotDesc slot_descs[] = { @@ -534,26 +569,152 @@ HdfsScannerContext* FileReaderTest::_create_file6_base_context() { {"col_array", TYPE_INT_ARRAY_DESC}, {""}, }; - TupleDescriptor* tuple_desc = Utils::create_tuple_descriptor(_runtime_state, &_pool, slot_descs); - Utils::make_column_info_vector(tuple_desc, &ctx->materialized_columns); - ctx->slot_descs = tuple_desc->slots(); - ctx->scan_range = (_create_scan_range(_file6_path)); + return _create_scan_context(slot_descs, _file6_path); +} - return ctx; +StatusOr FileReaderTest::_create_in_const_pred(SlotId slot_id, const std::vector& values, + bool has_null, bool is_runtime_filter) { + ColumnRef* col_ref = _pool.add(new ColumnRef(TYPE_INT_DESC, slot_id)); + VectorizedInConstPredicateBuilder builder(_runtime_state, &_pool, col_ref); + RETURN_IF_ERROR(builder.create()); + + ExprContext* expr_ctx = builder.get_in_const_predicate(); + RETURN_IF_ERROR(expr_ctx->prepare(_runtime_state)); + + auto* in_pred = reinterpret_cast*>(expr_ctx->root()); + for (auto& v : values) { + in_pred->insert(v); + } + if (has_null) { + in_pred->insert_null(); + } + in_pred->set_is_join_runtime_filter(is_runtime_filter); + RETURN_IF_ERROR(expr_ctx->open(_runtime_state)); + return expr_ctx; +} + +StatusOr FileReaderTest::_create_context_for_in_filter(SlotId slot_id) { + Utils::SlotDesc slot_descs[] = { + {"c1", TYPE_INT_DESC, 1}, {"c2", TYPE_BIGINT_DESC, 2}, {"c3", TYPE_VARCHAR_DESC, 3}, {""}}; + auto scan_ctx = _create_scan_context(slot_descs, _all_null_parquet_file); + + std::vector values{1, 3, 5}; + ASSIGN_OR_RETURN(auto* expr_ctx, _create_in_const_pred(slot_id, values, true, false)); + + std::vector expr_ctxs{expr_ctx}; + scan_ctx->conjunct_ctxs_by_slot.insert({slot_id, expr_ctxs}); + + return scan_ctx; +} + +StatusOr FileReaderTest::_create_context_for_in_filter_normal(SlotId slot_id) { + Utils::SlotDesc slot_descs[] = {{"col1", TYPE_INT_DESC, 1}, {"col2", TYPE_INT_DESC, 2}, {"col3", TYPE_INT_DESC, 3}, + {"col4", TYPE_INT_DESC, 4}, {"col5", TYPE_INT_DESC, 5}, {""}}; + auto scan_ctx = _create_scan_context(slot_descs, _filter_row_group_path_1); + + std::vector values{5, 6}; + ASSIGN_OR_RETURN(auto* expr_ctx, _create_in_const_pred(slot_id, values, false, false)); + + std::vector expr_ctxs{expr_ctx}; + scan_ctx->conjunct_ctxs_by_slot.insert({slot_id, expr_ctxs}); + + return scan_ctx; +} + +StatusOr FileReaderTest::_create_context_for_min_max_all_null_group(SlotId slot_id) { + Utils::SlotDesc slot_descs[] = {{"c1", TYPE_INT_DESC}, {"c2", TYPE_BIGINT_DESC}, {""}}; + auto scan_ctx = _create_scan_context(slot_descs, slot_descs, _all_null_parquet_file); + + std::vector t_conjuncts; + ParquetUTBase::append_int_conjunct(TExprOpcode::GE, slot_id, 4, &t_conjuncts); + ParquetUTBase::create_conjunct_ctxs(&_pool, _runtime_state, &t_conjuncts, &scan_ctx->min_max_conjunct_ctxs); + + return scan_ctx; +} + +StatusOr FileReaderTest::_create_context_for_has_null_page_bool(SlotId slot_id) { + Utils::SlotDesc slot_descs[] = {{"c_bool", TYPE_BOOLEAN_DESC}, {""}}; + auto scan_ctx = _create_scan_context(slot_descs, slot_descs, _has_null_page_file); + + std::vector t_conjuncts; + ParquetUTBase::append_int_conjunct(TExprOpcode::GE, slot_id, false, &t_conjuncts); + ParquetUTBase::create_conjunct_ctxs(&_pool, _runtime_state, &t_conjuncts, &scan_ctx->min_max_conjunct_ctxs); + + return scan_ctx; +} + +StatusOr FileReaderTest::_create_context_for_has_null_page_smallint(SlotId slot_id) { + Utils::SlotDesc slot_descs[] = {{"c_smallint", TYPE_SMALLINT_DESC}, {""}}; + auto scan_ctx = _create_scan_context(slot_descs, slot_descs, _has_null_page_file); + + std::vector t_conjuncts; + ParquetUTBase::append_smallint_conjunct(TExprOpcode::GT, slot_id, 3, &t_conjuncts); + ParquetUTBase::create_conjunct_ctxs(&_pool, _runtime_state, &t_conjuncts, &scan_ctx->min_max_conjunct_ctxs); + + return scan_ctx; +} + +StatusOr FileReaderTest::_create_context_for_has_null_page_int32(SlotId slot_id) { + Utils::SlotDesc slot_descs[] = {{"c_int32", TYPE_INT_DESC}, {""}}; + auto scan_ctx = _create_scan_context(slot_descs, slot_descs, _has_null_page_file); + + std::vector t_conjuncts; + ParquetUTBase::append_int_conjunct(TExprOpcode::GT, slot_id, 33, &t_conjuncts); + ParquetUTBase::create_conjunct_ctxs(&_pool, _runtime_state, &t_conjuncts, &scan_ctx->min_max_conjunct_ctxs); + + return scan_ctx; +} + +StatusOr FileReaderTest::_create_context_for_has_null_page_int64(SlotId slot_id) { + Utils::SlotDesc slot_descs[] = {{"c_int64", TYPE_BIGINT_DESC}, {""}}; + auto scan_ctx = _create_scan_context(slot_descs, slot_descs, _has_null_page_file); + + std::vector t_conjuncts; + ParquetUTBase::append_bigint_conjunct(TExprOpcode::GT, slot_id, 333, &t_conjuncts); + ParquetUTBase::create_conjunct_ctxs(&_pool, _runtime_state, &t_conjuncts, &scan_ctx->min_max_conjunct_ctxs); + + return scan_ctx; +} + +StatusOr FileReaderTest::_create_context_for_has_null_page_string(SlotId slot_id) { + Utils::SlotDesc slot_descs[] = {{"c_string", TYPE_VARCHAR_DESC}, {""}}; + auto scan_ctx = _create_scan_context(slot_descs, slot_descs, _has_null_page_file); + + std::vector t_conjuncts; + ParquetUTBase::append_string_conjunct(TExprOpcode::GT, slot_id, "33333", &t_conjuncts); + ParquetUTBase::create_conjunct_ctxs(&_pool, _runtime_state, &t_conjuncts, &scan_ctx->min_max_conjunct_ctxs); + + return scan_ctx; +} + +StatusOr FileReaderTest::_create_context_for_has_null_page_decimal(SlotId slot_id) { + Utils::SlotDesc slot_descs[] = {{"c_decimal", TYPE_DECIMAL128_DESC}, {""}}; + auto scan_ctx = _create_scan_context(slot_descs, slot_descs, _has_null_page_file); + + std::vector t_conjuncts; + ParquetUTBase::append_decimal_conjunct(TExprOpcode::GT, slot_id, "333.300000000", &t_conjuncts); + ParquetUTBase::create_conjunct_ctxs(&_pool, _runtime_state, &t_conjuncts, &scan_ctx->min_max_conjunct_ctxs); + + return scan_ctx; +} + +StatusOr FileReaderTest::_create_context_for_has_null_page_datetime(SlotId slot_id) { + Utils::SlotDesc slot_descs[] = {{"c_datetime", TYPE_DATETIME_DESC}, {""}}; + auto scan_ctx = _create_scan_context(slot_descs, slot_descs, _has_null_page_file); + + std::vector t_conjuncts; + ParquetUTBase::append_datetime_conjunct(TExprOpcode::GT, slot_id, "2024-01-10 00:00:00", &t_conjuncts); + ParquetUTBase::create_conjunct_ctxs(&_pool, _runtime_state, &t_conjuncts, &scan_ctx->min_max_conjunct_ctxs); + + return scan_ctx; } StatusOr FileReaderTest::_create_context_for_filter_row_group_1(SlotId slot_id, int32_t start, int32_t end, bool has_null) { - auto ctx = _create_scan_context(); - Utils::SlotDesc slot_descs[] = {{"col1", TYPE_INT_DESC, 1}, {"col2", TYPE_INT_DESC, 2}, {"col3", TYPE_INT_DESC, 3}, {"col4", TYPE_INT_DESC, 4}, {"col5", TYPE_INT_DESC, 5}, {""}}; - TupleDescriptor* tuple_desc = Utils::create_tuple_descriptor(_runtime_state, &_pool, slot_descs); - Utils::make_column_info_vector(tuple_desc, &ctx->materialized_columns); - ctx->slot_descs = tuple_desc->slots(); - ctx->scan_range = _create_scan_range(_filter_row_group_path_1); + auto ctx = _create_scan_context(slot_descs, _filter_row_group_path_1); - auto* rf_collector = _pool.add(new RuntimeFilterProbeCollector()); auto* rf = _pool.add(new RuntimeBloomFilter()); ASSIGN_OR_RETURN(auto* rf_desc, gen_runtime_filter_desc(slot_id)); @@ -566,15 +727,14 @@ StatusOr FileReaderTest::_create_context_for_filter_row_gro } rf_desc->set_runtime_filter(rf); - rf_collector->add_descriptor(rf_desc); - ctx->runtime_filter_collector = rf_collector; + _rf_probe_collector->add_descriptor(rf_desc); ColumnPtr partition_col3 = ColumnHelper::create_const_column(5, 1); ColumnPtr partition_col4 = ColumnHelper::create_const_column(2, 1); ColumnPtr partition_col5 = ColumnHelper::create_const_null_column(1); - ctx->partition_columns.emplace_back(HdfsScannerContext::ColumnInfo{3, tuple_desc->slots()[2], false}); - ctx->partition_columns.emplace_back(HdfsScannerContext::ColumnInfo{4, tuple_desc->slots()[3], false}); - ctx->partition_columns.emplace_back(HdfsScannerContext::ColumnInfo{5, tuple_desc->slots()[3], false}); + ctx->partition_columns.emplace_back(HdfsScannerContext::ColumnInfo{3, ctx->slot_descs[2], false}); + ctx->partition_columns.emplace_back(HdfsScannerContext::ColumnInfo{4, ctx->slot_descs[3], false}); + ctx->partition_columns.emplace_back(HdfsScannerContext::ColumnInfo{5, ctx->slot_descs[4], false}); ctx->partition_values.emplace_back(partition_col3); ctx->partition_values.emplace_back(partition_col4); ctx->partition_values.emplace_back(partition_col5); @@ -584,19 +744,12 @@ StatusOr FileReaderTest::_create_context_for_filter_row_gro StatusOr FileReaderTest::_create_context_for_filter_page_index(SlotId slot_id, int32_t start, int32_t end, bool has_null) { - auto ctx = _create_scan_context(); - Utils::SlotDesc slot_descs[] = {{"lo_orderkey", TYPE_INT_DESC, 1}, {"col2", TYPE_INT_DESC, 2}, {"col3", TYPE_INT_DESC, 3}, {"col4", TYPE_INT_DESC, 4}, {"col5", TYPE_INT_DESC, 5}, {""}}; - TupleDescriptor* tuple_desc = Utils::create_tuple_descriptor(_runtime_state, &_pool, slot_descs); - Utils::make_column_info_vector(tuple_desc, &ctx->materialized_columns); - ctx->slot_descs = tuple_desc->slots(); - ctx->scan_range = _create_scan_range(_filter_row_group_path_1); + auto ctx = _create_scan_context(slot_descs, _filter_row_group_path_1); - auto* rf_collector = _pool.add(new RuntimeFilterProbeCollector()); auto* rf = _pool.add(new RuntimeBloomFilter()); - ASSIGN_OR_RETURN(auto* rf_desc, gen_runtime_filter_desc(slot_id)); rf->init(10); @@ -607,15 +760,14 @@ StatusOr FileReaderTest::_create_context_for_filter_page_in } rf_desc->set_runtime_filter(rf); - rf_collector->add_descriptor(rf_desc); - ctx->runtime_filter_collector = rf_collector; + _rf_probe_collector->add_descriptor(rf_desc); ColumnPtr partition_col3 = ColumnHelper::create_const_column(5, 1); ColumnPtr partition_col4 = ColumnHelper::create_const_column(2, 1); ColumnPtr partition_col5 = ColumnHelper::create_const_null_column(1); - ctx->partition_columns.emplace_back(HdfsScannerContext::ColumnInfo{3, tuple_desc->slots()[2], false}); - ctx->partition_columns.emplace_back(HdfsScannerContext::ColumnInfo{4, tuple_desc->slots()[3], false}); - ctx->partition_columns.emplace_back(HdfsScannerContext::ColumnInfo{5, tuple_desc->slots()[3], false}); + ctx->partition_columns.emplace_back(HdfsScannerContext::ColumnInfo{3, ctx->slot_descs[2], false}); + ctx->partition_columns.emplace_back(HdfsScannerContext::ColumnInfo{4, ctx->slot_descs[3], false}); + ctx->partition_columns.emplace_back(HdfsScannerContext::ColumnInfo{5, ctx->slot_descs[4], false}); ctx->partition_values.emplace_back(partition_col3); ctx->partition_values.emplace_back(partition_col4); ctx->partition_values.emplace_back(partition_col5); @@ -632,118 +784,51 @@ StatusOr FileReaderTest::_create_context_for_filter_page_in } HdfsScannerContext* FileReaderTest::_create_file_map_char_key_context() { - auto ctx = _create_scan_context(); - - TypeDescriptor type_map_char(LogicalType::TYPE_MAP); - type_map_char.children.emplace_back(TYPE_CHAR_DESC); - type_map_char.children.emplace_back(TYPE_INT_DESC); - - TypeDescriptor type_map_varchar(LogicalType::TYPE_MAP); - type_map_varchar.children.emplace_back(TYPE_VARCHAR_DESC); - type_map_varchar.children.emplace_back(TYPE_INT_DESC); - Utils::SlotDesc slot_descs[] = { {"c1", TYPE_INT_DESC}, - {"c2", type_map_char}, - {"c3", type_map_varchar}, + {"c2", TYPE_CHAR_INT_MAP_DESC}, + {"c3", TYPE_VARCHAR_INT_MAP_DESC}, {""}, }; - TupleDescriptor* tuple_desc = Utils::create_tuple_descriptor(_runtime_state, &_pool, slot_descs); - Utils::make_column_info_vector(tuple_desc, &ctx->materialized_columns); - ctx->slot_descs = tuple_desc->slots(); - ctx->scan_range = (_create_scan_range(_file_map_char_key_path)); - - return ctx; + return _create_scan_context(slot_descs, _file_map_char_key_path); } HdfsScannerContext* FileReaderTest::_create_file_map_base_context() { - auto ctx = _create_scan_context(); - - TypeDescriptor type_map(LogicalType::TYPE_MAP); - type_map.children.emplace_back(TYPE_VARCHAR_DESC); - type_map.children.emplace_back(TYPE_INT_DESC); - - TypeDescriptor type_map_map(LogicalType::TYPE_MAP); - type_map_map.children.emplace_back(TYPE_VARCHAR_DESC); - type_map_map.children.emplace_back(type_map); - - TypeDescriptor type_map_array(LogicalType::TYPE_MAP); - type_map_array.children.emplace_back(TYPE_VARCHAR_DESC); - type_map_array.children.emplace_back(TYPE_INT_ARRAY_DESC); - - // tuple desc + const TypeDescriptor type_map_map = TypeDescriptor::create_map_type(TYPE_VARCHAR_DESC, TYPE_VARCHAR_INT_MAP_DESC); Utils::SlotDesc slot_descs[] = { - {"c1", TYPE_INT_DESC}, {"c2", type_map}, {"c3", type_map_map}, {"c4", type_map_array}, {""}, + {"c1", TYPE_INT_DESC}, + {"c2", TYPE_VARCHAR_INT_MAP_DESC}, + {"c3", type_map_map}, + {"c4", TYPE_VARCHAR_INTARRAY_MAP_DESC}, + {""}, }; - TupleDescriptor* tuple_desc = Utils::create_tuple_descriptor(_runtime_state, &_pool, slot_descs); - Utils::make_column_info_vector(tuple_desc, &ctx->materialized_columns); - ctx->slot_descs = tuple_desc->slots(); - ctx->scan_range = (_create_scan_range(_file_map_path)); - - return ctx; + return _create_scan_context(slot_descs, _file_map_path); } HdfsScannerContext* FileReaderTest::_create_file_map_partial_materialize_context() { - auto ctx = _create_scan_context(); - - TypeDescriptor type_map(LogicalType::TYPE_MAP); - // only key will be materialized - type_map.children.emplace_back(TYPE_VARCHAR_DESC); - type_map.children.emplace_back(TYPE_UNKNOWN_DESC); - - TypeDescriptor type_map_map(LogicalType::TYPE_MAP); - // the first level value will be materialized, and the second level key will be materialized - type_map_map.children.emplace_back(TYPE_UNKNOWN_DESC); - type_map_map.children.emplace_back(type_map); - - // only value will be materialized - TypeDescriptor type_map_array(LogicalType::TYPE_MAP); - type_map_array.children.emplace_back(TYPE_UNKNOWN_DESC); - type_map_array.children.emplace_back(TYPE_INT_ARRAY_DESC); + TypeDescriptor type_map_map = TypeDescriptor::create_map_type(TYPE_UNKNOWN_DESC, TYPE_VARCHAR_UNKNOWN_MAP_DESC); // tuple desc Utils::SlotDesc slot_descs[] = { - {"c1", TYPE_INT_DESC}, {"c2", type_map}, {"c3", type_map_map}, {"c4", type_map_array}, {""}, + {"c1", TYPE_INT_DESC}, + {"c2", TYPE_VARCHAR_UNKNOWN_MAP_DESC}, + {"c3", type_map_map}, + {"c4", TYPE_UNKNOWN_INTARRAY_MAP_DESC}, + {""}, }; - TupleDescriptor* tuple_desc = Utils::create_tuple_descriptor(_runtime_state, &_pool, slot_descs); - Utils::make_column_info_vector(tuple_desc, &ctx->materialized_columns); - ctx->slot_descs = tuple_desc->slots(); - ctx->scan_range = (_create_scan_range(_file_map_path)); - - return ctx; + return _create_scan_context(slot_descs, _file_map_path); } -HdfsScannerContext* FileReaderTest::_create_file_random_read_context(const std::string& file_path) { - auto ctx = _create_scan_context(); - - // tuple desc - Utils::SlotDesc slot_descs[] = { - {"c0", TYPE_INT_DESC}, {"c1", TYPE_INT_DESC}, {"c2", TYPE_VARCHAR_DESC}, {"c3", TYPE_INT_ARRAY_DESC}, {""}, - }; - TupleDescriptor* tuple_desc = Utils::create_tuple_descriptor(_runtime_state, &_pool, slot_descs); - Utils::make_column_info_vector(tuple_desc, &ctx->materialized_columns); - ctx->slot_descs = tuple_desc->slots(); - ctx->scan_range = (_create_scan_range(file_path)); - - return ctx; +HdfsScannerContext* FileReaderTest::_create_file_random_read_context(const std::string& file_path, + Utils::SlotDesc* slot_descs) { + return _create_scan_context(slot_descs, file_path); } HdfsScannerContext* FileReaderTest::_create_file_struct_in_struct_read_context(const std::string& file_path) { - auto ctx = _create_scan_context(); - - TypeDescriptor type_struct = TypeDescriptor::from_logical_type(LogicalType::TYPE_STRUCT); - type_struct.children.emplace_back(TYPE_VARCHAR_DESC); - type_struct.field_names.emplace_back("c0"); - - type_struct.children.emplace_back(TYPE_VARCHAR_DESC); - type_struct.field_names.emplace_back("c1"); - - TypeDescriptor type_struct_in_struct = TypeDescriptor::from_logical_type(LogicalType::TYPE_STRUCT); - type_struct_in_struct.children.emplace_back(TYPE_VARCHAR_DESC); - type_struct_in_struct.field_names.emplace_back("c0"); - - type_struct_in_struct.children.emplace_back(type_struct); - type_struct_in_struct.field_names.emplace_back("c_struct"); + TypeDescriptor type_struct = + TypeDescriptor::create_struct_type({"c0", "c1"}, {TYPE_VARCHAR_DESC, TYPE_VARCHAR_DESC}); + TypeDescriptor type_struct_in_struct = + TypeDescriptor::create_struct_type({"c0", "c_struct"}, {TYPE_VARCHAR_DESC, type_struct}); // tuple desc Utils::SlotDesc slot_descs[] = { @@ -753,25 +838,13 @@ HdfsScannerContext* FileReaderTest::_create_file_struct_in_struct_read_context(c {"c_struct_struct", type_struct_in_struct}, {""}, }; - TupleDescriptor* tuple_desc = Utils::create_tuple_descriptor(_runtime_state, &_pool, slot_descs); - Utils::make_column_info_vector(tuple_desc, &ctx->materialized_columns); - ctx->slot_descs = tuple_desc->slots(); - ctx->scan_range = (_create_scan_range(file_path)); - - return ctx; + return _create_scan_context(slot_descs, file_path); } HdfsScannerContext* FileReaderTest::_create_file_struct_in_struct_prune_and_no_output_read_context( const std::string& file_path) { - auto ctx = _create_scan_context(); - - TypeDescriptor type_struct = TypeDescriptor::from_logical_type(LogicalType::TYPE_STRUCT); - type_struct.children.emplace_back(TYPE_VARCHAR_DESC); - type_struct.field_names.emplace_back("c0"); - - TypeDescriptor type_struct_in_struct = TypeDescriptor::from_logical_type(LogicalType::TYPE_STRUCT); - type_struct_in_struct.children.emplace_back(type_struct); - type_struct_in_struct.field_names.emplace_back("c_struct"); + TypeDescriptor type_struct = TypeDescriptor::create_struct_type({"c0"}, {TYPE_VARCHAR_DESC}); + TypeDescriptor type_struct_in_struct = TypeDescriptor::create_struct_type({"c_struct"}, {type_struct}); // tuple desc Utils::SlotDesc slot_descs[] = { @@ -790,11 +863,8 @@ HdfsScannerContext* FileReaderTest::_create_file_struct_in_struct_prune_and_no_o TSlotDescriptor tslot = builder.build(); SlotDescriptor* new_slot = _pool.add(new SlotDescriptor(tslot)); (tupleDescriptor->slots())[1] = new_slot; - TupleDescriptor* tuple_desc = Utils::create_tuple_descriptor(_runtime_state, &_pool, slot_descs); - Utils::make_column_info_vector(tuple_desc, &ctx->materialized_columns); - ctx->slot_descs = tuple_desc->slots(); + auto ctx = _create_scan_context(slot_descs, file_path); ctx->materialized_columns[1].decode_needed = false; - ctx->scan_range = (_create_scan_range(file_path)); return ctx; } @@ -982,9 +1052,8 @@ ChunkPtr FileReaderTest::_create_chunk_for_not_exist() { } TEST_F(FileReaderTest, TestInit) { - auto file = _create_file(_file1_path); - auto file_reader = std::make_shared(config::vector_chunk_size, file.get(), - std::filesystem::file_size(_file1_path), _mock_datacache_options()); + auto file_reader = _create_file_reader(_file1_path); + // init auto* ctx = _create_file1_base_context(); Status status = file_reader->init(ctx); @@ -992,9 +1061,8 @@ TEST_F(FileReaderTest, TestInit) { } TEST_F(FileReaderTest, TestGetNext) { - auto file = _create_file(_file1_path); - auto file_reader = std::make_shared(config::vector_chunk_size, file.get(), - std::filesystem::file_size(_file1_path), _mock_datacache_options()); + auto file_reader = _create_file_reader(_file1_path); + // init auto* ctx = _create_file1_base_context(); Status status = file_reader->init(ctx); @@ -1017,6 +1085,7 @@ TEST_F(FileReaderTest, TestGetNextWithSkipID) { auto file_reader = std::make_shared(config::vector_chunk_size, file.get(), std::filesystem::file_size(_file1_path), _mock_datacache_options(), nullptr, &need_skip_rowids); + // init auto* ctx = _create_file1_base_context(); Status status = file_reader->init(ctx); @@ -1033,9 +1102,7 @@ TEST_F(FileReaderTest, TestGetNextWithSkipID) { } TEST_F(FileReaderTest, TestGetNextPartition) { - auto file = _create_file(_file1_path); - auto file_reader = std::make_shared(config::vector_chunk_size, file.get(), - std::filesystem::file_size(_file1_path), _mock_datacache_options()); + auto file_reader = _create_file_reader(_file1_path); // init auto* ctx = _create_context_for_partition(); Status status = file_reader->init(ctx); @@ -1052,9 +1119,7 @@ TEST_F(FileReaderTest, TestGetNextPartition) { } TEST_F(FileReaderTest, TestGetNextEmpty) { - auto file = _create_file(_file1_path); - auto file_reader = std::make_shared(config::vector_chunk_size, file.get(), - std::filesystem::file_size(_file1_path), _mock_datacache_options()); + auto file_reader = _create_file_reader(_file1_path); // init auto* ctx = _create_context_for_not_exist(); Status status = file_reader->init(ctx); @@ -1071,10 +1136,7 @@ TEST_F(FileReaderTest, TestGetNextEmpty) { } TEST_F(FileReaderTest, TestMinMaxConjunct) { - auto file = _create_file(_file2_path); - auto file_reader = - std::make_shared(config::vector_chunk_size, file.get(), std::filesystem::file_size(_file2_path), - _mock_datacache_options(), nullptr); + auto file_reader = _create_file_reader(_file2_path); // init auto* ctx = _create_context_for_min_max(); Status status = file_reader->init(ctx); @@ -1085,18 +1147,13 @@ TEST_F(FileReaderTest, TestMinMaxConjunct) { status = file_reader->get_next(&chunk); ASSERT_TRUE(status.ok()); ASSERT_EQ(11, chunk->num_rows()); - for (int i = 0; i < chunk->num_rows(); ++i) { - std::cout << "row" << i << ": " << chunk->debug_row(i) << std::endl; - } status = file_reader->get_next(&chunk); ASSERT_TRUE(status.is_end_of_file()); } TEST_F(FileReaderTest, TestFilterFile) { - auto file = _create_file(_file2_path); - auto file_reader = std::make_shared(config::vector_chunk_size, file.get(), - std::filesystem::file_size(_file2_path), _mock_datacache_options()); + auto file_reader = _create_file_reader(_file2_path); // init auto* ctx = _create_context_for_filter_file(); Status status = file_reader->init(ctx); @@ -1150,9 +1207,7 @@ TEST_F(FileReaderTest, TestGetNextDictFilter) { } TEST_F(FileReaderTest, TestGetNextOtherFilter) { - auto file = _create_file(_file2_path); - auto file_reader = std::make_shared(config::vector_chunk_size, file.get(), - std::filesystem::file_size(_file2_path), _mock_datacache_options()); + auto file_reader = _create_file_reader(_file2_path); // init auto* ctx = _create_context_for_other_filter(); Status status = file_reader->init(ctx); @@ -1168,18 +1223,13 @@ TEST_F(FileReaderTest, TestGetNextOtherFilter) { status = file_reader->get_next(&chunk); ASSERT_TRUE(status.ok()); ASSERT_EQ(6, chunk->num_rows()); - for (int i = 0; i < chunk->num_rows(); ++i) { - std::cout << "row" << i << ": " << chunk->debug_row(i) << std::endl; - } status = file_reader->get_next(&chunk); ASSERT_TRUE(status.is_end_of_file()); } TEST_F(FileReaderTest, TestSkipRowGroup) { - auto file = _create_file(_file2_path); - auto file_reader = std::make_shared(config::vector_chunk_size, file.get(), - std::filesystem::file_size(_file2_path), _mock_datacache_options()); + auto file_reader = _create_file_reader(_file2_path); // c1 > 10000 auto* ctx = _create_context_for_skip_group(); Status status = file_reader->init(ctx); @@ -1196,9 +1246,7 @@ TEST_F(FileReaderTest, TestSkipRowGroup) { } TEST_F(FileReaderTest, TestMultiFilterWithMultiPage) { - auto file = _create_file(_file3_path); - auto file_reader = std::make_shared(config::vector_chunk_size, file.get(), - std::filesystem::file_size(_file3_path), _mock_datacache_options()); + auto file_reader = _create_file_reader(_file3_path); // c3 = "c", c1 >= 4 auto* ctx = _create_context_for_multi_filter(); Status status = file_reader->init(ctx); @@ -1231,9 +1279,8 @@ TEST_F(FileReaderTest, TestMultiFilterWithMultiPage) { } TEST_F(FileReaderTest, TestOtherFilterWithMultiPage) { - auto file = _create_file(_file3_path); - auto file_reader = std::make_shared(config::vector_chunk_size, file.get(), - std::filesystem::file_size(_file3_path), _mock_datacache_options()); + auto file_reader = _create_file_reader(_file3_path); + // c1 >= 4080 auto* ctx = _create_context_for_late_materialization(); Status status = file_reader->init(ctx); @@ -1256,10 +1303,7 @@ TEST_F(FileReaderTest, TestOtherFilterWithMultiPage) { } TEST_F(FileReaderTest, TestReadStructUpperColumns) { - auto file = _create_file(_file4_path); - auto file_reader = std::make_shared(config::vector_chunk_size, file.get(), - std::filesystem::file_size(_file4_path), _mock_datacache_options()); - ; + auto file_reader = _create_file_reader(_file4_path); // init auto* ctx = _create_context_for_struct_column(); @@ -1290,9 +1334,7 @@ TEST_F(FileReaderTest, TestReadStructUpperColumns) { } TEST_F(FileReaderTest, TestReadWithUpperPred) { - auto file = _create_file(_file4_path); - auto file_reader = std::make_shared(config::vector_chunk_size, file.get(), - std::filesystem::file_size(_file4_path), _mock_datacache_options()); + auto file_reader = _create_file_reader(_file4_path); // init auto* ctx = _create_context_for_upper_pred(); @@ -1316,9 +1358,7 @@ TEST_F(FileReaderTest, TestReadWithUpperPred) { } TEST_F(FileReaderTest, TestReadArray2dColumn) { - auto file = _create_file(_file5_path); - auto file_reader = std::make_shared(config::vector_chunk_size, file.get(), - std::filesystem::file_size(_file5_path), _mock_datacache_options()); + auto file_reader = _create_file_reader(_file5_path); //init auto* ctx = _create_file5_base_context(); @@ -1332,19 +1372,13 @@ TEST_F(FileReaderTest, TestReadArray2dColumn) { EXPECT_EQ(file_reader->_file_metadata->num_rows(), 5); - TypeDescriptor type_outer(LogicalType::TYPE_ARRAY); - type_outer.children.emplace_back(TYPE_INT_ARRAY_DESC); - ChunkPtr chunk = std::make_shared(); _append_column_for_chunk(LogicalType::TYPE_INT, &chunk); - auto c = ColumnHelper::create_column(type_outer, true); + auto c = ColumnHelper::create_column(TYPE_INT_ARRAY_ARRAY_DESC, true); chunk->append_column(c, chunk->num_columns()); status = file_reader->get_next(&chunk); ASSERT_TRUE(status.ok()); EXPECT_EQ(chunk->num_rows(), 5); - for (int i = 0; i < chunk->num_rows(); ++i) { - std::cout << "row" << i << ": " << chunk->debug_row(i) << std::endl; - } EXPECT_EQ(chunk->debug_row(0), "[1, [[1,2]]]"); EXPECT_EQ(chunk->debug_row(1), "[2, [[1,2],[3,4]]]"); EXPECT_EQ(chunk->debug_row(2), "[3, [[1,2,3],[4]]]"); @@ -1353,9 +1387,7 @@ TEST_F(FileReaderTest, TestReadArray2dColumn) { } TEST_F(FileReaderTest, TestReadRequiredArrayColumns) { - auto file = _create_file(_file6_path); - auto file_reader = std::make_shared(config::vector_chunk_size, file.get(), - std::filesystem::file_size(_file6_path), _mock_datacache_options()); + auto file_reader = _create_file_reader(_file6_path); // init auto* ctx = _create_file6_base_context(); @@ -1375,10 +1407,7 @@ TEST_F(FileReaderTest, TestReadRequiredArrayColumns) { // when key type is char or varchar, not string // the real type is BYTE_ARRAY which is OPTIONAL TEST_F(FileReaderTest, TestReadMapCharKeyColumn) { - auto file = _create_file(_file_map_char_key_path); - auto file_reader = std::make_shared(config::vector_chunk_size, file.get(), - std::filesystem::file_size(_file_map_char_key_path), - _mock_datacache_options()); + auto file_reader = _create_file_reader(_file_map_char_key_path); //init auto* ctx = _create_file_map_char_key_context(); @@ -1394,19 +1423,12 @@ TEST_F(FileReaderTest, TestReadMapCharKeyColumn) { EXPECT_EQ(ranges.size(), 5); EXPECT_EQ(file_reader->_file_metadata->num_rows(), 1); - TypeDescriptor type_map_char(LogicalType::TYPE_MAP); - type_map_char.children.emplace_back(TYPE_CHAR_DESC); - type_map_char.children.emplace_back(TYPE_INT_DESC); - - TypeDescriptor type_map_varchar(LogicalType::TYPE_MAP); - type_map_varchar.children.emplace_back(TYPE_VARCHAR_DESC); - type_map_varchar.children.emplace_back(TYPE_INT_DESC); ChunkPtr chunk = std::make_shared(); _append_column_for_chunk(LogicalType::TYPE_INT, &chunk); - auto c = ColumnHelper::create_column(type_map_char, true); + auto c = ColumnHelper::create_column(TYPE_CHAR_INT_MAP_DESC, true); chunk->append_column(c, chunk->num_columns()); - auto c_map1 = ColumnHelper::create_column(type_map_varchar, true); + auto c_map1 = ColumnHelper::create_column(TYPE_VARCHAR_INT_MAP_DESC, true); chunk->append_column(c_map1, chunk->num_columns()); status = file_reader->get_next(&chunk); @@ -1416,10 +1438,7 @@ TEST_F(FileReaderTest, TestReadMapCharKeyColumn) { } TEST_F(FileReaderTest, TestReadMapColumn) { - auto file = _create_file(_file_map_path); - auto file_reader = - std::make_shared(config::vector_chunk_size, file.get(), - std::filesystem::file_size(_file_map_path), _mock_datacache_options()); + auto file_reader = _create_file_reader(_file_map_path); //init auto* ctx = _create_file_map_base_context(); @@ -1435,25 +1454,16 @@ TEST_F(FileReaderTest, TestReadMapColumn) { EXPECT_EQ(ranges.size(), 8); EXPECT_EQ(file_reader->_file_metadata->num_rows(), 8); - TypeDescriptor type_map(LogicalType::TYPE_MAP); - type_map.children.emplace_back(TYPE_VARCHAR_DESC); - type_map.children.emplace_back(TYPE_INT_DESC); - TypeDescriptor type_map_map(LogicalType::TYPE_MAP); - type_map_map.children.emplace_back(TYPE_VARCHAR_DESC); - type_map_map.children.emplace_back(type_map); - - TypeDescriptor type_map_array(LogicalType::TYPE_MAP); - type_map_array.children.emplace_back(TYPE_VARCHAR_DESC); - type_map_array.children.emplace_back(TYPE_INT_ARRAY_DESC); + TypeDescriptor type_map_map = TypeDescriptor::create_map_type(TYPE_VARCHAR_DESC, TYPE_VARCHAR_INT_MAP_DESC); ChunkPtr chunk = std::make_shared(); _append_column_for_chunk(LogicalType::TYPE_INT, &chunk); - auto c = ColumnHelper::create_column(type_map, true); + auto c = ColumnHelper::create_column(TYPE_VARCHAR_INT_MAP_DESC, true); chunk->append_column(c, chunk->num_columns()); auto c_map_map = ColumnHelper::create_column(type_map_map, true); chunk->append_column(c_map_map, chunk->num_columns()); - auto c_map_array = ColumnHelper::create_column(type_map_array, true); + auto c_map_array = ColumnHelper::create_column(TYPE_VARCHAR_INTARRAY_MAP_DESC, true); chunk->append_column(c_map_array, chunk->num_columns()); status = file_reader->get_next(&chunk); @@ -1471,52 +1481,18 @@ TEST_F(FileReaderTest, TestReadMapColumn) { } TEST_F(FileReaderTest, TestReadStruct) { - auto file = _create_file(_file4_path); - auto file_reader = std::make_shared(config::vector_chunk_size, file.get(), - std::filesystem::file_size(_file4_path), _mock_datacache_options()); + auto file_reader = _create_file_reader(_file4_path); // --------------init context--------------- - auto ctx = _create_scan_context(); - - TypeDescriptor c1 = TYPE_INT_DESC; - - TypeDescriptor c2 = TypeDescriptor::from_logical_type(LogicalType::TYPE_STRUCT); - // Test unordered field name - c2.children.emplace_back(TYPE_VARCHAR_DESC); - c2.field_names.emplace_back("f2"); - - c2.children.emplace_back(TYPE_INT_DESC); - c2.field_names.emplace_back("f1"); - - TypeDescriptor f3 = TYPE_INT_ARRAY_DESC; - - c2.children.emplace_back(f3); - c2.field_names.emplace_back("f3"); - - TypeDescriptor c3 = TYPE_VARCHAR_DESC; - - TypeDescriptor c4 = TypeDescriptor::from_logical_type(LogicalType::TYPE_ARRAY); - - // start to build inner struct - TypeDescriptor c4_struct = TypeDescriptor::from_logical_type(LogicalType::TYPE_STRUCT); - c4_struct.children.emplace_back(TYPE_INT_DESC); - c4_struct.field_names.emplace_back("e1"); - - c4_struct.children.emplace_back(TYPE_VARCHAR_DESC); - c4_struct.field_names.emplace_back("e2"); - // end to build inner struct - - c4.children.emplace_back(c4_struct); - - TypeDescriptor B1 = TYPE_VARCHAR_DESC; + TypeDescriptor c2 = TypeDescriptor::create_struct_type({"f2", "f1", "f3"}, + {TYPE_VARCHAR_DESC, TYPE_INT_DESC, TYPE_INT_ARRAY_DESC}); + TypeDescriptor c4_struct = TypeDescriptor::create_struct_type({"e1", "e2"}, {TYPE_INT_DESC, TYPE_VARCHAR_DESC}); + TypeDescriptor c4 = TypeDescriptor::create_array_type(c4_struct); Utils::SlotDesc slot_descs[] = { - {"c1", c1}, {"c2", c2}, {"c3", c3}, {"c4", c4}, {"B1", B1}, {""}, + {"c1", TYPE_INT_DESC}, {"c2", c2}, {"c3", TYPE_VARCHAR_DESC}, {"c4", c4}, {"B1", TYPE_VARCHAR_DESC}, {""}, }; - TupleDescriptor* tuple_desc = Utils::create_tuple_descriptor(_runtime_state, &_pool, slot_descs); - Utils::make_column_info_vector(tuple_desc, &ctx->materialized_columns); - ctx->slot_descs = tuple_desc->slots(); - ctx->scan_range = (_create_scan_range(_file4_path)); + auto ctx = _create_scan_context(slot_descs, _file4_path); // --------------finish init context--------------- Status status = file_reader->init(ctx); @@ -1531,11 +1507,11 @@ TEST_F(FileReaderTest, TestReadStruct) { EXPECT_EQ(ranges.size(), 8); auto chunk = std::make_shared(); - chunk->append_column(ColumnHelper::create_column(c1, true), chunk->num_columns()); + chunk->append_column(ColumnHelper::create_column(TYPE_INT_DESC, true), chunk->num_columns()); chunk->append_column(ColumnHelper::create_column(c2, true), chunk->num_columns()); - chunk->append_column(ColumnHelper::create_column(c3, true), chunk->num_columns()); + chunk->append_column(ColumnHelper::create_column(TYPE_VARCHAR_DESC, true), chunk->num_columns()); chunk->append_column(ColumnHelper::create_column(c4, true), chunk->num_columns()); - chunk->append_column(ColumnHelper::create_column(B1, true), chunk->num_columns()); + chunk->append_column(ColumnHelper::create_column(TYPE_VARCHAR_DESC, true), chunk->num_columns()); status = file_reader->get_next(&chunk); ASSERT_TRUE(status.ok()); @@ -1551,54 +1527,20 @@ TEST_F(FileReaderTest, TestReadStruct) { EXPECT_EQ("[7, {f2:'a',f1:7,f3:[7,8,9]}, 'a', [{e1:7,e2:'a'},{e1:8,e2:'a'}], 'A']", chunk->debug_row(7)); EXPECT_EQ("[8, {f2:'a',f1:8,f3:[8,9,10]}, 'a', [{e1:8,e2:'a'},{e1:9,e2:'a'}], 'A']", chunk->debug_row(8)); EXPECT_EQ("[9, {f2:'a',f1:9,f3:[9,10,11]}, 'a', [{e1:9,e2:'a'},{e1:10,e2:'a'}], 'A']", chunk->debug_row(9)); - - // for (int i = 0; i < 10; ++i) { - // std::cout << "row" << i << ": " << chunk->debug_row(i) << std::endl; - // } } TEST_F(FileReaderTest, TestReadStructSubField) { - auto file = _create_file(_file4_path); - auto file_reader = std::make_shared(config::vector_chunk_size, file.get(), - std::filesystem::file_size(_file4_path), _mock_datacache_options()); + auto file_reader = _create_file_reader(_file4_path); // --------------init context--------------- - auto ctx = _create_scan_context(); - - TypeDescriptor c1 = TYPE_INT_DESC; - - TypeDescriptor c2 = TypeDescriptor::from_logical_type(LogicalType::TYPE_STRUCT); - - c2.children.emplace_back(TYPE_INT_DESC); - c2.field_names.emplace_back("f1"); - - TypeDescriptor f3 = TYPE_INT_ARRAY_DESC; - - c2.children.emplace_back(f3); - c2.field_names.emplace_back("f3"); - - TypeDescriptor c3 = TYPE_VARCHAR_DESC; - - TypeDescriptor c4 = TypeDescriptor::from_logical_type(LogicalType::TYPE_ARRAY); - - // start to build inner struct - // dont't load subfield e1 - TypeDescriptor c4_struct = TypeDescriptor::from_logical_type(LogicalType::TYPE_STRUCT); - c4_struct.children.emplace_back(TypeDescriptor::from_logical_type(LogicalType::TYPE_VARCHAR)); - c4_struct.field_names.emplace_back("e2"); - // end to build inner struct - - c4.children.emplace_back(c4_struct); - - TypeDescriptor B1 = TypeDescriptor::from_logical_type(LogicalType::TYPE_VARCHAR); + TypeDescriptor c2 = TypeDescriptor::create_struct_type({"f1", "f3"}, {TYPE_INT_DESC, TYPE_INT_ARRAY_DESC}); + TypeDescriptor c4_struct = TypeDescriptor::create_struct_type({"e2"}, {TYPE_VARCHAR_DESC}); + TypeDescriptor c4 = TypeDescriptor::create_array_type(c4_struct); Utils::SlotDesc slot_descs[] = { - {"c1", c1}, {"c2", c2}, {"c3", c3}, {"c4", c4}, {"B1", B1}, {""}, + {"c1", TYPE_INT_DESC}, {"c2", c2}, {"c3", TYPE_VARCHAR_DESC}, {"c4", c4}, {"B1", TYPE_VARCHAR_DESC}, {""}, }; - TupleDescriptor* tuple_desc = Utils::create_tuple_descriptor(_runtime_state, &_pool, slot_descs); - Utils::make_column_info_vector(tuple_desc, &ctx->materialized_columns); - ctx->slot_descs = tuple_desc->slots(); - ctx->scan_range = (_create_scan_range(_file4_path)); + auto ctx = _create_scan_context(slot_descs, _file4_path); // --------------finish init context--------------- Status status = file_reader->init(ctx); @@ -1613,11 +1555,11 @@ TEST_F(FileReaderTest, TestReadStructSubField) { EXPECT_EQ(ranges.size(), 6); auto chunk = std::make_shared(); - chunk->append_column(ColumnHelper::create_column(c1, true), chunk->num_columns()); + chunk->append_column(ColumnHelper::create_column(TYPE_INT_DESC, true), chunk->num_columns()); chunk->append_column(ColumnHelper::create_column(c2, true), chunk->num_columns()); - chunk->append_column(ColumnHelper::create_column(c3, true), chunk->num_columns()); + chunk->append_column(ColumnHelper::create_column(TYPE_VARCHAR_DESC, true), chunk->num_columns()); chunk->append_column(ColumnHelper::create_column(c4, true), chunk->num_columns()); - chunk->append_column(ColumnHelper::create_column(B1, true), chunk->num_columns()); + chunk->append_column(ColumnHelper::create_column(TYPE_VARCHAR_DESC, true), chunk->num_columns()); status = file_reader->get_next(&chunk); ASSERT_TRUE(status.ok()); @@ -1640,40 +1582,18 @@ TEST_F(FileReaderTest, TestReadStructSubField) { } TEST_F(FileReaderTest, TestReadStructAbsentSubField) { - auto file = _create_file(_file4_path); - auto file_reader = std::make_shared(config::vector_chunk_size, file.get(), - std::filesystem::file_size(_file4_path), _mock_datacache_options()); + auto file_reader = _create_file_reader(_file4_path); // --------------init context--------------- - auto ctx = _create_scan_context(); - - TypeDescriptor c1 = TYPE_INT_DESC; - - TypeDescriptor c2 = TypeDescriptor::from_logical_type(LogicalType::TYPE_STRUCT); - - c2.children.emplace_back(TYPE_INT_DESC); - c2.field_names.emplace_back("f1"); - - c2.children.emplace_back(TYPE_VARCHAR_DESC); - c2.field_names.emplace_back("f2"); - - TypeDescriptor f3 = TYPE_INT_ARRAY_DESC; - - c2.children.emplace_back(f3); - c2.field_names.emplace_back("f3"); - - c2.children.emplace_back(TYPE_VARCHAR_DESC); - c2.field_names.emplace_back("not_existed"); - + TypeDescriptor c2 = TypeDescriptor::create_struct_type( + {"f1", "f2", "f3", "not_existed"}, + {TYPE_INT_DESC, TYPE_VARCHAR_DESC, TYPE_INT_ARRAY_DESC, TYPE_VARCHAR_DESC}); Utils::SlotDesc slot_descs[] = { - {"c1", c1}, + {"c1", TYPE_INT_DESC}, {"c2", c2}, {""}, }; - TupleDescriptor* tuple_desc = Utils::create_tuple_descriptor(_runtime_state, &_pool, slot_descs); - Utils::make_column_info_vector(tuple_desc, &ctx->materialized_columns); - ctx->slot_descs = tuple_desc->slots(); - ctx->scan_range = (_create_scan_range(_file4_path)); + auto ctx = _create_scan_context(slot_descs, _file4_path); // --------------finish init context--------------- Status status = file_reader->init(ctx); @@ -1682,7 +1602,7 @@ TEST_F(FileReaderTest, TestReadStructAbsentSubField) { EXPECT_EQ(file_reader->row_group_size(), 1); auto chunk = std::make_shared(); - chunk->append_column(ColumnHelper::create_column(c1, true), chunk->num_columns()); + chunk->append_column(ColumnHelper::create_column(TYPE_INT_DESC, true), chunk->num_columns()); chunk->append_column(ColumnHelper::create_column(c2, true), chunk->num_columns()); status = file_reader->get_next(&chunk); @@ -1697,45 +1617,23 @@ TEST_F(FileReaderTest, TestReadStructAbsentSubField) { } TEST_F(FileReaderTest, TestReadStructCaseSensitive) { - auto file = _create_file(_file4_path); - auto file_reader = std::make_shared(config::vector_chunk_size, file.get(), - std::filesystem::file_size(_file4_path), _mock_datacache_options()); + auto file_reader = _create_file_reader(_file4_path); // --------------init context--------------- - auto ctx = _create_scan_context(); - - TypeDescriptor c1 = TYPE_INT_DESC; - - TypeDescriptor c2 = TypeDescriptor::from_logical_type(LogicalType::TYPE_STRUCT); - - c2.children.emplace_back(TYPE_INT_DESC); - c2.field_names.emplace_back("F1"); + TypeDescriptor c2 = TypeDescriptor::create_struct_type({"F1", "F2", "F3"}, + {TYPE_INT_DESC, TYPE_VARCHAR_DESC, TYPE_INT_ARRAY_DESC}); - c2.children.emplace_back(TYPE_VARCHAR_DESC); - c2.field_names.emplace_back("F2"); - - TypeDescriptor f3 = TYPE_INT_ARRAY_DESC; - - c2.children.emplace_back(f3); - c2.field_names.emplace_back("F3"); - - Utils::SlotDesc slot_descs[] = {{"c1", c1}, {"c2", c2}, {""}}; - TupleDescriptor* tuple_desc = Utils::create_tuple_descriptor(_runtime_state, &_pool, slot_descs); - Utils::make_column_info_vector(tuple_desc, &ctx->materialized_columns); - ctx->slot_descs = tuple_desc->slots(); - ctx->scan_range = (_create_scan_range(_file4_path)); + Utils::SlotDesc slot_descs[] = {{"c1", TYPE_INT_DESC}, {"c2", c2}, {""}}; + auto ctx = _create_scan_context(slot_descs, _file4_path); // --------------finish init context--------------- Status status = file_reader->init(ctx); - if (!status.ok()) { - std::cout << status.message() << std::endl; - } ASSERT_TRUE(status.ok()); EXPECT_EQ(file_reader->row_group_size(), 1); auto chunk = std::make_shared(); - chunk->append_column(ColumnHelper::create_column(c1, true), chunk->num_columns()); + chunk->append_column(ColumnHelper::create_column(TYPE_INT_DESC, true), chunk->num_columns()); chunk->append_column(ColumnHelper::create_column(c2, true), chunk->num_columns()); status = file_reader->get_next(&chunk); @@ -1743,110 +1641,54 @@ TEST_F(FileReaderTest, TestReadStructCaseSensitive) { ASSERT_EQ(1024, chunk->num_rows()); EXPECT_EQ("[0, {F1:0,F2:'a',F3:[0,1,2]}]", chunk->debug_row(0)); - - // for (int i = 0; i < 1; ++i) { - // std::cout << "row" << i << ": " << chunk->debug_row(i) << std::endl; - // } } TEST_F(FileReaderTest, TestReadStructCaseSensitiveError) { - auto file = _create_file(_file4_path); - auto file_reader = std::make_shared(config::vector_chunk_size, file.get(), - std::filesystem::file_size(_file4_path), _mock_datacache_options()); + auto file_reader = _create_file_reader(_file4_path); // --------------init context--------------- - auto ctx = _create_scan_context(); - ctx->case_sensitive = true; - - TypeDescriptor c1 = TYPE_INT_DESC; + TypeDescriptor c2 = TypeDescriptor::create_struct_type({"F1", "F2", "F3"}, + {TYPE_INT_DESC, TYPE_VARCHAR_DESC, TYPE_INT_ARRAY_DESC}); - TypeDescriptor c2 = TypeDescriptor::from_logical_type(LogicalType::TYPE_STRUCT); - - c2.children.emplace_back(TYPE_INT_DESC); - c2.field_names.emplace_back("F1"); - - c2.children.emplace_back(TYPE_VARCHAR_DESC); - c2.field_names.emplace_back("F2"); - - TypeDescriptor f3 = TYPE_INT_ARRAY_DESC; - - c2.children.emplace_back(f3); - c2.field_names.emplace_back("F3"); - - Utils::SlotDesc slot_descs[] = {{"c1", c1}, {"c2", c2}, {""}}; - TupleDescriptor* tuple_desc = Utils::create_tuple_descriptor(_runtime_state, &_pool, slot_descs); - Utils::make_column_info_vector(tuple_desc, &ctx->materialized_columns); - ctx->slot_descs = tuple_desc->slots(); - ctx->scan_range = (_create_scan_range(_file4_path)); + Utils::SlotDesc slot_descs[] = {{"c1", TYPE_INT_DESC}, {"c2", c2}, {""}}; + auto ctx = _create_scan_context(slot_descs, _file4_path); + ctx->case_sensitive = true; // --------------finish init context--------------- - Status status = file_reader->init(ctx); - EXPECT_TRUE(status.ok()); - EXPECT_EQ(file_reader->row_group_size(), 1); + ASSERT_OK(file_reader->init(ctx)); + ASSERT_EQ(file_reader->row_group_size(), 1); auto chunk = std::make_shared(); - chunk->append_column(ColumnHelper::create_column(c1, true), chunk->num_columns()); + chunk->append_column(ColumnHelper::create_column(TYPE_INT_DESC, true), chunk->num_columns()); chunk->append_column(ColumnHelper::create_column(c2, true), chunk->num_columns()); - status = file_reader->get_next(&chunk); - ASSERT_TRUE(status.ok()); + ASSERT_OK(file_reader->get_next(&chunk)); ASSERT_EQ(1024, chunk->num_rows()); EXPECT_EQ("[0, NULL]", chunk->debug_row(0)); } TEST_F(FileReaderTest, TestReadStructNull) { - auto file = _create_file(_file_struct_null_path); - auto file_reader = - std::make_shared(config::vector_chunk_size, file.get(), - std::filesystem::file_size(_file_struct_null_path), _mock_datacache_options()); + auto file_reader = _create_file_reader(_file_struct_null_path); // --------------init context--------------- - auto ctx = _create_scan_context(); - ctx->case_sensitive = false; - - TypeDescriptor c0 = TYPE_INT_DESC; - - TypeDescriptor c1 = TypeDescriptor::from_logical_type(LogicalType::TYPE_STRUCT); - - c1.children.emplace_back(TYPE_INT_DESC); - c1.field_names.emplace_back("c1_0"); - - c1.children.emplace_back(TypeDescriptor::from_logical_type(LogicalType::TYPE_ARRAY)); - c1.field_names.emplace_back("c1_1"); - - c1.children.at(1).children.emplace_back(TYPE_INT_DESC); + TypeDescriptor c1 = TypeDescriptor::create_struct_type({"c1_0", "c1_1"}, {TYPE_INT_DESC, TYPE_INT_ARRAY_DESC}); + TypeDescriptor c2_struct = TypeDescriptor::create_struct_type({"c2_0", "c2_1"}, {TYPE_INT_DESC, TYPE_INT_DESC}); + TypeDescriptor c2 = TypeDescriptor::create_array_type(c2_struct); - TypeDescriptor c2 = TypeDescriptor::from_logical_type(LogicalType::TYPE_ARRAY); - TypeDescriptor c2_struct = TypeDescriptor::from_logical_type(LogicalType::TYPE_STRUCT); - c2_struct.children.emplace_back(TYPE_INT_DESC); - c2_struct.children.emplace_back(TYPE_INT_DESC); - c2_struct.field_names.emplace_back("c2_0"); - c2_struct.field_names.emplace_back("c2_1"); - c2.children.emplace_back(c2_struct); + Utils::SlotDesc slot_descs[] = {{"c0", TYPE_INT_DESC}, {"c1", c1}, {"c2", c2}, {""}}; - Utils::SlotDesc slot_descs[] = {{"c0", c0}, {"c1", c1}, {"c2", c2}, {""}}; - - TupleDescriptor* tuple_desc = Utils::create_tuple_descriptor(_runtime_state, &_pool, slot_descs); - Utils::make_column_info_vector(tuple_desc, &ctx->materialized_columns); - ctx->slot_descs = tuple_desc->slots(); - ctx->scan_range = (_create_scan_range(_file_struct_null_path)); + auto ctx = _create_scan_context(slot_descs, _file_struct_null_path); // --------------finish init context--------------- - Status status = file_reader->init(ctx); - if (!status.ok()) { - std::cout << status.message() << std::endl; - } - ASSERT_TRUE(status.ok()); - - EXPECT_EQ(file_reader->row_group_size(), 1); + ASSERT_OK(file_reader->init(ctx)); + ASSERT_EQ(file_reader->row_group_size(), 1); auto chunk = std::make_shared(); - chunk->append_column(ColumnHelper::create_column(c0, true), chunk->num_columns()); + chunk->append_column(ColumnHelper::create_column(TYPE_INT_DESC, true), chunk->num_columns()); chunk->append_column(ColumnHelper::create_column(c1, true), chunk->num_columns()); chunk->append_column(ColumnHelper::create_column(c2, true), chunk->num_columns()); - status = file_reader->get_next(&chunk); - ASSERT_TRUE(status.ok()); + ASSERT_OK(file_reader->get_next(&chunk)); ASSERT_EQ(4, chunk->num_rows()); EXPECT_EQ("[1, {c1_0:1,c1_1:[1,2,3]}, [{c2_0:1,c2_1:1},{c2_0:2,c2_1:2},{c2_0:3,c2_1:3}]]", chunk->debug_row(0)); @@ -1858,40 +1700,22 @@ TEST_F(FileReaderTest, TestReadStructNull) { } TEST_F(FileReaderTest, TestReadBinary) { - auto file = _create_file(_file_binary_path); - auto file_reader = - std::make_shared(config::vector_chunk_size, file.get(), - std::filesystem::file_size(_file_binary_path), _mock_datacache_options()); + auto file_reader = _create_file_reader(_file_binary_path); // --------------init context--------------- - auto ctx = _create_scan_context(); - ctx->case_sensitive = false; - - TypeDescriptor k1 = TYPE_INT_DESC; - TypeDescriptor k2 = TYPE_VARBINARY_DESC; - - Utils::SlotDesc slot_descs[] = {{"k1", k1}, {"k2", k2}, {""}}; + Utils::SlotDesc slot_descs[] = {{"k1", TYPE_INT_DESC}, {"k2", TYPE_VARBINARY_DESC}, {""}}; - TupleDescriptor* tuple_desc = Utils::create_tuple_descriptor(_runtime_state, &_pool, slot_descs); - Utils::make_column_info_vector(tuple_desc, &ctx->materialized_columns); - ctx->slot_descs = tuple_desc->slots(); - ctx->scan_range = (_create_scan_range(_file_binary_path)); + auto ctx = _create_scan_context(slot_descs, _file_binary_path); // --------------finish init context--------------- - Status status = file_reader->init(ctx); - if (!status.ok()) { - std::cout << status.message() << std::endl; - } - ASSERT_TRUE(status.ok()); - - EXPECT_EQ(file_reader->row_group_size(), 1); + ASSERT_OK(file_reader->init(ctx)); + ASSERT_EQ(file_reader->row_group_size(), 1); auto chunk = std::make_shared(); - chunk->append_column(ColumnHelper::create_column(k1, true), chunk->num_columns()); - chunk->append_column(ColumnHelper::create_column(k2, true), chunk->num_columns()); + chunk->append_column(ColumnHelper::create_column(TYPE_INT_DESC, true), chunk->num_columns()); + chunk->append_column(ColumnHelper::create_column(TYPE_VARBINARY_DESC, true), chunk->num_columns()); - status = file_reader->get_next(&chunk); - ASSERT_TRUE(status.ok()); + ASSERT_OK(file_reader->get_next(&chunk)); ASSERT_EQ(1, chunk->num_rows()); std::string s = chunk->debug_row(0); @@ -1899,10 +1723,7 @@ TEST_F(FileReaderTest, TestReadBinary) { } TEST_F(FileReaderTest, TestReadMapColumnWithPartialMaterialize) { - auto file = _create_file(_file_map_path); - auto file_reader = - std::make_shared(config::vector_chunk_size, file.get(), - std::filesystem::file_size(_file_map_path), _mock_datacache_options()); + auto file_reader = _create_file_reader(_file_map_path); //init auto* ctx = _create_file_map_partial_materialize_context(); @@ -1920,23 +1741,13 @@ TEST_F(FileReaderTest, TestReadMapColumnWithPartialMaterialize) { EXPECT_EQ(ranges.size(), 4); EXPECT_EQ(file_reader->_file_metadata->num_rows(), 8); - TypeDescriptor type_map(LogicalType::TYPE_MAP); - type_map.children.emplace_back(TYPE_VARCHAR_DESC); - type_map.children.emplace_back(TYPE_UNKNOWN_DESC); - - TypeDescriptor type_map_map(LogicalType::TYPE_MAP); - type_map_map.children.emplace_back(TYPE_UNKNOWN_DESC); - type_map_map.children.emplace_back(type_map); - TypeDescriptor type_array(LogicalType::TYPE_ARRAY); - type_array.children.emplace_back(TYPE_INT_DESC); - TypeDescriptor type_map_array(LogicalType::TYPE_MAP); - type_map_array.children.emplace_back(TYPE_UNKNOWN_DESC); - type_map_array.children.emplace_back(type_array); + TypeDescriptor type_map_map = TypeDescriptor::create_map_type(TYPE_UNKNOWN_DESC, TYPE_VARCHAR_UNKNOWN_MAP_DESC); + TypeDescriptor type_map_array = TypeDescriptor::create_map_type(TYPE_UNKNOWN_DESC, TYPE_INT_ARRAY_DESC); ChunkPtr chunk = std::make_shared(); _append_column_for_chunk(LogicalType::TYPE_INT, &chunk); - auto c = ColumnHelper::create_column(type_map, true); + auto c = ColumnHelper::create_column(TYPE_VARCHAR_UNKNOWN_MAP_DESC, true); chunk->append_column(c, chunk->num_columns()); auto c_map_map = ColumnHelper::create_column(type_map_map, true); chunk->append_column(c_map_map, chunk->num_columns()); @@ -1960,48 +1771,27 @@ TEST_F(FileReaderTest, TestReadMapColumnWithPartialMaterialize) { } TEST_F(FileReaderTest, TestReadNotNull) { - auto file = _create_file(_file_col_not_null_path); - auto file_reader = std::make_shared(config::vector_chunk_size, file.get(), - std::filesystem::file_size(_file_col_not_null_path), - _mock_datacache_options()); + auto file_reader = _create_file_reader(_file_col_not_null_path); // --------------init context--------------- - auto ctx = _create_scan_context(); - - TypeDescriptor type_int = TYPE_INT_DESC; - - TypeDescriptor type_map(LogicalType::TYPE_MAP); - type_map.children.emplace_back(TYPE_VARCHAR_DESC); - type_map.children.emplace_back(TYPE_INT_DESC); - - TypeDescriptor type_struct = TypeDescriptor::from_logical_type(LogicalType::TYPE_STRUCT); - type_struct.children.emplace_back(TYPE_VARCHAR_DESC); - type_struct.field_names.emplace_back("a"); - - type_struct.children.emplace_back(TYPE_INT_DESC); - type_struct.field_names.emplace_back("b"); - + TypeDescriptor type_struct = TypeDescriptor::create_struct_type({"a", "b"}, {TYPE_VARCHAR_DESC, TYPE_INT_DESC}); Utils::SlotDesc slot_descs[] = { - {"col_int", type_int}, - {"col_map", type_map}, + {"col_int", TYPE_INT_DESC}, + {"col_map", TYPE_VARCHAR_INT_MAP_DESC}, {"col_struct", type_struct}, {""}, }; - - TupleDescriptor* tuple_desc = Utils::create_tuple_descriptor(_runtime_state, &_pool, slot_descs); - Utils::make_column_info_vector(tuple_desc, &ctx->materialized_columns); - ctx->slot_descs = tuple_desc->slots(); - ctx->scan_range = (_create_scan_range(_file_col_not_null_path)); + auto ctx = _create_scan_context(slot_descs, _file_col_not_null_path); // --------------finish init context--------------- Status status = file_reader->init(ctx); ASSERT_TRUE(status.ok()); - EXPECT_EQ(file_reader->_row_group_readers.size(), 1); + ASSERT_EQ(file_reader->row_group_size(), 1); auto chunk = std::make_shared(); - chunk->append_column(ColumnHelper::create_column(type_int, true), chunk->num_columns()); - chunk->append_column(ColumnHelper::create_column(type_map, true), chunk->num_columns()); + chunk->append_column(ColumnHelper::create_column(TYPE_INT_DESC, true), chunk->num_columns()); + chunk->append_column(ColumnHelper::create_column(TYPE_VARCHAR_INT_MAP_DESC, true), chunk->num_columns()); chunk->append_column(ColumnHelper::create_column(type_struct, true), chunk->num_columns()); status = file_reader->get_next(&chunk); @@ -2017,38 +1807,25 @@ TEST_F(FileReaderTest, TestTwoNestedLevelArray) { // format: // id: INT, b: ARRAY> const std::string filepath = "./be/test/exec/test_data/parquet_data/two_level_nested_array.parquet"; - auto file = _create_file(filepath); - auto file_reader = std::make_shared(config::vector_chunk_size, file.get(), - std::filesystem::file_size(filepath), _mock_datacache_options()); + auto file_reader = _create_file_reader(filepath); // --------------init context--------------- - auto ctx = _create_scan_context(); - - TypeDescriptor type_int = TYPE_INT_DESC; - - TypeDescriptor type_array = TypeDescriptor::from_logical_type(LogicalType::TYPE_ARRAY); - - type_array.children.emplace_back(TYPE_INT_ARRAY_DESC); - + TypeDescriptor type_array = TypeDescriptor::create_array_type(TYPE_INT_ARRAY_DESC); Utils::SlotDesc slot_descs[] = { - {"id", type_int}, + {"id", TYPE_INT_DESC}, {"b", type_array}, {""}, }; - - TupleDescriptor* tuple_desc = Utils::create_tuple_descriptor(_runtime_state, &_pool, slot_descs); - Utils::make_column_info_vector(tuple_desc, &ctx->materialized_columns); - ctx->slot_descs = tuple_desc->slots(); - ctx->scan_range = (_create_scan_range(_file_col_not_null_path)); + auto ctx = _create_scan_context(slot_descs, filepath); // --------------finish init context--------------- Status status = file_reader->init(ctx); ASSERT_TRUE(status.ok()); - EXPECT_EQ(file_reader->_row_group_readers.size(), 1); + ASSERT_EQ(file_reader->row_group_size(), 1); auto chunk = std::make_shared(); - chunk->append_column(ColumnHelper::create_column(type_int, true), chunk->num_columns()); + chunk->append_column(ColumnHelper::create_column(TYPE_INT_DESC, true), chunk->num_columns()); chunk->append_column(ColumnHelper::create_column(type_array, true), chunk->num_columns()); status = file_reader->get_next(&chunk); @@ -2078,43 +1855,27 @@ TEST_F(FileReaderTest, TestTwoNestedLevelArray) { } TEST_F(FileReaderTest, TestReadMapNull) { - auto file = _create_file(_file_map_null_path); - auto file_reader = - std::make_shared(config::vector_chunk_size, file.get(), - std::filesystem::file_size(_file_map_null_path), _mock_datacache_options()); + auto file_reader = _create_file_reader(_file_map_null_path); // --------------init context--------------- - auto ctx = _create_scan_context(); - - TypeDescriptor type_int = TYPE_INT_DESC; - - TypeDescriptor type_map(LogicalType::TYPE_MAP); - type_map.children.emplace_back(TYPE_VARCHAR_DESC); - type_map.children.emplace_back(TYPE_INT_DESC); - Utils::SlotDesc slot_descs[] = { - {"uuid", type_int}, - {"c1", type_map}, + {"uuid", TYPE_INT_DESC}, + {"c1", TYPE_VARCHAR_INT_MAP_DESC}, {""}, }; - - TupleDescriptor* tuple_desc = Utils::create_tuple_descriptor(_runtime_state, &_pool, slot_descs); - Utils::make_column_info_vector(tuple_desc, &ctx->materialized_columns); - ctx->slot_descs = tuple_desc->slots(); - ctx->scan_range = (_create_scan_range(_file_map_null_path)); + auto ctx = _create_scan_context(slot_descs, _file_map_null_path); // --------------finish init context--------------- Status status = file_reader->init(ctx); ASSERT_TRUE(status.ok()); - EXPECT_EQ(file_reader->_row_group_readers.size(), 1); + ASSERT_EQ(file_reader->row_group_size(), 1); auto chunk = std::make_shared(); - chunk->append_column(ColumnHelper::create_column(type_int, true), chunk->num_columns()); - chunk->append_column(ColumnHelper::create_column(type_map, true), chunk->num_columns()); + chunk->append_column(ColumnHelper::create_column(TYPE_INT_DESC, true), chunk->num_columns()); + chunk->append_column(ColumnHelper::create_column(TYPE_VARCHAR_INT_MAP_DESC, true), chunk->num_columns()); - status = file_reader->get_next(&chunk); - ASSERT_TRUE(status.ok()); + ASSERT_OK(file_reader->get_next(&chunk)); ASSERT_EQ(3, chunk->num_rows()); EXPECT_EQ("[1, NULL]", chunk->debug_row(0)); @@ -2133,33 +1894,16 @@ TEST_F(FileReaderTest, TestReadArrayMap) { // } // } - auto file = _create_file(_file_array_map_path); - auto file_reader = - std::make_shared(config::vector_chunk_size, file.get(), - std::filesystem::file_size(_file_array_map_path), _mock_datacache_options()); + auto file_reader = _create_file_reader(_file_array_map_path); // --------------init context--------------- - auto ctx = _create_scan_context(); - - TypeDescriptor type_string = TYPE_VARCHAR_DESC; - - TypeDescriptor type_map(LogicalType::TYPE_MAP); - type_map.children.emplace_back(TYPE_VARBINARY_DESC); - type_map.children.emplace_back(TYPE_INT_DESC); - - TypeDescriptor type_array_map(LogicalType::TYPE_ARRAY); - type_array_map.children.emplace_back(type_map); - + TypeDescriptor type_array_map = TypeDescriptor::create_array_type(TYPE_VARBINARY_INT_MAP_DESC); Utils::SlotDesc slot_descs[] = { - {"uuid", type_string}, + {"uuid", TYPE_VARCHAR_DESC}, {"col_array_map", type_array_map}, {""}, }; - - TupleDescriptor* tuple_desc = Utils::create_tuple_descriptor(_runtime_state, &_pool, slot_descs); - Utils::make_column_info_vector(tuple_desc, &ctx->materialized_columns); - ctx->slot_descs = tuple_desc->slots(); - ctx->scan_range = (_create_scan_range(_file_array_map_path)); + auto ctx = _create_scan_context(slot_descs, _file_array_map_path); // --------------finish init context--------------- Status status = file_reader->init(ctx); @@ -2201,42 +1945,20 @@ TEST_F(FileReaderTest, TestStructArrayNull) { // With config's vector chunk size { - auto file = _create_file(filepath); - auto file_reader = std::make_shared( - config::vector_chunk_size, file.get(), std::filesystem::file_size(filepath), _mock_datacache_options()); + auto file_reader = _create_file_reader(filepath); // --------------init context--------------- - auto ctx = _create_scan_context(); - - TypeDescriptor type_int = TYPE_INT_DESC; - - TypeDescriptor type_struct = TypeDescriptor::from_logical_type(LogicalType::TYPE_STRUCT); - type_struct.children.emplace_back(TYPE_INT_DESC); - type_struct.field_names.emplace_back("a"); - - TypeDescriptor type_array = TypeDescriptor::from_logical_type(LogicalType::TYPE_ARRAY); - - TypeDescriptor type_array_struct = TypeDescriptor::from_logical_type(LogicalType::TYPE_STRUCT); - type_array_struct.children.emplace_back(TYPE_INT_DESC); - type_array_struct.field_names.emplace_back("c"); - type_array_struct.children.emplace_back(TYPE_VARBINARY_DESC); - type_array_struct.field_names.emplace_back("d"); - - type_array.children.emplace_back(type_array_struct); - - type_struct.children.emplace_back(type_array); - type_struct.field_names.emplace_back("b"); + TypeDescriptor type_array_struct = + TypeDescriptor::create_struct_type({"c", "d"}, {TYPE_INT_DESC, TYPE_VARBINARY_DESC}); + TypeDescriptor type_array = TypeDescriptor::create_array_type(type_array_struct); + TypeDescriptor type_struct = TypeDescriptor::create_struct_type({"a", "b"}, {TYPE_INT_DESC, type_array}); Utils::SlotDesc slot_descs[] = { - {"id", type_int}, + {"id", TYPE_INT_DESC}, {"col", type_struct}, {""}, }; - - TupleDescriptor* tuple_desc = Utils::create_tuple_descriptor(_runtime_state, &_pool, slot_descs); - Utils::make_column_info_vector(tuple_desc, &ctx->materialized_columns); - ctx->slot_descs = tuple_desc->slots(); - ctx->scan_range = (_create_scan_range(_file_col_not_null_path)); + auto ctx = _create_scan_context(slot_descs, filepath); // --------------finish init context--------------- Status status = file_reader->init(ctx); @@ -2245,7 +1967,7 @@ TEST_F(FileReaderTest, TestStructArrayNull) { EXPECT_EQ(file_reader->row_group_size(), 1); auto chunk = std::make_shared(); - chunk->append_column(ColumnHelper::create_column(type_int, true), chunk->num_columns()); + chunk->append_column(ColumnHelper::create_column(TYPE_INT_DESC, true), chunk->num_columns()); chunk->append_column(ColumnHelper::create_column(type_struct, true), chunk->num_columns()); status = file_reader->get_next(&chunk); @@ -2277,42 +1999,20 @@ TEST_F(FileReaderTest, TestStructArrayNull) { // With 1024 chunk size { - auto file = _create_file(filepath); - auto file_reader = std::make_shared(1024, file.get(), std::filesystem::file_size(filepath), - _mock_datacache_options()); + auto file_reader = _create_file_reader(filepath, 1024); // --------------init context--------------- - auto ctx = _create_scan_context(); - - TypeDescriptor type_int = TYPE_INT_DESC; - - TypeDescriptor type_struct = TypeDescriptor::from_logical_type(LogicalType::TYPE_STRUCT); - type_struct.children.emplace_back(TYPE_INT_DESC); - type_struct.field_names.emplace_back("a"); - - TypeDescriptor type_array = TypeDescriptor::from_logical_type(LogicalType::TYPE_ARRAY); - - TypeDescriptor type_array_struct = TypeDescriptor::from_logical_type(LogicalType::TYPE_STRUCT); - type_array_struct.children.emplace_back(TYPE_INT_DESC); - type_array_struct.field_names.emplace_back("c"); - type_array_struct.children.emplace_back(TYPE_VARCHAR_DESC); - type_array_struct.field_names.emplace_back("d"); - - type_array.children.emplace_back(type_array_struct); - - type_struct.children.emplace_back(type_array); - type_struct.field_names.emplace_back("b"); + TypeDescriptor type_array_struct = + TypeDescriptor::create_struct_type({"c", "d"}, {TYPE_INT_DESC, TYPE_VARCHAR_DESC}); + TypeDescriptor type_array = TypeDescriptor::create_array_type(type_array_struct); + TypeDescriptor type_struct = TypeDescriptor::create_struct_type({"a", "b"}, {TYPE_INT_DESC, type_array}); Utils::SlotDesc slot_descs[] = { - {"id", type_int}, + {"id", TYPE_INT_DESC}, {"col", type_struct}, {""}, }; - - TupleDescriptor* tuple_desc = Utils::create_tuple_descriptor(_runtime_state, &_pool, slot_descs); - Utils::make_column_info_vector(tuple_desc, &ctx->materialized_columns); - ctx->slot_descs = tuple_desc->slots(); - ctx->scan_range = (_create_scan_range(_file_col_not_null_path)); + auto ctx = _create_scan_context(slot_descs, filepath); // --------------finish init context--------------- Status status = file_reader->init(ctx); @@ -2321,7 +2021,7 @@ TEST_F(FileReaderTest, TestStructArrayNull) { EXPECT_EQ(file_reader->row_group_size(), 1); auto chunk = std::make_shared(); - chunk->append_column(ColumnHelper::create_column(type_int, true), chunk->num_columns()); + chunk->append_column(ColumnHelper::create_column(TYPE_INT_DESC, true), chunk->num_columns()); chunk->append_column(ColumnHelper::create_column(type_struct, true), chunk->num_columns()); status = file_reader->get_next(&chunk); @@ -2369,42 +2069,20 @@ TEST_F(FileReaderTest, TestComplexTypeNotNull) { // } std::string filepath = "./be/test/exec/test_data/parquet_data/complex_subfield_not_null.parquet"; - auto file = _create_file(filepath); - auto file_reader = std::make_shared(config::vector_chunk_size, file.get(), - std::filesystem::file_size(filepath), _mock_datacache_options()); + auto file_reader = _create_file_reader(filepath); // --------------init context--------------- - auto ctx = _create_scan_context(); - - TypeDescriptor type_int = TYPE_INT_DESC; - - TypeDescriptor type_struct = TypeDescriptor::from_logical_type(LogicalType::TYPE_STRUCT); - type_struct.children.emplace_back(TYPE_INT_DESC); - type_struct.field_names.emplace_back("a"); - - TypeDescriptor type_array = TypeDescriptor::from_logical_type(LogicalType::TYPE_ARRAY); - - TypeDescriptor type_array_struct = TypeDescriptor::from_logical_type(LogicalType::TYPE_STRUCT); - type_array_struct.children.emplace_back(TYPE_INT_DESC); - type_array_struct.field_names.emplace_back("c"); - type_array_struct.children.emplace_back(TYPE_VARBINARY_DESC); - type_array_struct.field_names.emplace_back("d"); - - type_array.children.emplace_back(type_array_struct); - - type_struct.children.emplace_back(type_array); - type_struct.field_names.emplace_back("b"); + TypeDescriptor type_array_struct = + TypeDescriptor::create_struct_type({"c", "d"}, {TYPE_INT_DESC, TYPE_VARBINARY_DESC}); + TypeDescriptor type_array = TypeDescriptor::create_array_type(type_array_struct); + TypeDescriptor type_struct = TypeDescriptor::create_struct_type({"a", "b"}, {TYPE_INT_DESC, type_array}); Utils::SlotDesc slot_descs[] = { - {"id", type_int}, + {"id", TYPE_INT_DESC}, {"col", type_struct}, {""}, }; - - TupleDescriptor* tuple_desc = Utils::create_tuple_descriptor(_runtime_state, &_pool, slot_descs); - Utils::make_column_info_vector(tuple_desc, &ctx->materialized_columns); - ctx->slot_descs = tuple_desc->slots(); - ctx->scan_range = (_create_scan_range(_file_col_not_null_path)); + auto ctx = _create_scan_context(slot_descs, filepath); // --------------finish init context--------------- Status status = file_reader->init(ctx); @@ -2413,7 +2091,7 @@ TEST_F(FileReaderTest, TestComplexTypeNotNull) { EXPECT_EQ(file_reader->row_group_size(), 1); auto chunk = std::make_shared(); - chunk->append_column(ColumnHelper::create_column(type_int, true), chunk->num_columns()); + chunk->append_column(ColumnHelper::create_column(TYPE_INT_DESC, true), chunk->num_columns()); chunk->append_column(ColumnHelper::create_column(type_struct, true), chunk->num_columns()); status = file_reader->get_next(&chunk); @@ -2446,29 +2124,15 @@ TEST_F(FileReaderTest, TestHudiMORTwoNestedLevelArray) { // b: varchar // c: ARRAY> const std::string filepath = "./be/test/exec/test_data/parquet_data/hudi_mor_two_level_nested_array.parquet"; - auto file = _create_file(filepath); - auto file_reader = std::make_shared(config::vector_chunk_size, file.get(), - std::filesystem::file_size(filepath), _mock_datacache_options()); + auto file_reader = _create_file_reader(filepath); // --------------init context--------------- - auto ctx = _create_scan_context(); - - TypeDescriptor type_string = TYPE_VARCHAR_DESC; - - TypeDescriptor type_array = TypeDescriptor::from_logical_type(LogicalType::TYPE_ARRAY); - - type_array.children.emplace_back(TYPE_INT_ARRAY_DESC); - Utils::SlotDesc slot_descs[] = { - {"b", type_string}, - {"c", type_array}, + {"b", TYPE_VARCHAR_DESC}, + {"c", TYPE_INT_ARRAY_ARRAY_DESC}, {""}, }; - - TupleDescriptor* tuple_desc = Utils::create_tuple_descriptor(_runtime_state, &_pool, slot_descs); - Utils::make_column_info_vector(tuple_desc, &ctx->materialized_columns); - ctx->slot_descs = tuple_desc->slots(); - ctx->scan_range = (_create_scan_range(_file_col_not_null_path)); + auto ctx = _create_scan_context(slot_descs, filepath); // --------------finish init context--------------- Status status = file_reader->init(ctx); @@ -2479,8 +2143,8 @@ TEST_F(FileReaderTest, TestHudiMORTwoNestedLevelArray) { EXPECT_EQ(file_reader->row_group_size(), 1); auto chunk = std::make_shared(); - chunk->append_column(ColumnHelper::create_column(type_string, true), chunk->num_columns()); - chunk->append_column(ColumnHelper::create_column(type_array, true), chunk->num_columns()); + chunk->append_column(ColumnHelper::create_column(TYPE_VARCHAR_DESC, true), chunk->num_columns()); + chunk->append_column(ColumnHelper::create_column(TYPE_INT_ARRAY_ARRAY_DESC, true), chunk->num_columns()); status = file_reader->get_next(&chunk); ASSERT_TRUE(status.ok()); @@ -2508,36 +2172,19 @@ TEST_F(FileReaderTest, TestLateMaterializationAboutRequiredComplexType) { // } // } const std::string filepath = "./be/test/formats/parquet/test_data/map_struct_subfield_required.parquet"; - auto file = _create_file(filepath); - auto file_reader = std::make_shared(config::vector_chunk_size, file.get(), - std::filesystem::file_size(filepath), _mock_datacache_options()); + auto file_reader = _create_file_reader(filepath); // --------------init context--------------- - auto ctx = _create_scan_context(); - - TypeDescriptor type_a = TYPE_INT_DESC; - - TypeDescriptor type_b = TypeDescriptor::from_logical_type(LogicalType::TYPE_STRUCT); - type_b.children.emplace_back(TYPE_INT_DESC); - type_b.field_names.emplace_back("b1"); - type_b.children.emplace_back(TYPE_INT_DESC); - type_b.field_names.emplace_back("b2"); - - TypeDescriptor type_c = TypeDescriptor::from_logical_type(LogicalType::TYPE_MAP); - type_c.children.emplace_back(TYPE_INT_DESC); - type_c.children.emplace_back(TYPE_INT_DESC); + TypeDescriptor type_b = TypeDescriptor::create_struct_type({"b1", "b2"}, {TYPE_INT_DESC, TYPE_INT_DESC}); Utils::SlotDesc slot_descs[] = { - {"a", type_a}, + {"a", TYPE_INT_DESC}, {"b", type_b}, - {"c", type_c}, + {"c", TYPE_INT_INT_MAP_DESC}, {""}, - }; - - TupleDescriptor* tuple_desc = Utils::create_tuple_descriptor(_runtime_state, &_pool, slot_descs); - Utils::make_column_info_vector(tuple_desc, &ctx->materialized_columns); - ctx->slot_descs = tuple_desc->slots(); - ctx->scan_range = (_create_scan_range(filepath)); + }; + + auto ctx = _create_scan_context(slot_descs, filepath); _create_int_conjunct_ctxs(TExprOpcode::EQ, 0, 8000, &ctx->conjunct_ctxs_by_slot[0]); // --------------finish init context--------------- @@ -2548,9 +2195,9 @@ TEST_F(FileReaderTest, TestLateMaterializationAboutRequiredComplexType) { EXPECT_EQ(file_reader->row_group_size(), 3); auto chunk = std::make_shared(); - chunk->append_column(ColumnHelper::create_column(type_a, true), chunk->num_columns()); + chunk->append_column(ColumnHelper::create_column(TYPE_INT_DESC, true), chunk->num_columns()); chunk->append_column(ColumnHelper::create_column(type_b, true), chunk->num_columns()); - chunk->append_column(ColumnHelper::create_column(type_c, true), chunk->num_columns()); + chunk->append_column(ColumnHelper::create_column(TYPE_INT_INT_MAP_DESC, true), chunk->num_columns()); ASSERT_EQ(1, file_reader->_row_group_readers[0]->_left_conjunct_ctxs.size()); const auto& conjunct_ctxs_by_slot = file_reader->_row_group_readers[0]->_param.conjunct_ctxs_by_slot; @@ -2590,36 +2237,18 @@ TEST_F(FileReaderTest, TestLateMaterializationAboutOptionalComplexType) { // } // } const std::string filepath = "./be/test/formats/parquet/test_data/map_struct_subfield_optional.parquet"; - auto file = _create_file(filepath); - auto file_reader = std::make_shared(config::vector_chunk_size, file.get(), - std::filesystem::file_size(filepath), _mock_datacache_options()); + auto file_reader = _create_file_reader(filepath); // --------------init context--------------- - auto ctx = _create_scan_context(); - - TypeDescriptor type_a = TYPE_INT_DESC; - - TypeDescriptor type_b = TypeDescriptor::from_logical_type(LogicalType::TYPE_STRUCT); - type_b.children.emplace_back(TYPE_INT_DESC); - type_b.field_names.emplace_back("b1"); - type_b.children.emplace_back(TYPE_INT_DESC); - type_b.field_names.emplace_back("b2"); - - TypeDescriptor type_c = TypeDescriptor::from_logical_type(LogicalType::TYPE_MAP); - type_c.children.emplace_back(TYPE_INT_DESC); - type_c.children.emplace_back(TYPE_INT_DESC); + TypeDescriptor type_b = TypeDescriptor::create_struct_type({"b1", "b2"}, {TYPE_INT_DESC, TYPE_INT_DESC}); Utils::SlotDesc slot_descs[] = { - {"a", type_a}, + {"a", TYPE_INT_DESC}, {"b", type_b}, - {"c", type_c}, + {"c", TYPE_INT_INT_MAP_DESC}, {""}, }; - - TupleDescriptor* tuple_desc = Utils::create_tuple_descriptor(_runtime_state, &_pool, slot_descs); - Utils::make_column_info_vector(tuple_desc, &ctx->materialized_columns); - ctx->slot_descs = tuple_desc->slots(); - ctx->scan_range = (_create_scan_range(filepath)); + auto ctx = _create_scan_context(slot_descs, filepath); _create_int_conjunct_ctxs(TExprOpcode::EQ, 0, 8000, &ctx->conjunct_ctxs_by_slot[0]); // --------------finish init context--------------- @@ -2630,9 +2259,9 @@ TEST_F(FileReaderTest, TestLateMaterializationAboutOptionalComplexType) { EXPECT_EQ(file_reader->row_group_size(), 3); auto chunk = std::make_shared(); - chunk->append_column(ColumnHelper::create_column(type_a, true), chunk->num_columns()); + chunk->append_column(ColumnHelper::create_column(TYPE_INT_DESC, true), chunk->num_columns()); chunk->append_column(ColumnHelper::create_column(type_b, true), chunk->num_columns()); - chunk->append_column(ColumnHelper::create_column(type_c, true), chunk->num_columns()); + chunk->append_column(ColumnHelper::create_column(TYPE_INT_INT_MAP_DESC, true), chunk->num_columns()); ASSERT_EQ(1, file_reader->_row_group_readers[0]->_left_conjunct_ctxs.size()); const auto& conjunct_ctxs_by_slot = file_reader->_row_group_readers[0]->_param.conjunct_ctxs_by_slot; @@ -2654,13 +2283,9 @@ TEST_F(FileReaderTest, TestLateMaterializationAboutOptionalComplexType) { TEST_F(FileReaderTest, CheckDictOutofBouds) { const std::string filepath = "./be/test/exec/test_data/parquet_scanner/type_mismatch_decode_min_max.parquet"; - auto file = _create_file(filepath); - auto file_reader = std::make_shared(config::vector_chunk_size, file.get(), - std::filesystem::file_size(filepath), _mock_datacache_options()); + auto file_reader = _create_file_reader(filepath); // --------------init context--------------- - auto ctx = _create_scan_context(); - TypeDescriptor type_vin = TYPE_VARCHAR_DESC; TypeDescriptor type_log_domain = TYPE_VARCHAR_DESC; TypeDescriptor type_file_name = TYPE_VARCHAR_DESC; @@ -2700,10 +2325,7 @@ TEST_F(FileReaderTest, CheckDictOutofBouds) { {""}, }; - TupleDescriptor* tuple_desc = Utils::create_tuple_descriptor(_runtime_state, &_pool, slot_descs); - Utils::make_column_info_vector(tuple_desc, &ctx->materialized_columns); - ctx->slot_descs = tuple_desc->slots(); - ctx->scan_range = (_create_scan_range(filepath)); + auto ctx = _create_scan_context(slot_descs, filepath); // --------------finish init context--------------- @@ -2744,23 +2366,15 @@ TEST_F(FileReaderTest, CheckDictOutofBouds) { TEST_F(FileReaderTest, CheckLargeParquetHeader) { const std::string filepath = "./be/test/formats/parquet/test_data/large_page_header.parquet"; - auto file = _create_file(filepath); - auto file_reader = std::make_shared(config::vector_chunk_size, file.get(), - std::filesystem::file_size(filepath), _mock_datacache_options()); + auto file_reader = _create_file_reader(filepath); // --------------init context--------------- - auto ctx = _create_scan_context(); - Utils::SlotDesc slot_descs[] = { {"myString", TYPE_VARCHAR_DESC}, {"myInteger", TYPE_INT_DESC}, {""}, }; - - TupleDescriptor* tuple_desc = Utils::create_tuple_descriptor(_runtime_state, &_pool, slot_descs); - Utils::make_column_info_vector(tuple_desc, &ctx->materialized_columns); - ctx->slot_descs = tuple_desc->slots(); - ctx->scan_range = (_create_scan_range(filepath)); + auto ctx = _create_scan_context(slot_descs, filepath); // --------------finish init context--------------- @@ -2797,13 +2411,9 @@ TEST_F(FileReaderTest, TestMinMaxForIcebergTable) { // } const std::string filepath = "./be/test/formats/parquet/test_data/iceberg_schema_evolution/iceberg_string_map_string.parquet"; - auto file = _create_file(filepath); - auto file_reader = std::make_shared(config::vector_chunk_size, file.get(), - std::filesystem::file_size(filepath), _mock_datacache_options()); + auto file_reader = _create_file_reader(filepath); // --------------init context--------------- - auto ctx = _create_scan_context(); - TIcebergSchema schema = TIcebergSchema{}; TIcebergSchemaField field_data{}; @@ -2831,35 +2441,22 @@ TEST_F(FileReaderTest, TestMinMaxForIcebergTable) { std::vector fields{field_data, field_struct, field_int}; schema.__set_fields(fields); - ctx->iceberg_schema = &schema; - - TypeDescriptor type_data = TypeDescriptor::from_logical_type(LogicalType::TYPE_VARCHAR); - - TypeDescriptor type_struct = TypeDescriptor::from_logical_type(LogicalType::TYPE_STRUCT); - type_struct.children.emplace_back(TypeDescriptor::from_logical_type(LogicalType::TYPE_VARCHAR)); - type_struct.field_names.emplace_back("x"); - type_struct.children.emplace_back(TypeDescriptor::from_logical_type(LogicalType::TYPE_VARCHAR)); - type_struct.field_names.emplace_back("y"); - TypeDescriptor type_int = TypeDescriptor::from_logical_type(LogicalType::TYPE_INT); + TypeDescriptor type_struct = TypeDescriptor::create_struct_type({"x", "y"}, {TYPE_VARCHAR_DESC, TYPE_VARCHAR_DESC}); Utils::SlotDesc slot_descs[] = { - {"data", type_data, 0}, + {"data", TYPE_VARCHAR_DESC, 0}, {"struct", type_struct, 1}, - {"int", type_int, 2}, + {"int", TYPE_INT_DESC, 2}, {""}, }; - - TupleDescriptor* tuple_desc = Utils::create_tuple_descriptor(_runtime_state, &_pool, slot_descs); - Utils::make_column_info_vector(tuple_desc, &ctx->materialized_columns); - ctx->slot_descs = tuple_desc->slots(); - ctx->scan_range = (_create_scan_range(filepath)); - Utils::SlotDesc min_max_slots[] = { - {"int", TypeDescriptor::from_logical_type(LogicalType::TYPE_INT), 2}, + {"int", TYPE_INT_DESC, 2}, {""}, }; - ctx->min_max_tuple_desc = Utils::create_tuple_descriptor(_runtime_state, &_pool, min_max_slots); + auto ctx = _create_scan_context(slot_descs, min_max_slots, filepath); + ctx->iceberg_schema = &schema; + std::vector t_conjuncts; ParquetUTBase::append_int_conjunct(TExprOpcode::GE, 2, 5, &t_conjuncts); ParquetUTBase::append_int_conjunct(TExprOpcode::LE, 2, 5, &t_conjuncts); @@ -2870,12 +2467,12 @@ TEST_F(FileReaderTest, TestMinMaxForIcebergTable) { Status status = file_reader->init(ctx); ASSERT_TRUE(status.ok()); - EXPECT_EQ(file_reader->_row_group_readers.size(), 1); + ASSERT_EQ(file_reader->row_group_size(), 1); auto chunk = std::make_shared(); - chunk->append_column(ColumnHelper::create_column(type_data, true), chunk->num_columns()); + chunk->append_column(ColumnHelper::create_column(TYPE_VARCHAR_DESC, true), chunk->num_columns()); chunk->append_column(ColumnHelper::create_column(type_struct, true), chunk->num_columns()); - chunk->append_column(ColumnHelper::create_column(type_int, true), chunk->num_columns()); + chunk->append_column(ColumnHelper::create_column(TYPE_INT_DESC, true), chunk->num_columns()); size_t total_row_nums = 0; while (!status.is_end_of_file()) { @@ -2895,18 +2492,11 @@ TEST_F(FileReaderTest, TestRandomReadWith2PageSize) { std::random_device rd; std::mt19937 rng(rd()); - TypeDescriptor type_array(LogicalType::TYPE_ARRAY); - type_array.children.emplace_back(TypeDescriptor::from_logical_type(LogicalType::TYPE_INT)); - auto chunk = std::make_shared(); - chunk->append_column(ColumnHelper::create_column(TypeDescriptor::from_logical_type(LogicalType::TYPE_INT), true), - chunk->num_columns()); - chunk->append_column(ColumnHelper::create_column(TypeDescriptor::from_logical_type(LogicalType::TYPE_INT), true), - chunk->num_columns()); - chunk->append_column( - ColumnHelper::create_column(TypeDescriptor::from_logical_type(LogicalType::TYPE_VARCHAR), true), - chunk->num_columns()); - chunk->append_column(ColumnHelper::create_column(type_array, true), chunk->num_columns()); + chunk->append_column(ColumnHelper::create_column(TYPE_INT_DESC, true), chunk->num_columns()); + chunk->append_column(ColumnHelper::create_column(TYPE_INT_DESC, true), chunk->num_columns()); + chunk->append_column(ColumnHelper::create_column(TYPE_VARCHAR_DESC, true), chunk->num_columns()); + chunk->append_column(ColumnHelper::create_column(TYPE_INT_ARRAY_DESC, true), chunk->num_columns()); // c0 = np.arange(1, 20001) // c1 = np.arange(20000, 0, -1) @@ -2938,6 +2528,9 @@ TEST_F(FileReaderTest, TestRandomReadWith2PageSize) { // }) const std::string big_page_file = "./be/test/formats/parquet/test_data/read_range_big_page_test.parquet"; + Utils::SlotDesc slot_descs[] = { + {"c0", TYPE_INT_DESC}, {"c1", TYPE_INT_DESC}, {"c2", TYPE_VARCHAR_DESC}, {"c3", TYPE_INT_ARRAY_DESC}, {""}}; + // for small page 1000 values / page // for big page 10000 values / page for (size_t index = 0; index < 2; index++) { @@ -2966,8 +2559,7 @@ TEST_F(FileReaderTest, TestRandomReadWith2PageSize) { int32_t num = index == 0 ? dist_small(rng) : dist_big(rng); in_oprands.emplace(num); } - auto ctx = _create_file_random_read_context(file_path); - auto file = _create_file(file_path); + auto ctx = _create_file_random_read_context(file_path, slot_descs); ctx->conjunct_ctxs_by_slot[0].clear(); std::vector t_conjuncts; ParquetUTBase::create_in_predicate_int_conjunct_ctxs(TExprOpcode::FILTER_IN, 0, in_oprands, @@ -2975,8 +2567,7 @@ TEST_F(FileReaderTest, TestRandomReadWith2PageSize) { ParquetUTBase::create_conjunct_ctxs(&_pool, _runtime_state, &t_conjuncts, &ctx->conjunct_ctxs_by_slot[0]); - auto file_reader = std::make_shared(config::vector_chunk_size, file.get(), - std::filesystem::file_size(file_path)); + auto file_reader = _create_file_reader(file_path); Status status = file_reader->init(ctx); ASSERT_TRUE(status.ok()); @@ -3058,34 +2649,19 @@ TEST_F(FileReaderTest, TestStructSubfieldDictFilter) { "./be/test/formats/parquet/test_data/test_parquet_struct_in_struct.parquet"; auto ctx = _create_file_struct_in_struct_read_context(struct_in_struct_file_path); - auto file = _create_file(struct_in_struct_file_path); - - TypeDescriptor type_struct = TypeDescriptor::from_logical_type(LogicalType::TYPE_STRUCT); - type_struct.children.emplace_back(TypeDescriptor::from_logical_type(LogicalType::TYPE_VARCHAR)); - type_struct.field_names.emplace_back("c0"); - - type_struct.children.emplace_back(TypeDescriptor::from_logical_type(LogicalType::TYPE_VARCHAR)); - type_struct.field_names.emplace_back("c1"); - - TypeDescriptor type_struct_in_struct = TypeDescriptor::from_logical_type(LogicalType::TYPE_STRUCT); - type_struct_in_struct.children.emplace_back(TypeDescriptor::from_logical_type(LogicalType::TYPE_VARCHAR)); - type_struct_in_struct.field_names.emplace_back("c0"); - - type_struct_in_struct.children.emplace_back(type_struct); - type_struct_in_struct.field_names.emplace_back("c_struct"); + TypeDescriptor type_struct = + TypeDescriptor::create_struct_type({"c0", "c1"}, {TYPE_VARCHAR_DESC, TYPE_VARCHAR_DESC}); + TypeDescriptor type_struct_in_struct = + TypeDescriptor::create_struct_type({"c0", "c_struct"}, {TYPE_VARCHAR_DESC, type_struct}); std::vector subfield_path({"c_struct", "c0"}); - _create_struct_subfield_predicate_conjunct_ctxs(TExprOpcode::EQ, 3, type_struct_in_struct, subfield_path, "55", &ctx->conjunct_ctxs_by_slot[3]); - auto file_reader = std::make_shared(config::vector_chunk_size, file.get(), - std::filesystem::file_size(struct_in_struct_file_path)); + auto file_reader = _create_file_reader(struct_in_struct_file_path); auto chunk = std::make_shared(); - chunk->append_column(ColumnHelper::create_column(TypeDescriptor::from_logical_type(LogicalType::TYPE_INT), true), - chunk->num_columns()); - chunk->append_column(ColumnHelper::create_column(TypeDescriptor::from_logical_type(LogicalType::TYPE_INT), true), - chunk->num_columns()); + chunk->append_column(ColumnHelper::create_column(TYPE_INT_DESC, true), chunk->num_columns()); + chunk->append_column(ColumnHelper::create_column(TYPE_INT_DESC, true), chunk->num_columns()); chunk->append_column(ColumnHelper::create_column(type_struct, true), chunk->num_columns()); chunk->append_column(ColumnHelper::create_column(type_struct_in_struct, true), chunk->num_columns()); @@ -3205,27 +2781,22 @@ TEST_F(FileReaderTest, TestReadRoundByRound) { // "c3": df.apply(lambda x: pd.NA if x["c0"] % 10 == 0 else [x["c0"] % 1000, pd.NA, x["c1"] % 1000], axis = 1) // }) const std::string file_path = "./be/test/formats/parquet/test_data/read_range_big_page_test.parquet"; - TypeDescriptor type_array(LogicalType::TYPE_ARRAY); - type_array.children.emplace_back(TypeDescriptor::from_logical_type(LogicalType::TYPE_INT)); auto chunk = std::make_shared(); - chunk->append_column(ColumnHelper::create_column(TypeDescriptor::from_logical_type(LogicalType::TYPE_INT), true), - chunk->num_columns()); - chunk->append_column(ColumnHelper::create_column(TypeDescriptor::from_logical_type(LogicalType::TYPE_INT), true), - chunk->num_columns()); - chunk->append_column( - ColumnHelper::create_column(TypeDescriptor::from_logical_type(LogicalType::TYPE_VARCHAR), true), - chunk->num_columns()); - chunk->append_column(ColumnHelper::create_column(type_array, true), chunk->num_columns()); + chunk->append_column(ColumnHelper::create_column(TYPE_INT_DESC, true), chunk->num_columns()); + chunk->append_column(ColumnHelper::create_column(TYPE_INT_DESC, true), chunk->num_columns()); + chunk->append_column(ColumnHelper::create_column(TYPE_VARCHAR_DESC, true), chunk->num_columns()); + chunk->append_column(ColumnHelper::create_column(TYPE_INT_ARRAY_DESC, true), chunk->num_columns()); - auto ctx = _create_file_random_read_context(file_path); - auto file = _create_file(file_path); + Utils::SlotDesc slot_descs[] = { + {"c0", TYPE_INT_DESC}, {"c1", TYPE_INT_DESC}, {"c2", TYPE_VARCHAR_DESC}, {"c3", TYPE_INT_ARRAY_DESC}, {""}}; + + auto ctx = _create_file_random_read_context(file_path, slot_descs); // c0 >= 100 _create_int_conjunct_ctxs(TExprOpcode::GE, 0, 100, &ctx->conjunct_ctxs_by_slot[0]); // c1 <= 100 _create_int_conjunct_ctxs(TExprOpcode::LE, 1, 100, &ctx->conjunct_ctxs_by_slot[1]); - auto file_reader = std::make_shared(config::vector_chunk_size, file.get(), - std::filesystem::file_size(file_path), _mock_datacache_options()); + auto file_reader = _create_file_reader(file_path); Status status = file_reader->init(ctx); ASSERT_TRUE(status.ok()); size_t total_row_nums = 0; @@ -3244,22 +2815,13 @@ TEST_F(FileReaderTest, TestStructSubfieldNoDecodeNotOutput) { "./be/test/formats/parquet/test_data/test_parquet_struct_in_struct.parquet"; auto ctx = _create_file_struct_in_struct_prune_and_no_output_read_context(struct_in_struct_file_path); - auto file = _create_file(struct_in_struct_file_path); - - TypeDescriptor type_struct = TypeDescriptor::from_logical_type(LogicalType::TYPE_STRUCT); - type_struct.children.emplace_back(TypeDescriptor::from_logical_type(LogicalType::TYPE_VARCHAR)); - type_struct.field_names.emplace_back("c0"); - - TypeDescriptor type_struct_in_struct = TypeDescriptor::from_logical_type(LogicalType::TYPE_STRUCT); - type_struct_in_struct.children.emplace_back(type_struct); - type_struct_in_struct.field_names.emplace_back("c_struct"); + TypeDescriptor type_struct = TypeDescriptor::create_struct_type({"c0"}, {TYPE_VARCHAR_DESC}); + TypeDescriptor type_struct_in_struct = TypeDescriptor::create_struct_type({"c_struct"}, {type_struct}); std::vector subfield_path({"c_struct", "c0"}); - _create_struct_subfield_predicate_conjunct_ctxs(TExprOpcode::EQ, 1, type_struct_in_struct, subfield_path, "55", &ctx->conjunct_ctxs_by_slot[1]); - auto file_reader = std::make_shared(config::vector_chunk_size, file.get(), - std::filesystem::file_size(struct_in_struct_file_path)); + auto file_reader = _create_file_reader(struct_in_struct_file_path); auto chunk = std::make_shared(); chunk->append_column(ColumnHelper::create_column(TypeDescriptor::from_logical_type(LogicalType::TYPE_INT), true), @@ -3329,36 +2891,26 @@ TEST_F(FileReaderTest, TestTime) { // format: // id: INT, b: TIME const std::string filepath = "./be/test/formats/parquet/test_data/test_parquet_time_type.parquet"; - auto file = _create_file(filepath); - auto file_reader = std::make_shared(config::vector_chunk_size, file.get(), - std::filesystem::file_size(filepath), _mock_datacache_options()); - // --------------init context--------------- - auto ctx = _create_scan_context(); + auto file_reader = _create_file_reader(filepath); + // --------------init context--------------- Utils::SlotDesc slot_descs[] = { {"c1", TYPE_INT_DESC}, {"c2", TYPE_TIME_DESC}, {""}, }; - - TupleDescriptor* tuple_desc = Utils::create_tuple_descriptor(_runtime_state, &_pool, slot_descs); - Utils::make_column_info_vector(tuple_desc, &ctx->materialized_columns); - ctx->slot_descs = tuple_desc->slots(); - ctx->scan_range = (_create_scan_range(_file_col_not_null_path)); + auto ctx = _create_scan_context(slot_descs, filepath); // --------------finish init context--------------- - Status status = file_reader->init(ctx); - ASSERT_TRUE(status.ok()); - + ASSERT_OK(file_reader->init(ctx)); EXPECT_EQ(file_reader->row_group_size(), 1); auto chunk = std::make_shared(); chunk->append_column(ColumnHelper::create_column(TYPE_INT_DESC, true), chunk->num_columns()); chunk->append_column(ColumnHelper::create_column(TYPE_TIME_DESC, true), chunk->num_columns()); - status = file_reader->get_next(&chunk); - ASSERT_TRUE(status.ok()); + ASSERT_OK(file_reader->get_next(&chunk)); chunk->check_or_die(); @@ -3371,6 +2923,7 @@ TEST_F(FileReaderTest, TestTime) { total_row_nums += chunk->num_rows(); { + Status status; while (!status.is_end_of_file()) { chunk->reset(); status = file_reader->get_next(&chunk); @@ -3383,30 +2936,15 @@ TEST_F(FileReaderTest, TestTime) { } TEST_F(FileReaderTest, TestReadNoMinMaxStatistics) { - auto file = _create_file(_file_no_min_max_stats_path); - auto file_reader = std::make_shared(config::vector_chunk_size, file.get(), - std::filesystem::file_size(_file_no_min_max_stats_path), - _mock_datacache_options()); + auto file_reader = _create_file_reader(_file_no_min_max_stats_path); // --------------init context--------------- - auto ctx = _create_scan_context(); - Utils::SlotDesc slot_descs[] = { {"attr_value", TYPE_VARCHAR_DESC}, {""}, }; + auto ctx = _create_scan_context(slot_descs, slot_descs, _file_no_min_max_stats_path); - TupleDescriptor* tuple_desc = Utils::create_tuple_descriptor(_runtime_state, &_pool, slot_descs); - Utils::make_column_info_vector(tuple_desc, &ctx->materialized_columns); - ctx->slot_descs = tuple_desc->slots(); - ctx->scan_range = (_create_scan_range(_file_no_min_max_stats_path)); - - // create min max conjuncts - Utils::SlotDesc min_max_slots[] = { - {"attr_value", TypeDescriptor::from_logical_type(LogicalType::TYPE_VARCHAR)}, - {""}, - }; - ctx->min_max_tuple_desc = Utils::create_tuple_descriptor(_runtime_state, &_pool, min_max_slots); std::vector t_conjuncts; ParquetUTBase::append_string_conjunct(TExprOpcode::GE, 0, "2", &t_conjuncts); ParquetUTBase::append_string_conjunct(TExprOpcode::LE, 0, "2", &t_conjuncts); @@ -3416,16 +2954,14 @@ TEST_F(FileReaderTest, TestReadNoMinMaxStatistics) { _create_string_conjunct_ctxs(TExprOpcode::EQ, 0, "2", &ctx->conjunct_ctxs_by_slot[0]); // --------------finish init context--------------- - Status status = file_reader->init(ctx); - ASSERT_TRUE(status.ok()); + ASSERT_OK(file_reader->init(ctx)); EXPECT_EQ(file_reader->row_group_size(), 1); auto chunk = std::make_shared(); chunk->append_column(ColumnHelper::create_column(TYPE_VARCHAR_DESC, true), chunk->num_columns()); - status = file_reader->get_next(&chunk); - ASSERT_TRUE(status.ok()); + ASSERT_OK(file_reader->get_next(&chunk)); chunk->check_or_die(); @@ -3436,17 +2972,14 @@ TEST_F(FileReaderTest, TestReadNoMinMaxStatistics) { } TEST_F(FileReaderTest, TestIsNotNullStatistics) { - auto file = _create_file(_file1_path); - auto file_reader = std::make_shared(config::vector_chunk_size, file.get(), - std::filesystem::file_size(_file1_path), _mock_datacache_options()); - // init + auto file_reader = _create_file_reader(_file1_path); + auto* ctx = _create_file1_base_context(); std::vector t_conjuncts; ParquetUTBase::is_null_pred(0, false, &t_conjuncts); ParquetUTBase::create_conjunct_ctxs(&_pool, _runtime_state, &t_conjuncts, &ctx->conjunct_ctxs_by_slot[0]); - Status status = file_reader->init(ctx); - ASSERT_TRUE(status.ok()); + ASSERT_OK(file_reader->init(ctx)); EXPECT_EQ(file_reader->row_group_size(), 0); } @@ -3456,18 +2989,15 @@ TEST_F(FileReaderTest, TestIsNullStatistics) { auto file_reader = std::make_shared(config::vector_chunk_size, file.get(), std::filesystem::file_size(small_page_file)); - auto ctx = _create_file_random_read_context(small_page_file); + Utils::SlotDesc slot_descs[] = { + {"c0", TYPE_INT_DESC}, {"c1", TYPE_INT_DESC}, {"c2", TYPE_VARCHAR_DESC}, {"c3", TYPE_INT_ARRAY_DESC}, {""}, + }; + auto ctx = _create_file_random_read_context(small_page_file, slot_descs); std::vector t_conjuncts; ParquetUTBase::is_null_pred(0, true, &t_conjuncts); ParquetUTBase::create_conjunct_ctxs(&_pool, _runtime_state, &t_conjuncts, &ctx->conjunct_ctxs_by_slot[0]); // setup OlapScanConjunctsManager - TypeDescriptor type_array(LogicalType::TYPE_ARRAY); - type_array.children.emplace_back(TYPE_INT_DESC); - // tuple desc - Utils::SlotDesc slot_descs[] = { - {"c0", TYPE_INT_DESC}, {"c1", TYPE_INT_DESC}, {"c2", TYPE_VARCHAR_DESC}, {"c3", type_array}, {""}, - }; TupleDescriptor* tuple_desc = Utils::create_tuple_descriptor(_runtime_state, &_pool, slot_descs); ParquetUTBase::setup_conjuncts_manager(ctx->conjunct_ctxs_by_slot[0], tuple_desc, _runtime_state, ctx); @@ -3478,11 +3008,12 @@ TEST_F(FileReaderTest, TestIsNullStatistics) { TEST_F(FileReaderTest, TestMapKeyIsStruct) { const std::string filename = "./be/test/formats/parquet/test_data/map_key_is_struct.parquet"; - auto file = _create_file(filename); - auto file_reader = - std::make_shared(config::vector_chunk_size, file.get(), std::filesystem::file_size(filename)); - auto ctx = _create_file_random_read_context(filename); + auto file_reader = _create_file_reader(filename); + Utils::SlotDesc slot_descs[] = { + {"c0", TYPE_INT_DESC}, {"c1", TYPE_INT_DESC}, {"c2", TYPE_VARCHAR_DESC}, {"c3", TYPE_INT_ARRAY_DESC}, {""}, + }; + auto ctx = _create_file_random_read_context(filename, slot_descs); Status status = file_reader->init(ctx); ASSERT_FALSE(status.ok()); ASSERT_EQ("Map keys must be primitive type.", status.message()); @@ -3491,11 +3022,12 @@ TEST_F(FileReaderTest, TestMapKeyIsStruct) { TEST_F(FileReaderTest, TestInFilterStatitics) { // there are 4 row groups const std::string multi_rg_file = "./be/test/formats/parquet/test_data/page_index_big_page.parquet"; - auto file = _create_file(multi_rg_file); - auto file_reader = std::make_shared(config::vector_chunk_size, file.get(), - std::filesystem::file_size(multi_rg_file)); - auto ctx = _create_file_random_read_context(multi_rg_file); + auto file_reader = _create_file_reader(multi_rg_file); + Utils::SlotDesc slot_descs[] = { + {"c0", TYPE_INT_DESC}, {"c1", TYPE_INT_DESC}, {"c2", TYPE_VARCHAR_DESC}, {"c3", TYPE_INT_ARRAY_DESC}, {""}, + }; + auto ctx = _create_file_random_read_context(multi_rg_file, slot_descs); // min value and max value in this file, so it will be in the first and last row group std::set in_oprands{1, 100000}; std::vector t_conjuncts; @@ -3503,17 +3035,10 @@ TEST_F(FileReaderTest, TestInFilterStatitics) { ParquetUTBase::create_conjunct_ctxs(&_pool, _runtime_state, &t_conjuncts, &ctx->conjunct_ctxs_by_slot[0]); // setup OlapScanConjunctsManager - TypeDescriptor type_array(LogicalType::TYPE_ARRAY); - type_array.children.emplace_back(TYPE_INT_DESC); - // tuple desc - Utils::SlotDesc slot_descs[] = { - {"c0", TYPE_INT_DESC}, {"c1", TYPE_INT_DESC}, {"c2", TYPE_VARCHAR_DESC}, {"c3", type_array}, {""}, - }; TupleDescriptor* tuple_desc = Utils::create_tuple_descriptor(_runtime_state, &_pool, slot_descs); ParquetUTBase::setup_conjuncts_manager(ctx->conjunct_ctxs_by_slot[0], tuple_desc, _runtime_state, ctx); - Status status = file_reader->init(ctx); - ASSERT_TRUE(status.ok()); + ASSERT_OK(file_reader->init(ctx)); EXPECT_EQ(file_reader->row_group_size(), 2); } @@ -3523,16 +3048,13 @@ TEST_F(FileReaderTest, filter_row_group_with_rf_1) { config::parquet_advance_zonemap_filter = false; DeferOp defer([&]() { config::parquet_advance_zonemap_filter = true; }); SlotId slot_id = 1; - auto file = _create_file(_filter_row_group_path_1); - uint64_t file_size = std::filesystem::file_size(_filter_row_group_path_1); - auto file_reader = std::make_shared(_chunk_size, file.get(), file_size, _mock_datacache_options()); + + auto file_reader = _create_file_reader(_filter_row_group_path_1); auto ret = _create_context_for_filter_row_group_1(slot_id, 5, 6, false); ASSERT_TRUE(ret.ok()); - Status st = file_reader->init(ret.value()); - ASSERT_TRUE(st.ok()); - + ASSERT_OK(file_reader->init(ret.value())); ASSERT_EQ(file_reader->row_group_size(), 1); } @@ -3542,16 +3064,13 @@ TEST_F(FileReaderTest, filter_row_group_with_rf_2) { config::parquet_advance_zonemap_filter = false; DeferOp defer([&]() { config::parquet_advance_zonemap_filter = true; }); SlotId slot_id = 1; - auto file = _create_file(_filter_row_group_path_1); - uint64_t file_size = std::filesystem::file_size(_filter_row_group_path_1); - auto file_reader = std::make_shared(_chunk_size, file.get(), file_size, _mock_datacache_options()); + + auto file_reader = _create_file_reader(_filter_row_group_path_1); auto ret = _create_context_for_filter_row_group_1(slot_id, 2, 5, false); ASSERT_TRUE(ret.ok()); - Status st = file_reader->init(ret.value()); - ASSERT_TRUE(st.ok()); - + ASSERT_OK(file_reader->init(ret.value())); ASSERT_EQ(file_reader->row_group_size(), 2); } @@ -3561,16 +3080,13 @@ TEST_F(FileReaderTest, filter_row_group_with_rf_3) { config::parquet_advance_zonemap_filter = false; DeferOp defer([&]() { config::parquet_advance_zonemap_filter = true; }); SlotId slot_id = 1; - auto file = _create_file(_filter_row_group_path_1); - uint64_t file_size = std::filesystem::file_size(_filter_row_group_path_1); - auto file_reader = std::make_shared(_chunk_size, file.get(), file_size, _mock_datacache_options()); + + auto file_reader = _create_file_reader(_filter_row_group_path_1); auto ret = _create_context_for_filter_row_group_1(slot_id, 7, 10, false); ASSERT_TRUE(ret.ok()); - Status st = file_reader->init(ret.value()); - ASSERT_TRUE(st.ok()); - + ASSERT_OK(file_reader->init(ret.value())); ASSERT_EQ(file_reader->row_group_size(), 0); } @@ -3580,16 +3096,13 @@ TEST_F(FileReaderTest, filter_row_group_with_rf_4) { config::parquet_advance_zonemap_filter = false; DeferOp defer([&]() { config::parquet_advance_zonemap_filter = true; }); SlotId slot_id = 1; - auto file = _create_file(_filter_row_group_path_2); - uint64_t file_size = std::filesystem::file_size(_filter_row_group_path_2); - auto file_reader = std::make_shared(_chunk_size, file.get(), file_size, _mock_datacache_options()); + + auto file_reader = _create_file_reader(_filter_row_group_path_2); auto ret = _create_context_for_filter_row_group_1(slot_id, 5, 6, true); ASSERT_TRUE(ret.ok()); - Status st = file_reader->init(ret.value()); - ASSERT_TRUE(st.ok()); - + ASSERT_OK(file_reader->init(ret.value())); ASSERT_EQ(file_reader->row_group_size(), 2); } @@ -3600,16 +3113,13 @@ TEST_F(FileReaderTest, filter_row_group_with_rf_5) { config::parquet_advance_zonemap_filter = false; DeferOp defer([&]() { config::parquet_advance_zonemap_filter = true; }); SlotId slot_id = 3; - auto file = _create_file(_filter_row_group_path_2); - uint64_t file_size = std::filesystem::file_size(_filter_row_group_path_2); - auto file_reader = std::make_shared(_chunk_size, file.get(), file_size, _mock_datacache_options()); + + auto file_reader = _create_file_reader(_filter_row_group_path_2); auto ret = _create_context_for_filter_row_group_1(slot_id, 5, 6, true); ASSERT_TRUE(ret.ok()); - Status st = file_reader->init(ret.value()); - ASSERT_TRUE(st.ok()); - + ASSERT_OK(file_reader->init(ret.value())); ASSERT_EQ(file_reader->row_group_size(), 2); } @@ -3620,16 +3130,13 @@ TEST_F(FileReaderTest, filter_row_group_with_rf_6) { config::parquet_advance_zonemap_filter = false; DeferOp defer([&]() { config::parquet_advance_zonemap_filter = true; }); SlotId slot_id = 4; - auto file = _create_file(_filter_row_group_path_2); - uint64_t file_size = std::filesystem::file_size(_filter_row_group_path_2); - auto file_reader = std::make_shared(_chunk_size, file.get(), file_size, _mock_datacache_options()); + + auto file_reader = _create_file_reader(_filter_row_group_path_2); auto ret = _create_context_for_filter_row_group_1(slot_id, 5, 6, true); ASSERT_TRUE(ret.ok()); - Status st = file_reader->init(ret.value()); - ASSERT_TRUE(st.ok()); - + ASSERT_OK(file_reader->init(ret.value())); ASSERT_EQ(file_reader->row_group_size(), 0); } @@ -3640,16 +3147,13 @@ TEST_F(FileReaderTest, filter_row_group_with_rf_7) { config::parquet_advance_zonemap_filter = false; DeferOp defer([&]() { config::parquet_advance_zonemap_filter = true; }); SlotId slot_id = 5; - auto file = _create_file(_filter_row_group_path_2); - uint64_t file_size = std::filesystem::file_size(_filter_row_group_path_2); - auto file_reader = std::make_shared(_chunk_size, file.get(), file_size, _mock_datacache_options()); + + auto file_reader = _create_file_reader(_filter_row_group_path_2); auto ret = _create_context_for_filter_row_group_1(slot_id, 5, 6, true); ASSERT_TRUE(ret.ok()); - Status st = file_reader->init(ret.value()); - ASSERT_TRUE(st.ok()); - + ASSERT_OK(file_reader->init(ret.value())); ASSERT_EQ(file_reader->row_group_size(), 2); } @@ -3660,39 +3164,173 @@ TEST_F(FileReaderTest, filter_row_group_with_rf_8) { config::parquet_advance_zonemap_filter = false; DeferOp defer([&]() { config::parquet_advance_zonemap_filter = true; }); SlotId slot_id = 8; - auto file = _create_file(_filter_row_group_path_2); - uint64_t file_size = std::filesystem::file_size(_filter_row_group_path_2); - auto file_reader = std::make_shared(_chunk_size, file.get(), file_size, _mock_datacache_options()); + + auto file_reader = _create_file_reader(_filter_row_group_path_2); auto ret = _create_context_for_filter_row_group_1(slot_id, 5, 6, true); ASSERT_TRUE(ret.ok()); - Status st = file_reader->init(ret.value()); - ASSERT_TRUE(st.ok()); - + ASSERT_OK(file_reader->init(ret.value())); ASSERT_EQ(file_reader->row_group_size(), 2); } TEST_F(FileReaderTest, filter_page_index_with_rf_has_null) { config::parquet_advance_zonemap_filter = false; DeferOp defer([&]() { config::parquet_advance_zonemap_filter = true; }); - SlotId slot_id = 1; - auto file = _create_file(_filter_page_index_with_rf_has_null); - uint64_t file_size = std::filesystem::file_size(_filter_page_index_with_rf_has_null); - auto file_reader = std::make_shared(_chunk_size, file.get(), file_size, _mock_datacache_options()); + auto file_reader = _create_file_reader(_filter_page_index_with_rf_has_null); auto ret = _create_context_for_filter_page_index(slot_id, 92880, 92990, true); + + ASSERT_TRUE(ret.ok()); + ASSERT_OK(file_reader->init(ret.value())); + ASSERT_EQ(file_reader->row_group_size(), 1); + const auto& group_readers = file_reader->group_readers(); + ASSERT_EQ(group_readers[0]->get_range().to_string(), "([0,20000), [40000,40100))"); +} + +TEST_F(FileReaderTest, all_type_has_null_page_bool) { + config::parquet_advance_zonemap_filter = false; + DeferOp defer([&]() { config::parquet_advance_zonemap_filter = true; }); + SlotId slot_id = 0; + + auto file_reader = _create_file_reader(_has_null_page_file); + auto ret = _create_context_for_has_null_page_bool(slot_id); + + ASSERT_TRUE(ret.ok()); + ASSERT_OK(file_reader->init(ret.value())); + ASSERT_EQ(file_reader->row_group_size(), 1); + const auto& group_readers = file_reader->group_readers(); + ASSERT_EQ(group_readers[0]->get_range().to_string(), "([0,90000))"); +} + +TEST_F(FileReaderTest, all_type_has_null_page_smallint) { + config::parquet_advance_zonemap_filter = false; + DeferOp defer([&]() { config::parquet_advance_zonemap_filter = true; }); + SlotId slot_id = 0; + + auto file_reader = _create_file_reader(_has_null_page_file); + auto ret = _create_context_for_has_null_page_smallint(slot_id); + + ASSERT_TRUE(ret.ok()); + ASSERT_OK(file_reader->init(ret.value())); + ASSERT_EQ(file_reader->row_group_size(), 1); + const auto& group_readers = file_reader->group_readers(); + ASSERT_EQ(group_readers[0]->get_range().to_string(), "([40000,90000))"); +} + +TEST_F(FileReaderTest, all_type_has_null_page_int) { + config::parquet_advance_zonemap_filter = false; + DeferOp defer([&]() { config::parquet_advance_zonemap_filter = true; }); + SlotId slot_id = 0; + + auto file_reader = _create_file_reader(_has_null_page_file); + auto ret = _create_context_for_has_null_page_int32(slot_id); + + ASSERT_TRUE(ret.ok()); + ASSERT_OK(file_reader->init(ret.value())); + ASSERT_EQ(file_reader->row_group_size(), 1); + const auto& group_readers = file_reader->group_readers(); + ASSERT_EQ(group_readers[0]->get_range().to_string(), "([40000,90000))"); +} + +TEST_F(FileReaderTest, all_type_has_null_page_bigint) { + config::parquet_advance_zonemap_filter = false; + DeferOp defer([&]() { config::parquet_advance_zonemap_filter = true; }); + SlotId slot_id = 0; + + auto file_reader = _create_file_reader(_has_null_page_file); + auto ret = _create_context_for_has_null_page_int64(slot_id); + + ASSERT_TRUE(ret.ok()); + ASSERT_OK(file_reader->init(ret.value())); + ASSERT_EQ(file_reader->row_group_size(), 1); + const auto& group_readers = file_reader->group_readers(); + ASSERT_EQ(group_readers[0]->get_range().to_string(), "([40000,90000))"); +} + +TEST_F(FileReaderTest, all_type_has_null_page_datetime) { + config::parquet_advance_zonemap_filter = false; + DeferOp defer([&]() { config::parquet_advance_zonemap_filter = true; }); + SlotId slot_id = 0; + + auto file_reader = _create_file_reader(_has_null_page_file); + auto ret = _create_context_for_has_null_page_datetime(slot_id); + ASSERT_TRUE(ret.ok()); + ASSERT_OK(file_reader->init(ret.value())); + ASSERT_EQ(file_reader->row_group_size(), 1); + const auto& group_readers = file_reader->group_readers(); + ASSERT_EQ(group_readers[0]->get_range().to_string(), "([40000,90000))"); +} + +TEST_F(FileReaderTest, all_type_has_null_page_string) { + config::parquet_advance_zonemap_filter = false; + DeferOp defer([&]() { config::parquet_advance_zonemap_filter = true; }); + SlotId slot_id = 0; - Status st = file_reader->init(ret.value()); - ASSERT_TRUE(st.ok()); + auto file_reader = _create_file_reader(_has_null_page_file); + auto ret = _create_context_for_has_null_page_string(slot_id); + ASSERT_TRUE(ret.ok()); + ASSERT_OK(file_reader->init(ret.value())); ASSERT_EQ(file_reader->row_group_size(), 1); + const auto& group_readers = file_reader->group_readers(); + ASSERT_EQ(group_readers[0]->get_range().to_string(), "([40000,90000))"); +} +TEST_F(FileReaderTest, all_type_has_null_page_decimal) { + config::parquet_advance_zonemap_filter = false; + DeferOp defer([&]() { config::parquet_advance_zonemap_filter = true; }); + SlotId slot_id = 0; + + auto file_reader = _create_file_reader(_has_null_page_file); + auto ret = _create_context_for_has_null_page_decimal(slot_id); + + ASSERT_TRUE(ret.ok()); + ASSERT_OK(file_reader->init(ret.value())); + ASSERT_EQ(file_reader->row_group_size(), 1); const auto& group_readers = file_reader->group_readers(); - auto range = group_readers[0]->get_range(); - ASSERT_EQ(range.to_string(), "([0,20000), [40000,40100))"); + ASSERT_EQ(group_readers[0]->get_range().to_string(), "([40000,90000))"); +} + +TEST_F(FileReaderTest, all_null_group_in_filter) { + config::parquet_advance_zonemap_filter = false; + DeferOp defer([&]() { config::parquet_advance_zonemap_filter = true; }); + SlotId slot_id = 1; + + auto file_reader = _create_file_reader(_all_null_parquet_file); + auto ret = _create_context_for_in_filter(slot_id); + + ASSERT_TRUE(ret.ok()); + ASSERT_OK(file_reader->init(ret.value())); + ASSERT_EQ(file_reader->row_group_size(), 0); +} + +TEST_F(FileReaderTest, in_filter_filter_one_group) { + config::parquet_advance_zonemap_filter = false; + DeferOp defer([&]() { config::parquet_advance_zonemap_filter = true; }); + SlotId slot_id = 1; + + auto file_reader = _create_file_reader(_filter_row_group_path_1); + auto ret = _create_context_for_in_filter_normal(slot_id); + + ASSERT_TRUE(ret.ok()); + ASSERT_OK(file_reader->init(ret.value())); + ASSERT_EQ(file_reader->row_group_size(), 1); +} + +TEST_F(FileReaderTest, min_max_filter_all_null_group) { + config::parquet_advance_zonemap_filter = false; + DeferOp defer([&]() { config::parquet_advance_zonemap_filter = true; }); + SlotId slot_id = 0; + + auto file_reader = _create_file_reader(_all_null_parquet_file); + auto ret = _create_context_for_min_max_all_null_group(slot_id); + + ASSERT_TRUE(ret.ok()); + ASSERT_OK(file_reader->init(ret.value())); + ASSERT_EQ(file_reader->row_group_size(), 0); } } // namespace starrocks::parquet diff --git a/be/test/formats/parquet/parquet_ut_base.cpp b/be/test/formats/parquet/parquet_ut_base.cpp index 41e9963bf8995..c3c03387a745f 100644 --- a/be/test/formats/parquet/parquet_ut_base.cpp +++ b/be/test/formats/parquet/parquet_ut_base.cpp @@ -20,6 +20,7 @@ #include "gen_cpp/Types_types.h" #include "storage/predicate_parser.h" #include "testutil/assert.h" +#include "testutil/exprs_test_helper.h" #include "types/logical_type.h" namespace starrocks::parquet { @@ -31,84 +32,98 @@ void ParquetUTBase::create_conjunct_ctxs(ObjectPool* pool, RuntimeState* runtime ASSERT_OK(Expr::open(*conjunct_ctxs, runtime_state)); } +void ParquetUTBase::append_decimal_conjunct(TExprOpcode::type opcode, SlotId slot_id, const std::string& value, + std::vector* tExprs) { + TTypeDesc decimal_type = ExprsTestHelper::create_decimal_type_desc(TPrimitiveType::DECIMAL128, 27, 9); + + TExprNode binary_pred = ExprsTestHelper::create_binary_pred_node(TPrimitiveType::DECIMAL128, opcode); + TExprNode decimal_col_ref = ExprsTestHelper::create_slot_expr_node(0, slot_id, decimal_type, true); + TExprNode decimal_literal = ExprsTestHelper::create_decimal_literal(value, decimal_type, false); + + TExpr t_expr; + t_expr.nodes.emplace_back(binary_pred); + t_expr.nodes.emplace_back(decimal_col_ref); + t_expr.nodes.emplace_back(decimal_literal); + + tExprs->emplace_back(t_expr); +} + +void ParquetUTBase::append_smallint_conjunct(TExprOpcode::type opcode, SlotId slot_id, int value, + std::vector* tExprs) { + TTypeDesc smallint_type = ExprsTestHelper::create_scalar_type_desc(TPrimitiveType::SMALLINT); + + TExprNode pred_node = ExprsTestHelper::create_binary_pred_node(TPrimitiveType::SMALLINT, opcode); + TExprNode smallint_col_ref = ExprsTestHelper::create_slot_expr_node(0, slot_id, smallint_type, true); + TExprNode smallint_literal = ExprsTestHelper::create_int_literal(value, smallint_type, false); + + TExpr t_expr; + t_expr.nodes.emplace_back(pred_node); + t_expr.nodes.emplace_back(smallint_col_ref); + t_expr.nodes.emplace_back(smallint_literal); + + tExprs->emplace_back(t_expr); +} + void ParquetUTBase::append_int_conjunct(TExprOpcode::type opcode, SlotId slot_id, int value, std::vector* tExprs) { - std::vector nodes; + TTypeDesc int_type = ExprsTestHelper::create_scalar_type_desc(TPrimitiveType::INT); - TExprNode node0; - node0.node_type = TExprNodeType::BINARY_PRED; - node0.opcode = opcode; - node0.child_type = TPrimitiveType::INT; - node0.num_children = 2; - node0.__isset.opcode = true; - node0.__isset.child_type = true; - node0.type = gen_type_desc(TPrimitiveType::BOOLEAN); - nodes.emplace_back(node0); + TExprNode pred_node = ExprsTestHelper::create_binary_pred_node(TPrimitiveType::INT, opcode); + TExprNode int_col_ref = ExprsTestHelper::create_slot_expr_node(0, slot_id, int_type, true); + TExprNode int_literal = ExprsTestHelper::create_int_literal(value, int_type, false); - TExprNode node1; - node1.node_type = TExprNodeType::SLOT_REF; - node1.type = gen_type_desc(TPrimitiveType::INT); - node1.num_children = 0; - TSlotRef t_slot_ref = TSlotRef(); - t_slot_ref.slot_id = slot_id; - t_slot_ref.tuple_id = 0; - node1.__set_slot_ref(t_slot_ref); - node1.is_nullable = true; - nodes.emplace_back(node1); + TExpr t_expr; + t_expr.nodes.emplace_back(pred_node); + t_expr.nodes.emplace_back(int_col_ref); + t_expr.nodes.emplace_back(int_literal); - TExprNode node2; - node2.node_type = TExprNodeType::INT_LITERAL; - node2.type = gen_type_desc(TPrimitiveType::INT); - node2.num_children = 0; - TIntLiteral int_literal; - int_literal.value = value; - node2.__set_int_literal(int_literal); - node2.is_nullable = false; - nodes.emplace_back(node2); + tExprs->emplace_back(t_expr); +} + +void ParquetUTBase::append_bigint_conjunct(TExprOpcode::type opcode, SlotId slot_id, int64_t value, + std::vector* tExprs) { + TTypeDesc int_type = ExprsTestHelper::create_scalar_type_desc(TPrimitiveType::BIGINT); + + TExprNode pred_node = ExprsTestHelper::create_binary_pred_node(TPrimitiveType::BIGINT, opcode); + TExprNode int_col_ref = ExprsTestHelper::create_slot_expr_node(0, slot_id, int_type, true); + TExprNode int_literal = ExprsTestHelper::create_int_literal(value, int_type, false); TExpr t_expr; - t_expr.nodes = nodes; + t_expr.nodes.emplace_back(pred_node); + t_expr.nodes.emplace_back(int_col_ref); + t_expr.nodes.emplace_back(int_literal); tExprs->emplace_back(t_expr); } -void ParquetUTBase::append_string_conjunct(TExprOpcode::type opcode, starrocks::SlotId slot_id, std::string value, - std::vector* tExprs) { - std::vector nodes; +void ParquetUTBase::append_datetime_conjunct(TExprOpcode::type opcode, SlotId slot_id, const std::string& value, + std::vector* tExprs) { + TTypeDesc datetime_type = ExprsTestHelper::create_scalar_type_desc(TPrimitiveType::DATETIME); - TExprNode node0; - node0.node_type = TExprNodeType::BINARY_PRED; - node0.opcode = opcode; - node0.child_type = TPrimitiveType::VARCHAR; - node0.num_children = 2; - node0.__isset.opcode = true; - node0.__isset.child_type = true; - node0.type = gen_type_desc(TPrimitiveType::BOOLEAN); - nodes.emplace_back(node0); + TExprNode pred_node = ExprsTestHelper::create_binary_pred_node(TPrimitiveType::DATETIME, opcode); + TExprNode datetime_col_ref = ExprsTestHelper::create_slot_expr_node(0, slot_id, datetime_type, true); + TExprNode datetime_literal = ExprsTestHelper::create_date_literal(value, datetime_type, false); - TExprNode node1; - node1.node_type = TExprNodeType::SLOT_REF; - node1.type = gen_type_desc(TPrimitiveType::VARCHAR); - node1.num_children = 0; - TSlotRef t_slot_ref = TSlotRef(); - t_slot_ref.slot_id = slot_id; - t_slot_ref.tuple_id = 0; - node1.__set_slot_ref(t_slot_ref); - node1.is_nullable = true; - nodes.emplace_back(node1); + TExpr t_expr; + t_expr.nodes.emplace_back(pred_node); + t_expr.nodes.emplace_back(datetime_col_ref); + t_expr.nodes.emplace_back(datetime_literal); + + tExprs->emplace_back(t_expr); +} - TExprNode node2; - node2.node_type = TExprNodeType::STRING_LITERAL; - node2.type = gen_type_desc(TPrimitiveType::VARCHAR); - node2.num_children = 0; - TStringLiteral string_literal; - string_literal.value = value; - node2.__set_string_literal(string_literal); - node2.is_nullable = false; - nodes.emplace_back(node2); +void ParquetUTBase::append_string_conjunct(TExprOpcode::type opcode, starrocks::SlotId slot_id, std::string value, + std::vector* tExprs) { + TTypeDesc varchar_type = ExprsTestHelper::create_varchar_type_desc(10); + + TExprNode pre_node = ExprsTestHelper::create_binary_pred_node(TPrimitiveType::VARCHAR, opcode); + TExprNode varchar_col_ref = ExprsTestHelper::create_slot_expr_node(0, slot_id, varchar_type, true); + TExprNode varchar_literal = ExprsTestHelper::create_varchar_literal(value, varchar_type, false); TExpr t_expr; - t_expr.nodes = nodes; + t_expr.nodes.emplace_back(pre_node); + t_expr.nodes.emplace_back(varchar_col_ref); + t_expr.nodes.emplace_back(varchar_literal); tExprs->emplace_back(t_expr); } diff --git a/be/test/formats/parquet/parquet_ut_base.h b/be/test/formats/parquet/parquet_ut_base.h index 3b477bcc45e04..35e848a4753d4 100644 --- a/be/test/formats/parquet/parquet_ut_base.h +++ b/be/test/formats/parquet/parquet_ut_base.h @@ -32,7 +32,15 @@ class ParquetUTBase { static void create_conjunct_ctxs(ObjectPool* pool, RuntimeState* runtime_state, std::vector* tExprs, std::vector* conjunct_ctxs); + static void append_smallint_conjunct(TExprOpcode::type opcode, SlotId slot_id, int value, + std::vector* tExprs); static void append_int_conjunct(TExprOpcode::type opcode, SlotId slot_id, int value, std::vector* tExprs); + static void append_bigint_conjunct(TExprOpcode::type opcode, SlotId slot_id, int64_t value, + std::vector* tExprs); + static void append_datetime_conjunct(TExprOpcode::type opcode, SlotId slot_id, const std::string& value, + std::vector* tExprs); + static void append_decimal_conjunct(TExprOpcode::type opcode, SlotId slot_id, const std::string& value, + std::vector* tExprs); static void append_string_conjunct(TExprOpcode::type opcode, SlotId slot_id, std::string value, std::vector* tExprs); diff --git a/be/test/formats/parquet/statistics_helper_test.cpp b/be/test/formats/parquet/statistics_helper_test.cpp index f91deaf1a88ac..efa88e19e90e4 100644 --- a/be/test/formats/parquet/statistics_helper_test.cpp +++ b/be/test/formats/parquet/statistics_helper_test.cpp @@ -16,7 +16,6 @@ #include -#include "formats/parquet/parquet_test_util/util.h" #include "formats/parquet/parquet_ut_base.h" #include "formats/parquet/schema.h" #include "testutil/assert.h" @@ -60,9 +59,10 @@ TEST_F(StatisticsHelperTest, TestInFilterInt) { EXPECT_EQ(ctxs.size(), 1); std::vector null_counts{0, 0}; + std::vector null_pages{false, false}; Filter selected(min_values.size(), true); - auto st = StatisticsHelper::in_filter_on_min_max_stat(min_values, max_values, null_counts, ctxs[0], &field, - timezone, selected); + auto st = StatisticsHelper::in_filter_on_min_max_stat(min_values, max_values, null_pages, null_counts, ctxs[0], + &field, timezone, selected); ASSERT_OK(st); ASSERT_TRUE(selected[0]); ASSERT_FALSE(selected[1]); @@ -85,9 +85,10 @@ TEST_F(StatisticsHelperTest, TestInFilterString) { EXPECT_EQ(ctxs.size(), 1); std::vector null_counts{0}; + std::vector null_pages{false}; Filter selected(min_values.size(), true); - auto st = StatisticsHelper::in_filter_on_min_max_stat(min_values, max_values, null_counts, ctxs[0], &field, - timezone, selected); + auto st = StatisticsHelper::in_filter_on_min_max_stat(min_values, max_values, null_pages, null_counts, ctxs[0], + &field, timezone, selected); ASSERT_OK(st); ASSERT_FALSE(selected[0]); } @@ -102,9 +103,10 @@ TEST_F(StatisticsHelperTest, TestInFilterString) { EXPECT_EQ(ctxs.size(), 1); std::vector null_counts{0}; + std::vector null_pages{false}; Filter selected(min_values.size(), true); - auto st = StatisticsHelper::in_filter_on_min_max_stat(min_values, max_values, null_counts, ctxs[0], &field, - timezone, selected); + auto st = StatisticsHelper::in_filter_on_min_max_stat(min_values, max_values, null_pages, null_counts, ctxs[0], + &field, timezone, selected); ASSERT_OK(st); ASSERT_TRUE(selected[0]); } @@ -133,9 +135,10 @@ TEST_F(StatisticsHelperTest, TestInFilterDate) { EXPECT_EQ(ctxs.size(), 1); std::vector null_counts{0, 0}; + std::vector null_pages{false, false}; Filter selected(min_values.size(), true); - auto st = StatisticsHelper::in_filter_on_min_max_stat(min_values, max_values, null_counts, ctxs[0], &field, - timezone, selected); + auto st = StatisticsHelper::in_filter_on_min_max_stat(min_values, max_values, null_pages, null_counts, ctxs[0], + &field, timezone, selected); ASSERT_OK(st); ASSERT_TRUE(selected[0]); ASSERT_FALSE(selected[1]); @@ -170,9 +173,10 @@ TEST_F(StatisticsHelperTest, TestInFilterDatetime) { EXPECT_EQ(ctxs.size(), 1); std::vector null_counts{0, 0}; + std::vector null_pages{false, false}; Filter selected(min_values.size(), true); - auto st = StatisticsHelper::in_filter_on_min_max_stat(min_values, max_values, null_counts, ctxs[0], &field, - timezone, selected); + auto st = StatisticsHelper::in_filter_on_min_max_stat(min_values, max_values, null_pages, null_counts, ctxs[0], + &field, timezone, selected); ASSERT_OK(st); ASSERT_TRUE(selected[0]); ASSERT_FALSE(selected[1]); diff --git a/be/test/formats/parquet/test_data/all_null.parquet b/be/test/formats/parquet/test_data/all_null.parquet new file mode 100644 index 0000000000000..1aed28ccf7607 Binary files /dev/null and b/be/test/formats/parquet/test_data/all_null.parquet differ diff --git a/be/test/formats/parquet/test_data/has_null_page.parquet b/be/test/formats/parquet/test_data/has_null_page.parquet new file mode 100644 index 0000000000000..fceea2f8607f9 Binary files /dev/null and b/be/test/formats/parquet/test_data/has_null_page.parquet differ diff --git a/test/sql/test_iceberg/R/test_one_group_all_null b/test/sql/test_iceberg/R/test_one_group_all_null new file mode 100644 index 0000000000000..91fe68bd6d43e --- /dev/null +++ b/test/sql/test_iceberg/R/test_one_group_all_null @@ -0,0 +1,11 @@ +-- name: test_one_group_all_null +create external catalog test_one_group_all_null_${uuid0} PROPERTIES ("type"="iceberg", "iceberg.catalog.type"="hive", "iceberg.catalog.hive.metastore.uris"="${iceberg_catalog_hive_metastore_uris}","enable_iceberg_metadata_cache"="true","aws.s3.access_key" = "${oss_ak}","aws.s3.secret_key" = "${oss_sk}","aws.s3.endpoint" = "${oss_endpoint}"); +-- result: +-- !result +select count(*) from test_one_group_all_null_${uuid0}.iceberg_oss_db.one_group_all_null where c2 is null; +-- result: +3 +-- !result +drop catalog test_one_group_all_null_${uuid0}; +-- result: +-- !result \ No newline at end of file diff --git a/test/sql/test_iceberg/T/test_one_group_all_null b/test/sql/test_iceberg/T/test_one_group_all_null new file mode 100644 index 0000000000000..c6debc49e011c --- /dev/null +++ b/test/sql/test_iceberg/T/test_one_group_all_null @@ -0,0 +1,7 @@ +-- name: test_one_group_all_null + +create external catalog test_one_group_all_null_${uuid0} PROPERTIES ("type"="iceberg", "iceberg.catalog.type"="hive", "iceberg.catalog.hive.metastore.uris"="${iceberg_catalog_hive_metastore_uris}","enable_iceberg_metadata_cache"="true","aws.s3.access_key" = "${oss_ak}","aws.s3.secret_key" = "${oss_sk}","aws.s3.endpoint" = "${oss_endpoint}"); + +select count(*) from test_one_group_all_null_${uuid0}.iceberg_oss_db.one_group_all_null where c2 is null; + +drop catalog test_one_group_all_null_${uuid0};