From e1c4f8f26a6562dfc1dcb2e9058ac62ef2163337 Mon Sep 17 00:00:00 2001
From: zombee0
Date: Fri, 3 Jan 2025 15:17:10 +0800
Subject: [PATCH] [Refactor] Abstract out the logical-type-independent
 RawColumnReader for future optimization

Signed-off-by: zombee0
---
 .../formats/parquet/scalar_column_reader.cpp  | 338 +++++++++---------
 be/src/formats/parquet/scalar_column_reader.h | 110 +++---
 2 files changed, 241 insertions(+), 207 deletions(-)

diff --git a/be/src/formats/parquet/scalar_column_reader.cpp b/be/src/formats/parquet/scalar_column_reader.cpp
index 47fb03677ed5b..cd3fa5089a671 100644
--- a/be/src/formats/parquet/scalar_column_reader.cpp
+++ b/be/src/formats/parquet/scalar_column_reader.cpp
@@ -24,6 +24,8 @@
 
 namespace starrocks::parquet {
 
+// FixedValueColumnReader
+
 StatusOr<bool> FixedValueColumnReader::row_group_zone_map_filter(const std::vector<const ColumnPredicate*>& predicates,
                                                                  CompoundNodeType pred_relation,
                                                                  const uint64_t rg_first_row,
@@ -45,153 +47,10 @@ StatusOr<bool> FixedValueColumnReader::page_index_zone_map_filter(const std::vec
     return !ZoneMapEvaluatorUtils::is_satisfy(predicates, zone_map, pred_relation);
 }
 
-Status ScalarColumnReader::read_range(const Range<uint64_t>& range, const Filter* filter, ColumnPtr& dst) {
-    DCHECK(get_column_parquet_field()->is_nullable ? dst->is_nullable() : true);
-    _need_lazy_decode =
-            _dict_filter_ctx != nullptr || (_can_lazy_decode && filter != nullptr &&
-                                            SIMD::count_nonzero(*filter) * 1.0 / filter->size() < FILTER_RATIO);
-    ColumnContentType content_type = !_need_lazy_decode ? ColumnContentType::VALUE : ColumnContentType::DICT_CODE;
-    if (_need_lazy_decode) {
-        if (_dict_code == nullptr) {
-            _dict_code = ColumnHelper::create_column(
-                    TypeDescriptor::from_logical_type(ColumnDictFilterContext::kDictCodePrimitiveType), true);
-        }
-        _ori_column = dst;
-        dst = _dict_code;
-        dst->reserve(range.span_size());
-    }
-    if (!_converter->need_convert) {
-        SCOPED_RAW_TIMER(&_opts.stats->column_read_ns);
-        return _reader->read_range(range, filter, content_type, dst.get());
-    } else {
-        auto column = _converter->create_src_column();
-        {
-            SCOPED_RAW_TIMER(&_opts.stats->column_read_ns);
-            RETURN_IF_ERROR(_reader->read_range(range, filter, content_type, column.get()));
-        }
-        SCOPED_RAW_TIMER(&_opts.stats->column_convert_ns);
-        return _converter->convert(column, dst.get());
-    }
-}
-
-bool ScalarColumnReader::try_to_use_dict_filter(ExprContext* ctx, bool is_decode_needed, const SlotId slotId,
-                                                const std::vector<std::string>& sub_field_path, const size_t& layer) {
-    if (sub_field_path.size() != layer) {
-        return false;
-    }
-
-    if (!_col_type->is_string_type()) {
-        return false;
-    }
-
-    if (_column_all_pages_dict_encoded()) {
-        if (_dict_filter_ctx == nullptr) {
-            _dict_filter_ctx = std::make_unique<ColumnDictFilterContext>();
-            _dict_filter_ctx->is_decode_needed = is_decode_needed;
-            _dict_filter_ctx->sub_field_path = sub_field_path;
-            _dict_filter_ctx->slot_id = slotId;
-        }
-        _dict_filter_ctx->conjunct_ctxs.push_back(ctx);
-        return true;
-    } else {
-        return false;
-    }
-}
-
-Status ScalarColumnReader::fill_dst_column(ColumnPtr& dst, ColumnPtr& src) {
-    if (!_need_lazy_decode) {
-        dst->swap_column(*src);
-    } else {
-        if (_dict_filter_ctx == nullptr || _dict_filter_ctx->is_decode_needed) {
-            ColumnPtr& dict_values = dst;
-            dict_values->reserve(src->size());
-
-            // decode dict code to dict values.
-            // note that in dict code, there could be null value.
-            const ColumnPtr& dict_codes = src;
-            auto* codes_nullable_column = ColumnHelper::as_raw_column<NullableColumn>(dict_codes);
-            auto* codes_column =
-                    ColumnHelper::as_raw_column<FixedLengthColumn<int32_t>>(codes_nullable_column->data_column());
-            RETURN_IF_ERROR(
-                    _reader->get_dict_values(codes_column->get_data(), *codes_nullable_column, dict_values.get()));
-            DCHECK_EQ(dict_codes->size(), dict_values->size());
-            if (dict_values->is_nullable()) {
-                auto* nullable_codes = down_cast<NullableColumn*>(dict_codes.get());
-                auto* nullable_values = down_cast<NullableColumn*>(dict_values.get());
-                nullable_values->null_column_data().swap(nullable_codes->null_column_data());
-                nullable_values->set_has_null(nullable_codes->has_null());
-            }
-        } else {
-            dst->append_default(src->size());
-        }
-
-        src->reset_column();
-        src = _ori_column;
-    }
-    return Status::OK();
-}
-
-bool ScalarColumnReader::_column_all_pages_dict_encoded() {
-    // The Parquet spec allows for column chunks to have mixed encodings
-    // where some data pages are dictionary-encoded and others are plain
-    // encoded. For example, a Parquet file writer might start writing
-    // a column chunk as dictionary encoded, but it will switch to plain
-    // encoding if the dictionary grows too large.
-    //
-    // In order for dictionary filters to skip the entire row group,
-    // the conjuncts must be evaluated on column chunks that are entirely
-    // encoded with the dictionary encoding. There are two checks
-    // available to verify this:
-    // 1. The encoding_stats field on the column chunk metadata provides
-    //    information about the number of data pages written in each
-    //    format. This allows for a specific check of whether all the
-    //    data pages are dictionary encoded.
-    // 2. The encodings field on the column chunk metadata lists the
-    //    encodings used. If this list contains the dictionary encoding
-    //    and does not include unexpected encodings (i.e. encodings not
-    //    associated with definition/repetition levels), then it is entirely
-    //    dictionary encoded.
-    const tparquet::ColumnMetaData& column_metadata = get_chunk_metadata()->meta_data;
-    if (column_metadata.__isset.encoding_stats) {
-        // Condition #1 above
-        for (const tparquet::PageEncodingStats& enc_stat : column_metadata.encoding_stats) {
-            if (enc_stat.page_type == tparquet::PageType::DATA_PAGE &&
-                (enc_stat.encoding != tparquet::Encoding::PLAIN_DICTIONARY &&
-                 enc_stat.encoding != tparquet::Encoding::RLE_DICTIONARY) &&
-                enc_stat.count > 0) {
-                return false;
-            }
-        }
-    } else {
-        // Condition #2 above
-        bool has_dict_encoding = false;
-        bool has_nondict_encoding = false;
-        for (const tparquet::Encoding::type& encoding : column_metadata.encodings) {
-            if (encoding == tparquet::Encoding::PLAIN_DICTIONARY || encoding == tparquet::Encoding::RLE_DICTIONARY) {
-                has_dict_encoding = true;
-            }
-
-            // RLE and BIT_PACKED are used for repetition/definition levels
-            if (encoding != tparquet::Encoding::PLAIN_DICTIONARY && encoding != tparquet::Encoding::RLE_DICTIONARY &&
-                encoding != tparquet::Encoding::RLE && encoding != tparquet::Encoding::BIT_PACKED) {
-                has_nondict_encoding = true;
-                break;
-            }
-        }
-        // Not entirely dictionary encoded if:
-        // 1. No dictionary encoding listed
-        // OR
-        // 2. Some non-dictionary encoding is listed
-        if (!has_dict_encoding || has_nondict_encoding) {
-            return false;
-        }
-    }
-
-    return true;
-}
+// RawColumnReader
 
-void ScalarColumnReader::collect_column_io_range(std::vector<io::SharedBufferedInputStream::IORange>* ranges,
-                                                 int64_t* end_offset, ColumnIOType type, bool active) {
+void RawColumnReader::collect_column_io_range(std::vector<io::SharedBufferedInputStream::IORange>* ranges,
+                                              int64_t* end_offset, ColumnIOType type, bool active) {
     const auto& column = *get_chunk_metadata();
     if (type == ColumnIOType::PAGES) {
         const tparquet::ColumnMetaData& column_metadata = column.meta_data;
@@ -231,7 +90,7 @@ void ScalarColumnReader::collect_column_io_range(std::vector<io::SharedBufferedI
     }
 }
 
-void ScalarColumnReader::select_offset_index(const SparseRange<uint64_t>& range, const uint64_t rg_first_row) {
+void RawColumnReader::select_offset_index(const SparseRange<uint64_t>& range, const uint64_t rg_first_row) {
     if (_offset_index_ctx == nullptr) {
         if (!get_chunk_metadata()->__isset.offset_index_offset) {
             return;
@@ -277,10 +136,69 @@ void ScalarColumnReader::select_offset_index(const SparseRange<uint64_t>& range
     _reader = std::make_unique<StoredColumnReaderWithIndex>(std::move(_reader), _offset_index_ctx.get(), has_dict_page);
 }
 
-StatusOr<bool> ScalarColumnReader::row_group_zone_map_filter(const std::vector<const ColumnPredicate*>& predicates,
-                                                             CompoundNodeType pred_relation,
-                                                             const uint64_t rg_first_row,
-                                                             const uint64_t rg_num_rows) const {
+bool RawColumnReader::column_all_pages_dict_encoded() const {
+    // The Parquet spec allows for column chunks to have mixed encodings
+    // where some data pages are dictionary-encoded and others are plain
+    // encoded. For example, a Parquet file writer might start writing
+    // a column chunk as dictionary encoded, but it will switch to plain
+    // encoding if the dictionary grows too large.
+    //
+    // In order for dictionary filters to skip the entire row group,
+    // the conjuncts must be evaluated on column chunks that are entirely
+    // encoded with the dictionary encoding. There are two checks
+    // available to verify this:
+    // 1. The encoding_stats field on the column chunk metadata provides
+    //    information about the number of data pages written in each
+    //    format. This allows for a specific check of whether all the
+    //    data pages are dictionary encoded.
+    // 2. The encodings field on the column chunk metadata lists the
+    //    encodings used. If this list contains the dictionary encoding
+    //    and does not include unexpected encodings (i.e. encodings not
+    //    associated with definition/repetition levels), then it is entirely
+    //    dictionary encoded.
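+    //
+    // Editor's note (illustrative example, not part of the original comment):
+    // a chunk written as one dictionary page followed by three
+    // dictionary-encoded data pages would carry encoding_stats such as
+    //     {page_type=DICTIONARY_PAGE, encoding=PLAIN_DICTIONARY, count=1}
+    //     {page_type=DATA_PAGE,       encoding=RLE_DICTIONARY,   count=3}
+    // and pass check #1; any DATA_PAGE entry with a non-dictionary encoding
+    // and count > 0 makes the check fail.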
+    const tparquet::ColumnMetaData& column_metadata = get_chunk_metadata()->meta_data;
+    if (column_metadata.__isset.encoding_stats) {
+        // Condition #1 above
+        for (const tparquet::PageEncodingStats& enc_stat : column_metadata.encoding_stats) {
+            if (enc_stat.page_type == tparquet::PageType::DATA_PAGE &&
+                (enc_stat.encoding != tparquet::Encoding::PLAIN_DICTIONARY &&
+                 enc_stat.encoding != tparquet::Encoding::RLE_DICTIONARY) &&
+                enc_stat.count > 0) {
+                return false;
+            }
+        }
+    } else {
+        // Condition #2 above
+        bool has_dict_encoding = false;
+        bool has_nondict_encoding = false;
+        for (const tparquet::Encoding::type& encoding : column_metadata.encodings) {
+            if (encoding == tparquet::Encoding::PLAIN_DICTIONARY || encoding == tparquet::Encoding::RLE_DICTIONARY) {
+                has_dict_encoding = true;
+            }
+
+            // RLE and BIT_PACKED are used for repetition/definition levels
+            if (encoding != tparquet::Encoding::PLAIN_DICTIONARY && encoding != tparquet::Encoding::RLE_DICTIONARY &&
+                encoding != tparquet::Encoding::RLE && encoding != tparquet::Encoding::BIT_PACKED) {
+                has_nondict_encoding = true;
+                break;
+            }
+        }
+        // Not entirely dictionary encoded if:
+        // 1. No dictionary encoding listed
+        // OR
+        // 2. Some non-dictionary encoding is listed
+        if (!has_dict_encoding || has_nondict_encoding) {
+            return false;
+        }
+    }
+
+    return true;
+}
+
+StatusOr<bool> RawColumnReader::_row_group_zone_map_filter(const std::vector<const ColumnPredicate*>& predicates,
+                                                           CompoundNodeType pred_relation,
+                                                           const TypeDescriptor& col_type, const uint64_t rg_first_row,
+                                                           const uint64_t rg_num_rows) const {
     if (!get_chunk_metadata()->meta_data.__isset.statistics || get_column_parquet_field() == nullptr) {
         // statistics is not existed, select all
         return true;
@@ -299,8 +217,8 @@ StatusOr<bool> ScalarColumnReader::row_group_zone_map_filter(const std::vector<c
     std::optional<ZoneMapDetail> zone_map_detail = std::nullopt;
 
     // used to hold min/max slice values
-    const ColumnPtr min_column = ColumnHelper::create_column(*_col_type, true);
-    const ColumnPtr max_column = ColumnHelper::create_column(*_col_type, true);
+    const ColumnPtr min_column = ColumnHelper::create_column(col_type, true);
+    const ColumnPtr max_column = ColumnHelper::create_column(col_type, true);
     if (is_all_null) {
         // if the entire column's value is null, the min/max value not existed
         zone_map_detail = ZoneMapDetail{Datum{}, Datum{}, true};
@@ -310,12 +228,12 @@ StatusOr<bool> ScalarColumnReader::row_group_zone_map_filter(const std::vector<c
         std::vector<std::string> max_values;
         std::vector<bool> null_pages{false};
         Status st =
-                StatisticsHelper::get_min_max_value(_opts.file_meta_data, *_col_type, &get_chunk_metadata()->meta_data,
+                StatisticsHelper::get_min_max_value(_opts.file_meta_data, col_type, &get_chunk_metadata()->meta_data,
                                                     get_column_parquet_field(), min_values, max_values);
         if (st.ok()) {
-            RETURN_IF_ERROR(StatisticsHelper::decode_value_into_column(min_column, min_values, null_pages, *_col_type,
+            RETURN_IF_ERROR(StatisticsHelper::decode_value_into_column(min_column, min_values, null_pages, col_type,
                                                                        get_column_parquet_field(), _opts.timezone));
-            RETURN_IF_ERROR(StatisticsHelper::decode_value_into_column(max_column, max_values, null_pages, *_col_type,
+            RETURN_IF_ERROR(StatisticsHelper::decode_value_into_column(max_column, max_values, null_pages, col_type,
                                                                        get_column_parquet_field(), _opts.timezone));
             zone_map_detail = ZoneMapDetail{min_column->get(0), max_column->get(0), has_null};
@@ -331,10 +249,11 @@ StatusOr<bool> ScalarColumnReader::row_group_zone_map_filter(const std::vector<c
     return !ZoneMapEvaluatorUtils::is_satisfy(predicates, zone_map_detail.value(), pred_relation);
 }
 
-StatusOr<bool> ScalarColumnReader::page_index_zone_map_filter(const std::vector<const ColumnPredicate*>& predicates,
-                                                              SparseRange<uint64_t>* row_ranges,
-                                                              CompoundNodeType pred_relation,
-                                                              const uint64_t rg_first_row, const uint64_t rg_num_rows) {
+StatusOr<bool> RawColumnReader::_page_index_zone_map_filter(const std::vector<const ColumnPredicate*>& predicates,
+                                                            SparseRange<uint64_t>* row_ranges,
+                                                            CompoundNodeType pred_relation,
+                                                            const TypeDescriptor& col_type, const uint64_t rg_first_row,
+                                                            const uint64_t rg_num_rows) {
     DCHECK(row_ranges->empty());
     const tparquet::ColumnChunk* chunk_meta = get_chunk_metadata();
     if (!chunk_meta->__isset.column_index_offset || !chunk_meta->__isset.offset_index_offset ||
@@ -360,22 +279,22 @@ StatusOr<bool> ScalarColumnReader::page_index_zone_map_filter(const std::vector<
     const size_t page_num = column_index.min_values.size();
     const std::vector<bool> null_pages = column_index.null_pages;
 
-    ColumnPtr min_column = ColumnHelper::create_column(*_col_type, true);
-    ColumnPtr max_column = ColumnHelper::create_column(*_col_type, true);
+    ColumnPtr min_column = ColumnHelper::create_column(col_type, true);
+    ColumnPtr max_column = ColumnHelper::create_column(col_type, true);
     // deal with min_values
-    auto st = StatisticsHelper::decode_value_into_column(min_column, column_index.min_values, null_pages, *_col_type,
+    auto st = StatisticsHelper::decode_value_into_column(min_column, column_index.min_values, null_pages, col_type,
                                                          get_column_parquet_field(), _opts.timezone);
     if (!st.ok()) {
         // swallow error status
-        LOG(INFO) << "Error when decode min/max statistics, type " << _col_type->debug_string();
+        LOG(INFO) << "Error when decode min/max statistics, type " << col_type.debug_string();
         return false;
     }
     // deal with max_values
-    st = StatisticsHelper::decode_value_into_column(max_column, column_index.max_values, null_pages, *_col_type,
+    st = StatisticsHelper::decode_value_into_column(max_column, column_index.max_values, null_pages, col_type,
                                                     get_column_parquet_field(), _opts.timezone);
     if (!st.ok()) {
         // swallow error status
-        LOG(INFO) << "Error when decode min/max statistics, type " << _col_type->debug_string();
+        LOG(INFO) << "Error when decode min/max statistics, type " << col_type.debug_string();
         return false;
     }
 
@@ -420,4 +339,91 @@ StatusOr<bool> ScalarColumnReader::page_index_zone_map_filter(const std::vector<
     return true;
 }
 
+// ScalarColumnReader
+
+Status ScalarColumnReader::read_range(const Range<uint64_t>& range, const Filter* filter, ColumnPtr& dst) {
+    DCHECK(get_column_parquet_field()->is_nullable ? dst->is_nullable() : true);
+    _need_lazy_decode =
+            _dict_filter_ctx != nullptr || (_can_lazy_decode && filter != nullptr &&
+                                            SIMD::count_nonzero(*filter) * 1.0 / filter->size() < FILTER_RATIO);
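+    // Editor's note (illustrative, FILTER_RATIO's value is not shown in this
+    // patch): lazy decode is chosen above either because a dict filter is
+    // attached, or because the filter is selective enough -- e.g. assuming
+    // FILTER_RATIO = 0.2, a filter keeping 50 of 1000 rows (5%) stays below the
+    // ratio, so dict codes are read here and only surviving rows are
+    // materialized later in fill_dst_column().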
+    ColumnContentType content_type = !_need_lazy_decode ? ColumnContentType::VALUE : ColumnContentType::DICT_CODE;
+    if (_need_lazy_decode) {
+        if (_dict_code == nullptr) {
+            _dict_code = ColumnHelper::create_column(
+                    TypeDescriptor::from_logical_type(ColumnDictFilterContext::kDictCodePrimitiveType), true);
+        }
+        _ori_column = dst;
+        dst = _dict_code;
+        dst->reserve(range.span_size());
+    }
+    if (!_converter->need_convert) {
+        SCOPED_RAW_TIMER(&_opts.stats->column_read_ns);
+        return _reader->read_range(range, filter, content_type, dst.get());
+    } else {
+        auto column = _converter->create_src_column();
+        {
+            SCOPED_RAW_TIMER(&_opts.stats->column_read_ns);
+            RETURN_IF_ERROR(_reader->read_range(range, filter, content_type, column.get()));
+        }
+        SCOPED_RAW_TIMER(&_opts.stats->column_convert_ns);
+        return _converter->convert(column, dst.get());
+    }
+}
+
+bool ScalarColumnReader::try_to_use_dict_filter(ExprContext* ctx, bool is_decode_needed, const SlotId slotId,
+                                                const std::vector<std::string>& sub_field_path, const size_t& layer) {
+    if (sub_field_path.size() != layer) {
+        return false;
+    }
+
+    if (!_col_type->is_string_type()) {
+        return false;
+    }
+
+    if (column_all_pages_dict_encoded()) {
+        if (_dict_filter_ctx == nullptr) {
+            _dict_filter_ctx = std::make_unique<ColumnDictFilterContext>();
+            _dict_filter_ctx->is_decode_needed = is_decode_needed;
+            _dict_filter_ctx->sub_field_path = sub_field_path;
+            _dict_filter_ctx->slot_id = slotId;
+        }
+        _dict_filter_ctx->conjunct_ctxs.push_back(ctx);
+        return true;
+    } else {
+        return false;
+    }
+}
+
+Status ScalarColumnReader::fill_dst_column(ColumnPtr& dst, ColumnPtr& src) {
+    if (!_need_lazy_decode) {
+        dst->swap_column(*src);
+    } else {
+        if (_dict_filter_ctx == nullptr || _dict_filter_ctx->is_decode_needed) {
+            ColumnPtr& dict_values = dst;
+            dict_values->reserve(src->size());
+
+            // decode dict code to dict values.
+            // note that in dict code, there could be null value.
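+            // Editor's note (illustrative values): with dict = ["ab", "cd"] and
+            // codes = [0, null, 1], get_dict_values() fills the values column
+            // row by row, and the null bitmap is carried over from the codes
+            // column by the swap below.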
+            const ColumnPtr& dict_codes = src;
+            auto* codes_nullable_column = ColumnHelper::as_raw_column<NullableColumn>(dict_codes);
+            auto* codes_column =
+                    ColumnHelper::as_raw_column<FixedLengthColumn<int32_t>>(codes_nullable_column->data_column());
+            RETURN_IF_ERROR(
+                    _reader->get_dict_values(codes_column->get_data(), *codes_nullable_column, dict_values.get()));
+            DCHECK_EQ(dict_codes->size(), dict_values->size());
+            if (dict_values->is_nullable()) {
+                auto* nullable_codes = down_cast<NullableColumn*>(dict_codes.get());
+                auto* nullable_values = down_cast<NullableColumn*>(dict_values.get());
+                nullable_values->null_column_data().swap(nullable_codes->null_column_data());
+                nullable_values->set_has_null(nullable_codes->has_null());
+            }
+        } else {
+            dst->append_default(src->size());
+        }
+
+        src->reset_column();
+        src = _ori_column;
+    }
+    return Status::OK();
+}
 } // namespace starrocks::parquet
\ No newline at end of file
diff --git a/be/src/formats/parquet/scalar_column_reader.h b/be/src/formats/parquet/scalar_column_reader.h
index 063a0036cd60a..773b341bb1de6 100644
--- a/be/src/formats/parquet/scalar_column_reader.h
+++ b/be/src/formats/parquet/scalar_column_reader.h
@@ -55,48 +55,23 @@ class FixedValueColumnReader final : public ColumnReader {
     const Datum _fixed_value;
 };
 
-class ScalarColumnReader final : public ColumnReader {
+class RawColumnReader : public ColumnReader {
 public:
-    explicit ScalarColumnReader(const ParquetField* parquet_field, const tparquet::ColumnChunk* column_chunk_metadata,
-                                const TypeDescriptor* col_type, const ColumnReaderOptions& opts)
-            : ColumnReader(parquet_field), _opts(opts), _col_type(col_type), _chunk_metadata(column_chunk_metadata) {}
-    ~ScalarColumnReader() override = default;
+    explicit RawColumnReader(const ParquetField* parquet_field, const tparquet::ColumnChunk* column_chunk_metadata,
+                             const ColumnReaderOptions& opts)
+            : ColumnReader(parquet_field), _opts(opts), _chunk_metadata(column_chunk_metadata) {}
+    ~RawColumnReader() override = default;
 
     Status prepare() override {
-        RETURN_IF_ERROR(ColumnConverterFactory::create_converter(*get_column_parquet_field(), *_col_type,
-                                                                 _opts.timezone, &_converter));
         return StoredColumnReader::create(_opts, get_column_parquet_field(), get_chunk_metadata(), &_reader);
     }
 
-    Status read_range(const Range<uint64_t>& range, const Filter* filter, ColumnPtr& dst) override;
-
     void get_levels(level_t** def_levels, level_t** rep_levels, size_t* num_levels) override {
         _reader->get_levels(def_levels, rep_levels, num_levels);
     }
 
     void set_need_parse_levels(bool need_parse_levels) override { _reader->set_need_parse_levels(need_parse_levels); }
 
-    bool try_to_use_dict_filter(ExprContext* ctx, bool is_decode_needed, const SlotId slotId,
-                                const std::vector<std::string>& sub_field_path, const size_t& layer) override;
-
-    Status rewrite_conjunct_ctxs_to_predicate(bool* is_group_filtered, const std::vector<std::string>& sub_field_path,
-                                              const size_t& layer) override {
-        DCHECK_EQ(sub_field_path.size(), layer);
-        return _dict_filter_ctx->rewrite_conjunct_ctxs_to_predicate(_reader.get(), is_group_filtered);
-    }
-
-    void set_can_lazy_decode(bool can_lazy_decode) override {
-        _can_lazy_decode = can_lazy_decode && _col_type->is_string_type() && _column_all_pages_dict_encoded();
-    }
-
-    Status filter_dict_column(const ColumnPtr& column, Filter* filter, const std::vector<std::string>& sub_field_path,
-                              const size_t& layer) override {
-        DCHECK_EQ(sub_field_path.size(), layer);
-        return _dict_filter_ctx->predicate->evaluate_and(column.get(), filter->data());
-    }
-
-    Status fill_dst_column(ColumnPtr& dst, ColumnPtr& src) override;
-
     void collect_column_io_range(std::vector<io::SharedBufferedInputStream::IORange>* ranges, int64_t* end_offset,
                                  ColumnIOType type, bool active) override;
 
@@ -121,28 +96,81 @@ class ScalarColumnReader final : public ColumnReader {
 
     void select_offset_index(const SparseRange<uint64_t>& range, const uint64_t rg_first_row) override;
 
-    StatusOr<bool> row_group_zone_map_filter(const std::vector<const ColumnPredicate*>& predicates,
-                                             CompoundNodeType pred_relation, uint64_t rg_first_row,
-                                             uint64_t rg_num_rows) const override;
+    // Returns true if all of the data pages in the column chunk are dict encoded
+    bool column_all_pages_dict_encoded() const;
 
-    StatusOr<bool> page_index_zone_map_filter(const std::vector<const ColumnPredicate*>& predicates,
-                                              SparseRange<uint64_t>* row_ranges, CompoundNodeType pred_relation,
-                                              const uint64_t rg_first_row, const uint64_t rg_num_rows) override;
+protected:
+    StatusOr<bool> _row_group_zone_map_filter(const std::vector<const ColumnPredicate*>& predicates,
+                                              CompoundNodeType pred_relation, const TypeDescriptor& col_type,
+                                              uint64_t rg_first_row, uint64_t rg_num_rows) const;
 
-private:
-    // Returns true if all of the data pages in the column chunk are dict encoded
-    bool _column_all_pages_dict_encoded();
+    StatusOr<bool> _page_index_zone_map_filter(const std::vector<const ColumnPredicate*>& predicates,
+                                               SparseRange<uint64_t>* row_ranges, CompoundNodeType pred_relation,
+                                               const TypeDescriptor& col_type, const uint64_t rg_first_row,
+                                               const uint64_t rg_num_rows);
 
     const ColumnReaderOptions& _opts;
 
     std::unique_ptr<StoredColumnReader> _reader;
+    const tparquet::ColumnChunk* _chunk_metadata = nullptr;
+    std::unique_ptr<ColumnOffsetIndexCtx> _offset_index_ctx;
+};
+
+class ScalarColumnReader final : public RawColumnReader {
+public:
+    explicit ScalarColumnReader(const ParquetField* parquet_field, const tparquet::ColumnChunk* column_chunk_metadata,
+                                const TypeDescriptor* col_type, const ColumnReaderOptions& opts)
+            : RawColumnReader(parquet_field, column_chunk_metadata, opts), _col_type(col_type) {}
+    ~ScalarColumnReader() override = default;
+
+    Status prepare() override {
+        RETURN_IF_ERROR(ColumnConverterFactory::create_converter(*get_column_parquet_field(), *_col_type,
+                                                                 _opts.timezone, &_converter));
+        return RawColumnReader::prepare();
+    }
+
+    Status read_range(const Range<uint64_t>& range, const Filter* filter, ColumnPtr& dst) override;
+
+    bool try_to_use_dict_filter(ExprContext* ctx, bool is_decode_needed, const SlotId slotId,
+                                const std::vector<std::string>& sub_field_path, const size_t& layer) override;
+
+    Status rewrite_conjunct_ctxs_to_predicate(bool* is_group_filtered, const std::vector<std::string>& sub_field_path,
+                                              const size_t& layer) override {
+        DCHECK_EQ(sub_field_path.size(), layer);
+        return _dict_filter_ctx->rewrite_conjunct_ctxs_to_predicate(_reader.get(), is_group_filtered);
+    }
+
+    void set_can_lazy_decode(bool can_lazy_decode) override {
+        _can_lazy_decode = can_lazy_decode && _col_type->is_string_type() && column_all_pages_dict_encoded();
+    }
+
+    Status filter_dict_column(const ColumnPtr& column, Filter* filter, const std::vector<std::string>& sub_field_path,
+                              const size_t& layer) override {
+        DCHECK_EQ(sub_field_path.size(), layer);
+        return _dict_filter_ctx->predicate->evaluate_and(column.get(), filter->data());
+    }
+
+    Status fill_dst_column(ColumnPtr& dst, ColumnPtr& src) override;
+
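+    // Editor's note: the two zone-map overrides below are thin wrappers that
+    // bind this reader's TypeDescriptor (*_col_type) to the
+    // logical-type-independent helpers on RawColumnReader.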
+    StatusOr<bool> row_group_zone_map_filter(const std::vector<const ColumnPredicate*>& predicates,
+                                             CompoundNodeType pred_relation, const uint64_t rg_first_row,
+                                             const uint64_t rg_num_rows) const override {
+        return _row_group_zone_map_filter(predicates, pred_relation, *_col_type, rg_first_row, rg_num_rows);
+    }
+
+    StatusOr<bool> page_index_zone_map_filter(const std::vector<const ColumnPredicate*>& predicates,
+                                              SparseRange<uint64_t>* row_ranges, CompoundNodeType pred_relation,
+                                              const uint64_t rg_first_row, const uint64_t rg_num_rows) override {
+        return _page_index_zone_map_filter(predicates, row_ranges, pred_relation, *_col_type, rg_first_row,
+                                           rg_num_rows);
+    }
+
+private:
     std::unique_ptr<ColumnConverter> _converter;
 
     std::unique_ptr<ColumnDictFilterContext> _dict_filter_ctx;
     const TypeDescriptor* _col_type = nullptr;
-    const tparquet::ColumnChunk* _chunk_metadata = nullptr;
-    std::unique_ptr<ColumnOffsetIndexCtx> _offset_index_ctx;
 
     // _can_lazy_decode means string type and all page dict code
     bool _can_lazy_decode = false;