From e1c4f8f26a6562dfc1dcb2e9058ac62ef2163337 Mon Sep 17 00:00:00 2001
From: zombee0
Date: Fri, 3 Jan 2025 15:17:10 +0800
Subject: [PATCH] [Refactor] Abstract out the logical-type-independent
 RawColumnReader for future optimization

Signed-off-by: zombee0
---
 .../formats/parquet/scalar_column_reader.cpp  | 338 +++++++++---------
 be/src/formats/parquet/scalar_column_reader.h | 110 +++---
 2 files changed, 241 insertions(+), 207 deletions(-)

diff --git a/be/src/formats/parquet/scalar_column_reader.cpp b/be/src/formats/parquet/scalar_column_reader.cpp
index 47fb03677ed5b..cd3fa5089a671 100644
--- a/be/src/formats/parquet/scalar_column_reader.cpp
+++ b/be/src/formats/parquet/scalar_column_reader.cpp
@@ -24,6 +24,8 @@
 
 namespace starrocks::parquet {
 
+// FixedValueColumnReader
+
 StatusOr<bool> FixedValueColumnReader::row_group_zone_map_filter(const std::vector<const ColumnPredicate*>& predicates,
                                                                  CompoundNodeType pred_relation,
                                                                  const uint64_t rg_first_row,
@@ -45,153 +47,10 @@ StatusOr<bool> FixedValueColumnReader::page_index_zone_map_filter(const std::vec
     return !ZoneMapEvaluatorUtils::is_satisfy(predicates, zone_map, pred_relation);
 }
 
-Status ScalarColumnReader::read_range(const Range<uint64_t>& range, const Filter* filter, ColumnPtr& dst) {
-    DCHECK(get_column_parquet_field()->is_nullable ? dst->is_nullable() : true);
-    _need_lazy_decode =
-            _dict_filter_ctx != nullptr || (_can_lazy_decode && filter != nullptr &&
-                                            SIMD::count_nonzero(*filter) * 1.0 / filter->size() < FILTER_RATIO);
-    ColumnContentType content_type = !_need_lazy_decode ? ColumnContentType::VALUE : ColumnContentType::DICT_CODE;
-    if (_need_lazy_decode) {
-        if (_dict_code == nullptr) {
-            _dict_code = ColumnHelper::create_column(
-                    TypeDescriptor::from_logical_type(ColumnDictFilterContext::kDictCodePrimitiveType), true);
-        }
-        _ori_column = dst;
-        dst = _dict_code;
-        dst->reserve(range.span_size());
-    }
-    if (!_converter->need_convert) {
-        SCOPED_RAW_TIMER(&_opts.stats->column_read_ns);
-        return _reader->read_range(range, filter, content_type, dst.get());
-    } else {
-        auto column = _converter->create_src_column();
-        {
-            SCOPED_RAW_TIMER(&_opts.stats->column_read_ns);
-            RETURN_IF_ERROR(_reader->read_range(range, filter, content_type, column.get()));
-        }
-        SCOPED_RAW_TIMER(&_opts.stats->column_convert_ns);
-        return _converter->convert(column, dst.get());
-    }
-}
-
-bool ScalarColumnReader::try_to_use_dict_filter(ExprContext* ctx, bool is_decode_needed, const SlotId slotId,
-                                                const std::vector<std::string>& sub_field_path, const size_t& layer) {
-    if (sub_field_path.size() != layer) {
-        return false;
-    }
-
-    if (!_col_type->is_string_type()) {
-        return false;
-    }
-
-    if (_column_all_pages_dict_encoded()) {
-        if (_dict_filter_ctx == nullptr) {
-            _dict_filter_ctx = std::make_unique<ColumnDictFilterContext>();
-            _dict_filter_ctx->is_decode_needed = is_decode_needed;
-            _dict_filter_ctx->sub_field_path = sub_field_path;
-            _dict_filter_ctx->slot_id = slotId;
-        }
-        _dict_filter_ctx->conjunct_ctxs.push_back(ctx);
-        return true;
-    } else {
-        return false;
-    }
-}
-
-Status ScalarColumnReader::fill_dst_column(ColumnPtr& dst, ColumnPtr& src) {
-    if (!_need_lazy_decode) {
-        dst->swap_column(*src);
-    } else {
-        if (_dict_filter_ctx == nullptr || _dict_filter_ctx->is_decode_needed) {
-            ColumnPtr& dict_values = dst;
-            dict_values->reserve(src->size());
-
-            // decode dict code to dict values.
-            // note that in dict code, there could be null value.
-            const ColumnPtr& dict_codes = src;
-            auto* codes_nullable_column = ColumnHelper::as_raw_column<NullableColumn>(dict_codes);
-            auto* codes_column =
-                    ColumnHelper::as_raw_column<FixedLengthColumn<int32_t>>(codes_nullable_column->data_column());
-            RETURN_IF_ERROR(
-                    _reader->get_dict_values(codes_column->get_data(), *codes_nullable_column, dict_values.get()));
-            DCHECK_EQ(dict_codes->size(), dict_values->size());
-            if (dict_values->is_nullable()) {
-                auto* nullable_codes = down_cast<NullableColumn*>(dict_codes.get());
-                auto* nullable_values = down_cast<NullableColumn*>(dict_values.get());
-                nullable_values->null_column_data().swap(nullable_codes->null_column_data());
-                nullable_values->set_has_null(nullable_codes->has_null());
-            }
-        } else {
-            dst->append_default(src->size());
-        }
-
-        src->reset_column();
-        src = _ori_column;
-    }
-    return Status::OK();
-}
-
-bool ScalarColumnReader::_column_all_pages_dict_encoded() {
-    // The Parquet spec allows for column chunks to have mixed encodings
-    // where some data pages are dictionary-encoded and others are plain
-    // encoded. For example, a Parquet file writer might start writing
-    // a column chunk as dictionary encoded, but it will switch to plain
-    // encoding if the dictionary grows too large.
-    //
-    // In order for dictionary filters to skip the entire row group,
-    // the conjuncts must be evaluated on column chunks that are entirely
-    // encoded with the dictionary encoding. There are two checks
-    // available to verify this:
-    // 1. The encoding_stats field on the column chunk metadata provides
-    //    information about the number of data pages written in each
-    //    format. This allows for a specific check of whether all the
-    //    data pages are dictionary encoded.
-    // 2. The encodings field on the column chunk metadata lists the
-    //    encodings used. If this list contains the dictionary encoding
-    //    and does not include unexpected encodings (i.e. encodings not
-    //    associated with definition/repetition levels), then it is entirely
-    //    dictionary encoded.
-    const tparquet::ColumnMetaData& column_metadata = get_chunk_metadata()->meta_data;
-    if (column_metadata.__isset.encoding_stats) {
-        // Condition #1 above
-        for (const tparquet::PageEncodingStats& enc_stat : column_metadata.encoding_stats) {
-            if (enc_stat.page_type == tparquet::PageType::DATA_PAGE &&
-                (enc_stat.encoding != tparquet::Encoding::PLAIN_DICTIONARY &&
-                 enc_stat.encoding != tparquet::Encoding::RLE_DICTIONARY) &&
-                enc_stat.count > 0) {
-                return false;
-            }
-        }
-    } else {
-        // Condition #2 above
-        bool has_dict_encoding = false;
-        bool has_nondict_encoding = false;
-        for (const tparquet::Encoding::type& encoding : column_metadata.encodings) {
-            if (encoding == tparquet::Encoding::PLAIN_DICTIONARY || encoding == tparquet::Encoding::RLE_DICTIONARY) {
-                has_dict_encoding = true;
-            }
-
-            // RLE and BIT_PACKED are used for repetition/definition levels
-            if (encoding != tparquet::Encoding::PLAIN_DICTIONARY && encoding != tparquet::Encoding::RLE_DICTIONARY &&
-                encoding != tparquet::Encoding::RLE && encoding != tparquet::Encoding::BIT_PACKED) {
-                has_nondict_encoding = true;
-                break;
-            }
-        }
-        // Not entirely dictionary encoded if:
-        // 1. No dictionary encoding listed
-        // OR
-        // 2. Some non-dictionary encoding is listed
-        if (!has_dict_encoding || has_nondict_encoding) {
-            return false;
-        }
-    }
-
-    return true;
-}
+// RawColumnReader
 
-void ScalarColumnReader::collect_column_io_range(std::vector<io::SharedBufferedInputStream::IORange>* ranges,
-                                                 int64_t* end_offset, ColumnIOType type, bool active) {
+void RawColumnReader::collect_column_io_range(std::vector<io::SharedBufferedInputStream::IORange>* ranges,
+                                              int64_t* end_offset, ColumnIOType type, bool active) {
     const auto& column = *get_chunk_metadata();
     if (type == ColumnIOType::PAGES) {
         const tparquet::ColumnMetaData& column_metadata = column.meta_data;
@@ -231,7 +90,7 @@ void ScalarColumnReader::collect_column_io_range(std::vector<io::SharedBufferedI
     }
 }
 
-void ScalarColumnReader::select_offset_index(const SparseRange<uint64_t>& range, const uint64_t rg_first_row) {
+void RawColumnReader::select_offset_index(const SparseRange<uint64_t>& range, const uint64_t rg_first_row) {
     if (_offset_index_ctx == nullptr) {
         if (!get_chunk_metadata()->__isset.offset_index_offset) {
             return;
@@ -277,10 +136,69 @@ void ScalarColumnReader::select_offset_index(const SparseRange<uint64_t>& range
     _reader = std::make_unique<StoredColumnReaderWithIndex>(std::move(_reader), _offset_index_ctx.get(), has_dict_page);
 }
 
-StatusOr<bool> ScalarColumnReader::row_group_zone_map_filter(const std::vector<const ColumnPredicate*>& predicates,
-                                                             CompoundNodeType pred_relation,
-                                                             const uint64_t rg_first_row,
-                                                             const uint64_t rg_num_rows) const {
+bool RawColumnReader::column_all_pages_dict_encoded() const {
+    // The Parquet spec allows for column chunks to have mixed encodings
+    // where some data pages are dictionary-encoded and others are plain
+    // encoded. For example, a Parquet file writer might start writing
+    // a column chunk as dictionary encoded, but it will switch to plain
+    // encoding if the dictionary grows too large.
+    //
+    // In order for dictionary filters to skip the entire row group,
+    // the conjuncts must be evaluated on column chunks that are entirely
+    // encoded with the dictionary encoding. There are two checks
+    // available to verify this:
+    // 1. The encoding_stats field on the column chunk metadata provides
+    //    information about the number of data pages written in each
+    //    format. This allows for a specific check of whether all the
+    //    data pages are dictionary encoded.
+    // 2. The encodings field on the column chunk metadata lists the
+    //    encodings used. If this list contains the dictionary encoding
+    //    and does not include unexpected encodings (i.e. encodings not
+    //    associated with definition/repetition levels), then it is entirely
+    //    dictionary encoded.
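+    //
+    // Editor's note (illustrative example, not part of the original comment):
+    // a chunk written as one dictionary page followed by three
+    // dictionary-encoded data pages would carry encoding_stats such as
+    //     {page_type=DICTIONARY_PAGE, encoding=PLAIN_DICTIONARY, count=1}
+    //     {page_type=DATA_PAGE,       encoding=RLE_DICTIONARY,   count=3}
+    // and pass check #1; any DATA_PAGE entry with a non-dictionary encoding
+    // and count > 0 makes the check fail.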
+    const tparquet::ColumnMetaData& column_metadata = get_chunk_metadata()->meta_data;
+    if (column_metadata.__isset.encoding_stats) {
+        // Condition #1 above
+        for (const tparquet::PageEncodingStats& enc_stat : column_metadata.encoding_stats) {
+            if (enc_stat.page_type == tparquet::PageType::DATA_PAGE &&
+                (enc_stat.encoding != tparquet::Encoding::PLAIN_DICTIONARY &&
+                 enc_stat.encoding != tparquet::Encoding::RLE_DICTIONARY) &&
+                enc_stat.count > 0) {
+                return false;
+            }
+        }
+    } else {
+        // Condition #2 above
+        bool has_dict_encoding = false;
+        bool has_nondict_encoding = false;
+        for (const tparquet::Encoding::type& encoding : column_metadata.encodings) {
+            if (encoding == tparquet::Encoding::PLAIN_DICTIONARY || encoding == tparquet::Encoding::RLE_DICTIONARY) {
+                has_dict_encoding = true;
+            }
+
+            // RLE and BIT_PACKED are used for repetition/definition levels
+            if (encoding != tparquet::Encoding::PLAIN_DICTIONARY && encoding != tparquet::Encoding::RLE_DICTIONARY &&
+                encoding != tparquet::Encoding::RLE && encoding != tparquet::Encoding::BIT_PACKED) {
+                has_nondict_encoding = true;
+                break;
+            }
+        }
+        // Not entirely dictionary encoded if:
+        // 1. No dictionary encoding listed
+        // OR
+        // 2. Some non-dictionary encoding is listed
+        if (!has_dict_encoding || has_nondict_encoding) {
+            return false;
+        }
+    }
+
+    return true;
+}
+
+StatusOr<bool> RawColumnReader::_row_group_zone_map_filter(const std::vector<const ColumnPredicate*>& predicates,
+                                                           CompoundNodeType pred_relation,
+                                                           const TypeDescriptor& col_type, const uint64_t rg_first_row,
+                                                           const uint64_t rg_num_rows) const {
     if (!get_chunk_metadata()->meta_data.__isset.statistics || get_column_parquet_field() == nullptr) {
         // statistics is not existed, select all
         return true;
@@ -299,8 +217,8 @@ StatusOr<bool> ScalarColumnReader::row_group_zone_map_filter(const std::vector<c
     std::optional<ZoneMapDetail> zone_map_detail = std::nullopt;
 
     // used to hold min/max slice values
-    const ColumnPtr min_column = ColumnHelper::create_column(*_col_type, true);
-    const ColumnPtr max_column = ColumnHelper::create_column(*_col_type, true);
+    const ColumnPtr min_column = ColumnHelper::create_column(col_type, true);
+    const ColumnPtr max_column = ColumnHelper::create_column(col_type, true);
     if (is_all_null) {
         // if the entire column's value is null, the min/max value not existed
         zone_map_detail = ZoneMapDetail{Datum{}, Datum{}, true};
@@ -310,12 +228,12 @@ StatusOr<bool> ScalarColumnReader::row_group_zone_map_filter(const std::vector<c
         std::vector<std::string> max_values;
         std::vector<bool> null_pages{false};
         Status st =
-                StatisticsHelper::get_min_max_value(_opts.file_meta_data, *_col_type, &get_chunk_metadata()->meta_data,
+                StatisticsHelper::get_min_max_value(_opts.file_meta_data, col_type, &get_chunk_metadata()->meta_data,
                                                     get_column_parquet_field(), min_values, max_values);
         if (st.ok()) {
-            RETURN_IF_ERROR(StatisticsHelper::decode_value_into_column(min_column, min_values, null_pages, *_col_type,
+            RETURN_IF_ERROR(StatisticsHelper::decode_value_into_column(min_column, min_values, null_pages, col_type,
                                                                        get_column_parquet_field(), _opts.timezone));
-            RETURN_IF_ERROR(StatisticsHelper::decode_value_into_column(max_column, max_values, null_pages, *_col_type,
+            RETURN_IF_ERROR(StatisticsHelper::decode_value_into_column(max_column, max_values, null_pages, col_type,
                                                                        get_column_parquet_field(), _opts.timezone));
             zone_map_detail = ZoneMapDetail{min_column->get(0), max_column->get(0), has_null};
@@ -331,10 +249,11 @@ StatusOr<bool> ScalarColumnReader::row_group_zone_map_filter(const std::vector<c
     return !ZoneMapEvaluatorUtils::is_satisfy(predicates, zone_map_detail.value(), pred_relation);
 }
 
-StatusOr<bool> ScalarColumnReader::page_index_zone_map_filter(const std::vector<const ColumnPredicate*>& predicates,
-                                                              SparseRange<uint64_t>* row_ranges,
-                                                              CompoundNodeType pred_relation,
-                                                              const uint64_t rg_first_row, const uint64_t rg_num_rows) {
+StatusOr<bool> RawColumnReader::_page_index_zone_map_filter(const std::vector<const ColumnPredicate*>& predicates,
+                                                            SparseRange<uint64_t>* row_ranges,
+                                                            CompoundNodeType pred_relation,
+                                                            const TypeDescriptor& col_type, const uint64_t rg_first_row,
+                                                            const uint64_t rg_num_rows) {
     DCHECK(row_ranges->empty());
     const tparquet::ColumnChunk* chunk_meta = get_chunk_metadata();
     if (!chunk_meta->__isset.column_index_offset || !chunk_meta->__isset.offset_index_offset ||
@@ -360,22 +279,22 @@ StatusOr<bool> ScalarColumnReader::page_index_zone_map_filter(const std::vector<
     const size_t page_num = column_index.min_values.size();
     const std::vector<bool> null_pages = column_index.null_pages;
 
-    ColumnPtr min_column = ColumnHelper::create_column(*_col_type, true);
-    ColumnPtr max_column = ColumnHelper::create_column(*_col_type, true);
+    ColumnPtr min_column = ColumnHelper::create_column(col_type, true);
+    ColumnPtr max_column = ColumnHelper::create_column(col_type, true);
     // deal with min_values
-    auto st = StatisticsHelper::decode_value_into_column(min_column, column_index.min_values, null_pages, *_col_type,
+    auto st = StatisticsHelper::decode_value_into_column(min_column, column_index.min_values, null_pages, col_type,
                                                          get_column_parquet_field(), _opts.timezone);
     if (!st.ok()) {
         // swallow error status
-        LOG(INFO) << "Error when decode min/max statistics, type " << _col_type->debug_string();
+        LOG(INFO) << "Error when decode min/max statistics, type " << col_type.debug_string();
         return false;
     }
     // deal with max_values
-    st = StatisticsHelper::decode_value_into_column(max_column, column_index.max_values, null_pages, *_col_type,
+    st = StatisticsHelper::decode_value_into_column(max_column, column_index.max_values, null_pages, col_type,
                                                     get_column_parquet_field(), _opts.timezone);
     if (!st.ok()) {
         // swallow error status
-        LOG(INFO) << "Error when decode min/max statistics, type " << _col_type->debug_string();
+        LOG(INFO) << "Error when decode min/max statistics, type " << col_type.debug_string();
         return false;
     }
 
@@ -420,4 +339,91 @@ StatusOr<bool> ScalarColumnReader::page_index_zone_map_filter(const std::vector<
     return true;
 }
 
+// ScalarColumnReader
+
+Status ScalarColumnReader::read_range(const Range<uint64_t>& range, const Filter* filter, ColumnPtr& dst) {
+    DCHECK(get_column_parquet_field()->is_nullable ? dst->is_nullable() : true);
+    _need_lazy_decode =
+            _dict_filter_ctx != nullptr || (_can_lazy_decode && filter != nullptr &&
+                                            SIMD::count_nonzero(*filter) * 1.0 / filter->size() < FILTER_RATIO);
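+    // Editor's note (illustrative, FILTER_RATIO's value is not shown in this
+    // patch): lazy decode is chosen above either because a dict filter is
+    // attached, or because the filter is selective enough -- e.g. assuming
+    // FILTER_RATIO = 0.2, a filter keeping 50 of 1000 rows (5%) stays below the
+    // ratio, so dict codes are read here and only surviving rows are
+    // materialized later in fill_dst_column().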
+    ColumnContentType content_type = !_need_lazy_decode ? ColumnContentType::VALUE : ColumnContentType::DICT_CODE;
+    if (_need_lazy_decode) {
+        if (_dict_code == nullptr) {
+            _dict_code = ColumnHelper::create_column(
+                    TypeDescriptor::from_logical_type(ColumnDictFilterContext::kDictCodePrimitiveType), true);
+        }
+        _ori_column = dst;
+        dst = _dict_code;
+        dst->reserve(range.span_size());
+    }
+    if (!_converter->need_convert) {
+        SCOPED_RAW_TIMER(&_opts.stats->column_read_ns);
+        return _reader->read_range(range, filter, content_type, dst.get());
+    } else {
+        auto column = _converter->create_src_column();
+        {
+            SCOPED_RAW_TIMER(&_opts.stats->column_read_ns);
+            RETURN_IF_ERROR(_reader->read_range(range, filter, content_type, column.get()));
+        }
+        SCOPED_RAW_TIMER(&_opts.stats->column_convert_ns);
+        return _converter->convert(column, dst.get());
+    }
+}
+
+bool ScalarColumnReader::try_to_use_dict_filter(ExprContext* ctx, bool is_decode_needed, const SlotId slotId,
+                                                const std::vector<std::string>& sub_field_path, const size_t& layer) {
+    if (sub_field_path.size() != layer) {
+        return false;
+    }
+
+    if (!_col_type->is_string_type()) {
+        return false;
+    }
+
+    if (column_all_pages_dict_encoded()) {
+        if (_dict_filter_ctx == nullptr) {
+            _dict_filter_ctx = std::make_unique<ColumnDictFilterContext>();
+            _dict_filter_ctx->is_decode_needed = is_decode_needed;
+            _dict_filter_ctx->sub_field_path = sub_field_path;
+            _dict_filter_ctx->slot_id = slotId;
+        }
+        _dict_filter_ctx->conjunct_ctxs.push_back(ctx);
+        return true;
+    } else {
+        return false;
+    }
+}
+
+Status ScalarColumnReader::fill_dst_column(ColumnPtr& dst, ColumnPtr& src) {
+    if (!_need_lazy_decode) {
+        dst->swap_column(*src);
+    } else {
+        if (_dict_filter_ctx == nullptr || _dict_filter_ctx->is_decode_needed) {
+            ColumnPtr& dict_values = dst;
+            dict_values->reserve(src->size());
+
+            // decode dict code to dict values.
+            // note that in dict code, there could be null value.
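+            // Editor's note (illustrative values): with dict = ["ab", "cd"] and
+            // codes = [0, null, 1], get_dict_values() fills the values column
+            // row by row, and the null bitmap is carried over from the codes
+            // column by the swap below.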
+            const ColumnPtr& dict_codes = src;
+            auto* codes_nullable_column = ColumnHelper::as_raw_column<NullableColumn>(dict_codes);
+            auto* codes_column =
+                    ColumnHelper::as_raw_column<FixedLengthColumn<int32_t>>(codes_nullable_column->data_column());
+            RETURN_IF_ERROR(
+                    _reader->get_dict_values(codes_column->get_data(), *codes_nullable_column, dict_values.get()));
+            DCHECK_EQ(dict_codes->size(), dict_values->size());
+            if (dict_values->is_nullable()) {
+                auto* nullable_codes = down_cast<NullableColumn*>(dict_codes.get());
+                auto* nullable_values = down_cast<NullableColumn*>(dict_values.get());
+                nullable_values->null_column_data().swap(nullable_codes->null_column_data());
+                nullable_values->set_has_null(nullable_codes->has_null());
+            }
+        } else {
+            dst->append_default(src->size());
+        }
+
+        src->reset_column();
+        src = _ori_column;
+    }
+    return Status::OK();
+}
 } // namespace starrocks::parquet
\ No newline at end of file
diff --git a/be/src/formats/parquet/scalar_column_reader.h b/be/src/formats/parquet/scalar_column_reader.h
index 063a0036cd60a..773b341bb1de6 100644
--- a/be/src/formats/parquet/scalar_column_reader.h
+++ b/be/src/formats/parquet/scalar_column_reader.h
@@ -55,48 +55,23 @@ class FixedValueColumnReader final : public ColumnReader {
     const Datum _fixed_value;
 };
 
-class ScalarColumnReader final : public ColumnReader {
+class RawColumnReader : public ColumnReader {
 public:
-    explicit ScalarColumnReader(const ParquetField* parquet_field, const tparquet::ColumnChunk* column_chunk_metadata,
-                                const TypeDescriptor* col_type, const ColumnReaderOptions& opts)
-            : ColumnReader(parquet_field), _opts(opts), _col_type(col_type), _chunk_metadata(column_chunk_metadata) {}
-    ~ScalarColumnReader() override = default;
+    explicit RawColumnReader(const ParquetField* parquet_field, const tparquet::ColumnChunk* column_chunk_metadata,
+                             const ColumnReaderOptions& opts)
+            : ColumnReader(parquet_field), _opts(opts), _chunk_metadata(column_chunk_metadata) {}
+    ~RawColumnReader() override = default;
 
     Status prepare() override {
-        RETURN_IF_ERROR(ColumnConverterFactory::create_converter(*get_column_parquet_field(), *_col_type,
-                                                                 _opts.timezone, &_converter));
         return StoredColumnReader::create(_opts, get_column_parquet_field(), get_chunk_metadata(), &_reader);
     }
 
-    Status read_range(const Range<uint64_t>& range, const Filter* filter, ColumnPtr& dst) override;
-
     void get_levels(level_t** def_levels, level_t** rep_levels, size_t* num_levels) override {
         _reader->get_levels(def_levels, rep_levels, num_levels);
     }
 
     void set_need_parse_levels(bool need_parse_levels) override { _reader->set_need_parse_levels(need_parse_levels); }
 
-    bool try_to_use_dict_filter(ExprContext* ctx, bool is_decode_needed, const SlotId slotId,
-                                const std::vector<std::string>& sub_field_path, const size_t& layer) override;
-
-    Status rewrite_conjunct_ctxs_to_predicate(bool* is_group_filtered, const std::vector<std::string>& sub_field_path,
-                                              const size_t& layer) override {
-        DCHECK_EQ(sub_field_path.size(), layer);
-        return _dict_filter_ctx->rewrite_conjunct_ctxs_to_predicate(_reader.get(), is_group_filtered);
-    }
-
-    void set_can_lazy_decode(bool can_lazy_decode) override {
-        _can_lazy_decode = can_lazy_decode && _col_type->is_string_type() && _column_all_pages_dict_encoded();
-    }
-
-    Status filter_dict_column(const ColumnPtr& column, Filter* filter, const std::vector<std::string>& sub_field_path,
-                              const size_t& layer) override {
-        DCHECK_EQ(sub_field_path.size(), layer);
-        return _dict_filter_ctx->predicate->evaluate_and(column.get(), filter->data());
-    }
-
-    Status fill_dst_column(ColumnPtr& dst, ColumnPtr& src) override;
-
     void collect_column_io_range(std::vector<io::SharedBufferedInputStream::IORange>* ranges, int64_t* end_offset,
                                  ColumnIOType type, bool active) override;
 
@@ -121,28 +96,81 @@ class ScalarColumnReader final : public ColumnReader {
 
     void select_offset_index(const SparseRange<uint64_t>& range, const uint64_t rg_first_row) override;
 
-    StatusOr<bool> row_group_zone_map_filter(const std::vector<const ColumnPredicate*>& predicates,
-                                             CompoundNodeType pred_relation, uint64_t rg_first_row,
-                                             uint64_t rg_num_rows) const override;
+    // Returns true if all of the data pages in the column chunk are dict encoded
+    bool column_all_pages_dict_encoded() const;
 
-    StatusOr<bool> page_index_zone_map_filter(const std::vector<const ColumnPredicate*>& predicates,
-                                              SparseRange<uint64_t>* row_ranges, CompoundNodeType pred_relation,
-                                              const uint64_t rg_first_row, const uint64_t rg_num_rows) override;
+protected:
+    StatusOr<bool> _row_group_zone_map_filter(const std::vector<const ColumnPredicate*>& predicates,
+                                              CompoundNodeType pred_relation, const TypeDescriptor& col_type,
+                                              uint64_t rg_first_row, uint64_t rg_num_rows) const;
 
-private:
-    // Returns true if all of the data pages in the column chunk are dict encoded
-    bool _column_all_pages_dict_encoded();
+    StatusOr<bool> _page_index_zone_map_filter(const std::vector<const ColumnPredicate*>& predicates,
+                                               SparseRange<uint64_t>* row_ranges, CompoundNodeType pred_relation,
+                                               const TypeDescriptor& col_type, const uint64_t rg_first_row,
+                                               const uint64_t rg_num_rows);
 
     const ColumnReaderOptions& _opts;
 
     std::unique_ptr<StoredColumnReader> _reader;
+    const tparquet::ColumnChunk* _chunk_metadata = nullptr;
+    std::unique_ptr<ColumnOffsetIndexCtx> _offset_index_ctx;
+};
+
+class ScalarColumnReader final : public RawColumnReader {
+public:
+    explicit ScalarColumnReader(const ParquetField* parquet_field, const tparquet::ColumnChunk* column_chunk_metadata,
+                                const TypeDescriptor* col_type, const ColumnReaderOptions& opts)
+            : RawColumnReader(parquet_field, column_chunk_metadata, opts), _col_type(col_type) {}
+    ~ScalarColumnReader() override = default;
+
+    Status prepare() override {
+        RETURN_IF_ERROR(ColumnConverterFactory::create_converter(*get_column_parquet_field(), *_col_type,
+                                                                 _opts.timezone, &_converter));
+        return RawColumnReader::prepare();
+    }
+
+    Status read_range(const Range<uint64_t>& range, const Filter* filter, ColumnPtr& dst) override;
+
+    bool try_to_use_dict_filter(ExprContext* ctx, bool is_decode_needed, const SlotId slotId,
+                                const std::vector<std::string>& sub_field_path, const size_t& layer) override;
+
+    Status rewrite_conjunct_ctxs_to_predicate(bool* is_group_filtered, const std::vector<std::string>& sub_field_path,
+                                              const size_t& layer) override {
+        DCHECK_EQ(sub_field_path.size(), layer);
+        return _dict_filter_ctx->rewrite_conjunct_ctxs_to_predicate(_reader.get(), is_group_filtered);
+    }
+
+    void set_can_lazy_decode(bool can_lazy_decode) override {
+        _can_lazy_decode = can_lazy_decode && _col_type->is_string_type() && column_all_pages_dict_encoded();
+    }
+
+    Status filter_dict_column(const ColumnPtr& column, Filter* filter, const std::vector<std::string>& sub_field_path,
+                              const size_t& layer) override {
+        DCHECK_EQ(sub_field_path.size(), layer);
+        return _dict_filter_ctx->predicate->evaluate_and(column.get(), filter->data());
+    }
+
+    Status fill_dst_column(ColumnPtr& dst, ColumnPtr& src) override;
+
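+    // Editor's note: the two zone-map overrides below are thin wrappers that
+    // bind this reader's TypeDescriptor (*_col_type) to the
+    // logical-type-independent helpers on RawColumnReader.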
+    StatusOr<bool> row_group_zone_map_filter(const std::vector<const ColumnPredicate*>& predicates,
+                                             CompoundNodeType pred_relation, const uint64_t rg_first_row,
+                                             const uint64_t rg_num_rows) const override {
+        return _row_group_zone_map_filter(predicates, pred_relation, *_col_type, rg_first_row, rg_num_rows);
+    }
+
+    StatusOr<bool> page_index_zone_map_filter(const std::vector<const ColumnPredicate*>& predicates,
+                                              SparseRange<uint64_t>* row_ranges, CompoundNodeType pred_relation,
+                                              const uint64_t rg_first_row, const uint64_t rg_num_rows) override {
+        return _page_index_zone_map_filter(predicates, row_ranges, pred_relation, *_col_type, rg_first_row,
+                                           rg_num_rows);
+    }
+
+private:
     std::unique_ptr<ColumnConverter> _converter;
 
     std::unique_ptr<ColumnDictFilterContext> _dict_filter_ctx;
     const TypeDescriptor* _col_type = nullptr;
-    const tparquet::ColumnChunk* _chunk_metadata = nullptr;
-    std::unique_ptr<ColumnOffsetIndexCtx> _offset_index_ctx;
 
     // _can_lazy_decode means string type and all page dict code
     bool _can_lazy_decode = false;