From 0b268b28722d5f6c70a984424f9b23dfc8f981bc Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 24 Oct 2023 12:33:56 +0200 Subject: [PATCH 01/32] Bump actions/setup-node from 3 to 4 (#7915) Bumps [actions/setup-node](https://github.com/actions/setup-node) from 3 to 4. - [Release notes](https://github.com/actions/setup-node/releases) - [Commits](https://github.com/actions/setup-node/compare/v3...v4) --- updated-dependencies: - dependency-name: actions/setup-node dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/dev.yml | 2 +- .github/workflows/rust.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/dev.yml b/.github/workflows/dev.yml index 15fbbfca0f65..1f5088a1e6ce 100644 --- a/.github/workflows/dev.yml +++ b/.github/workflows/dev.yml @@ -41,7 +41,7 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - - uses: actions/setup-node@v3 + - uses: actions/setup-node@v4 with: node-version: "14" - name: Prettier check diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 49ce70a5c0bc..55f6cecf54aa 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -525,7 +525,7 @@ jobs: uses: ./.github/actions/setup-builder with: rust-version: stable - - uses: actions/setup-node@v3 + - uses: actions/setup-node@v4 with: node-version: "14" - name: Check if configs.md has been modified From 9e848bf0790aea2323b936e9298af8bcc7e05c85 Mon Sep 17 00:00:00 2001 From: Mustafa Akur <106137913+mustafasrepo@users.noreply.github.com> Date: Tue, 24 Oct 2023 13:58:01 +0300 Subject: [PATCH 02/32] Fix bug, first last reverse (#7914) --- .../physical-expr/src/aggregate/first_last.rs | 8 +++-- .../physical-expr/src/aggregate/utils.rs | 12 +++---- .../sqllogictest/test_files/groupby.slt | 32 +++++++++++++++++++ 3 files changed, 41 insertions(+), 11 deletions(-) diff --git a/datafusion/physical-expr/src/aggregate/first_last.rs b/datafusion/physical-expr/src/aggregate/first_last.rs index 6ae7b4895ad6..ce7a1daeec64 100644 --- a/datafusion/physical-expr/src/aggregate/first_last.rs +++ b/datafusion/physical-expr/src/aggregate/first_last.rs @@ -22,7 +22,9 @@ use std::sync::Arc; use crate::aggregate::utils::{down_cast_any_ref, ordering_fields}; use crate::expressions::format_state_name; -use crate::{AggregateExpr, LexOrdering, PhysicalExpr, PhysicalSortExpr}; +use crate::{ + reverse_order_bys, AggregateExpr, LexOrdering, PhysicalExpr, PhysicalSortExpr, +}; use arrow::array::ArrayRef; use arrow::compute; @@ -126,7 +128,7 @@ impl AggregateExpr for FirstValue { self.expr.clone(), name, self.input_data_type.clone(), - self.ordering_req.clone(), + reverse_order_bys(&self.ordering_req), self.order_by_data_types.clone(), ))) } @@ -350,7 +352,7 @@ impl AggregateExpr for LastValue { self.expr.clone(), name, self.input_data_type.clone(), - self.ordering_req.clone(), + reverse_order_bys(&self.ordering_req), self.order_by_data_types.clone(), ))) } diff --git a/datafusion/physical-expr/src/aggregate/utils.rs b/datafusion/physical-expr/src/aggregate/utils.rs index 420b26eb2d8e..da3a52713231 100644 --- a/datafusion/physical-expr/src/aggregate/utils.rs +++ b/datafusion/physical-expr/src/aggregate/utils.rs @@ -177,14 +177,10 @@ pub fn adjust_output_array( /// for [`AggregateExpr`] aggregation expressions and allows comparing the 
equality /// between the trait objects. pub fn down_cast_any_ref(any: &dyn Any) -> &dyn Any { - if any.is::>() { - any.downcast_ref::>() - .unwrap() - .as_any() - } else if any.is::>() { - any.downcast_ref::>() - .unwrap() - .as_any() + if let Some(obj) = any.downcast_ref::>() { + obj.as_any() + } else if let Some(obj) = any.downcast_ref::>() { + obj.as_any() } else { any } diff --git a/datafusion/sqllogictest/test_files/groupby.slt b/datafusion/sqllogictest/test_files/groupby.slt index bf93c6633bfc..5cb3ac2f8135 100644 --- a/datafusion/sqllogictest/test_files/groupby.slt +++ b/datafusion/sqllogictest/test_files/groupby.slt @@ -3613,6 +3613,38 @@ AggregateExec: mode=Final, gby=[], aggr=[FIRST_VALUE(foo.x)] ------RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 --------MemoryExec: partitions=1, partition_sizes=[1] +# Since both ordering requirements are satisfied, there shouldn't be +# any SortExec in the final plan. +query TT +EXPLAIN SELECT FIRST_VALUE(a ORDER BY a ASC) as first_a, + LAST_VALUE(c ORDER BY c DESC) as last_c +FROM multiple_ordered_table +GROUP BY d; +---- +logical_plan +Projection: FIRST_VALUE(multiple_ordered_table.a) ORDER BY [multiple_ordered_table.a ASC NULLS LAST] AS first_a, LAST_VALUE(multiple_ordered_table.c) ORDER BY [multiple_ordered_table.c DESC NULLS FIRST] AS last_c +--Aggregate: groupBy=[[multiple_ordered_table.d]], aggr=[[FIRST_VALUE(multiple_ordered_table.a) ORDER BY [multiple_ordered_table.a ASC NULLS LAST], LAST_VALUE(multiple_ordered_table.c) ORDER BY [multiple_ordered_table.c DESC NULLS FIRST]]] +----TableScan: multiple_ordered_table projection=[a, c, d] +physical_plan +ProjectionExec: expr=[FIRST_VALUE(multiple_ordered_table.a) ORDER BY [multiple_ordered_table.a ASC NULLS LAST]@1 as first_a, LAST_VALUE(multiple_ordered_table.c) ORDER BY [multiple_ordered_table.c DESC NULLS FIRST]@2 as last_c] +--AggregateExec: mode=FinalPartitioned, gby=[d@0 as d], aggr=[FIRST_VALUE(multiple_ordered_table.a), FIRST_VALUE(multiple_ordered_table.c)] +----CoalesceBatchesExec: target_batch_size=2 +------RepartitionExec: partitioning=Hash([d@0], 8), input_partitions=8 +--------AggregateExec: mode=Partial, gby=[d@2 as d], aggr=[FIRST_VALUE(multiple_ordered_table.a), FIRST_VALUE(multiple_ordered_table.c)] +----------RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 +------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, c, d], output_ordering=[a@0 ASC NULLS LAST], has_header=true + +query II rowsort +SELECT FIRST_VALUE(a ORDER BY a ASC) as first_a, + LAST_VALUE(c ORDER BY c DESC) as last_c +FROM multiple_ordered_table +GROUP BY d; +---- +0 0 +0 1 +0 15 +0 4 +0 9 query TT EXPLAIN SELECT c From ba50a8b178eece7e79b100d0b73bdc9d6d3ec6d5 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 24 Oct 2023 07:02:49 -0400 Subject: [PATCH 03/32] Minor: provide default implementation for ExecutionPlan::statistics (#7911) * Minor: provide default implementation for ExecutionPlan::statistics * fix: update statistics --- datafusion-examples/examples/custom_datasource.rs | 6 +----- datafusion/core/src/physical_planner.rs | 8 +------- datafusion/core/src/test_util/mod.rs | 6 +----- datafusion/physical-plan/src/analyze.rs | 7 +------ datafusion/physical-plan/src/explain.rs | 7 +------ datafusion/physical-plan/src/insert.rs | 5 ----- datafusion/physical-plan/src/lib.rs | 8 ++++++-- datafusion/physical-plan/src/streaming.rs | 6 +----- datafusion/physical-plan/src/test/exec.rs | 12 ------------ 
datafusion/physical-plan/src/unnest.rs | 6 +----- 10 files changed, 13 insertions(+), 58 deletions(-) diff --git a/datafusion-examples/examples/custom_datasource.rs b/datafusion-examples/examples/custom_datasource.rs index dd36665a9344..9f25a0b2fa47 100644 --- a/datafusion-examples/examples/custom_datasource.rs +++ b/datafusion-examples/examples/custom_datasource.rs @@ -32,7 +32,7 @@ use datafusion::physical_plan::expressions::PhysicalSortExpr; use datafusion::physical_plan::memory::MemoryStream; use datafusion::physical_plan::{ project_schema, DisplayAs, DisplayFormatType, ExecutionPlan, - SendableRecordBatchStream, Statistics, + SendableRecordBatchStream, }; use datafusion::prelude::*; use datafusion_expr::{Expr, LogicalPlanBuilder}; @@ -270,8 +270,4 @@ impl ExecutionPlan for CustomExec { None, )?)) } - - fn statistics(&self) -> Result { - Ok(Statistics::new_unknown(&self.schema())) - } } diff --git a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs index 5a1fdcaee509..419f62cff664 100644 --- a/datafusion/core/src/physical_planner.rs +++ b/datafusion/core/src/physical_planner.rs @@ -2057,9 +2057,7 @@ mod tests { use super::*; use crate::datasource::file_format::options::CsvReadOptions; use crate::datasource::MemTable; - use crate::physical_plan::{ - expressions, DisplayFormatType, Partitioning, Statistics, - }; + use crate::physical_plan::{expressions, DisplayFormatType, Partitioning}; use crate::physical_plan::{DisplayAs, SendableRecordBatchStream}; use crate::physical_planner::PhysicalPlanner; use crate::prelude::{SessionConfig, SessionContext}; @@ -2670,10 +2668,6 @@ mod tests { ) -> Result { unimplemented!("NoOpExecutionPlan::execute"); } - - fn statistics(&self) -> Result { - unimplemented!("NoOpExecutionPlan::statistics"); - } } // Produces an execution plan where the schema is mismatched from diff --git a/datafusion/core/src/test_util/mod.rs b/datafusion/core/src/test_util/mod.rs index d826ec8bfbb6..4fe022f1769d 100644 --- a/datafusion/core/src/test_util/mod.rs +++ b/datafusion/core/src/test_util/mod.rs @@ -40,7 +40,7 @@ use crate::prelude::{CsvReadOptions, SessionContext}; use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; use arrow::record_batch::RecordBatch; -use datafusion_common::{Statistics, TableReference}; +use datafusion_common::TableReference; use datafusion_expr::{CreateExternalTable, Expr, TableType}; use datafusion_physical_expr::PhysicalSortExpr; @@ -238,10 +238,6 @@ impl ExecutionPlan for UnboundedExec { batch: self.batch.clone(), })) } - - fn statistics(&self) -> Result { - Ok(Statistics::new_unknown(&self.schema())) - } } #[derive(Debug)] diff --git a/datafusion/physical-plan/src/analyze.rs b/datafusion/physical-plan/src/analyze.rs index bce242513559..ded37983bb21 100644 --- a/datafusion/physical-plan/src/analyze.rs +++ b/datafusion/physical-plan/src/analyze.rs @@ -25,7 +25,7 @@ use super::stream::{RecordBatchReceiverStream, RecordBatchStreamAdapter}; use super::{DisplayAs, Distribution, SendableRecordBatchStream}; use crate::display::DisplayableExecutionPlan; -use crate::{DisplayFormatType, ExecutionPlan, Partitioning, Statistics}; +use crate::{DisplayFormatType, ExecutionPlan, Partitioning}; use arrow::{array::StringBuilder, datatypes::SchemaRef, record_batch::RecordBatch}; use datafusion_common::{internal_err, DataFusionError, Result}; @@ -195,11 +195,6 @@ impl ExecutionPlan for AnalyzeExec { futures::stream::once(output), ))) } - - fn statistics(&self) -> Result { - // Statistics an an ANALYZE plan are not relevant - 
Ok(Statistics::new_unknown(&self.schema())) - } } /// Creates the ouput of AnalyzeExec as a RecordBatch diff --git a/datafusion/physical-plan/src/explain.rs b/datafusion/physical-plan/src/explain.rs index 81b8f9944110..e4904ddd3410 100644 --- a/datafusion/physical-plan/src/explain.rs +++ b/datafusion/physical-plan/src/explain.rs @@ -23,7 +23,7 @@ use std::sync::Arc; use super::expressions::PhysicalSortExpr; use super::{DisplayAs, SendableRecordBatchStream}; use crate::stream::RecordBatchStreamAdapter; -use crate::{DisplayFormatType, ExecutionPlan, Partitioning, Statistics}; +use crate::{DisplayFormatType, ExecutionPlan, Partitioning}; use arrow::{array::StringBuilder, datatypes::SchemaRef, record_batch::RecordBatch}; use datafusion_common::display::StringifiedPlan; @@ -167,11 +167,6 @@ impl ExecutionPlan for ExplainExec { futures::stream::iter(vec![Ok(record_batch)]), ))) } - - fn statistics(&self) -> Result { - // Statistics an EXPLAIN plan are not relevant - Ok(Statistics::new_unknown(&self.schema())) - } } /// If this plan should be shown, given the previous plan that was diff --git a/datafusion/physical-plan/src/insert.rs b/datafusion/physical-plan/src/insert.rs index d1f2706930d2..627d58e13781 100644 --- a/datafusion/physical-plan/src/insert.rs +++ b/datafusion/physical-plan/src/insert.rs @@ -25,7 +25,6 @@ use std::sync::Arc; use super::expressions::PhysicalSortExpr; use super::{ DisplayAs, DisplayFormatType, ExecutionPlan, Partitioning, SendableRecordBatchStream, - Statistics, }; use crate::metrics::MetricsSet; use crate::stream::RecordBatchStreamAdapter; @@ -276,10 +275,6 @@ impl ExecutionPlan for FileSinkExec { stream, ))) } - - fn statistics(&self) -> Result { - Ok(Statistics::new_unknown(&self.schema())) - } } /// Create a output record batch with a count diff --git a/datafusion/physical-plan/src/lib.rs b/datafusion/physical-plan/src/lib.rs index d7987ba95abf..b2f81579f8e8 100644 --- a/datafusion/physical-plan/src/lib.rs +++ b/datafusion/physical-plan/src/lib.rs @@ -231,8 +231,12 @@ pub trait ExecutionPlan: Debug + DisplayAs + Send + Sync { None } - /// Returns the global output statistics for this `ExecutionPlan` node. - fn statistics(&self) -> Result; + /// Returns statistics for this `ExecutionPlan` node. If statistics are not + /// available, should return [`Statistics::new_unknown`] (the default), not + /// an error. 
+ fn statistics(&self) -> Result { + Ok(Statistics::new_unknown(&self.schema())) + } } /// Indicate whether a data exchange is needed for the input of `plan`, which will be very helpful diff --git a/datafusion/physical-plan/src/streaming.rs b/datafusion/physical-plan/src/streaming.rs index 7bfa7e2ceefb..27f03b727c29 100644 --- a/datafusion/physical-plan/src/streaming.rs +++ b/datafusion/physical-plan/src/streaming.rs @@ -26,7 +26,7 @@ use crate::stream::RecordBatchStreamAdapter; use crate::{ExecutionPlan, Partitioning, SendableRecordBatchStream}; use arrow::datatypes::SchemaRef; -use datafusion_common::{internal_err, plan_err, DataFusionError, Result, Statistics}; +use datafusion_common::{internal_err, plan_err, DataFusionError, Result}; use datafusion_execution::TaskContext; use datafusion_physical_expr::{LexOrdering, PhysicalSortExpr}; @@ -187,8 +187,4 @@ impl ExecutionPlan for StreamingTableExec { None => stream, }) } - - fn statistics(&self) -> Result { - Ok(Statistics::new_unknown(&self.schema())) - } } diff --git a/datafusion/physical-plan/src/test/exec.rs b/datafusion/physical-plan/src/test/exec.rs index f90f4231c620..71e6cba6741e 100644 --- a/datafusion/physical-plan/src/test/exec.rs +++ b/datafusion/physical-plan/src/test/exec.rs @@ -453,10 +453,6 @@ impl ExecutionPlan for ErrorExec { ) -> Result { internal_err!("ErrorExec, unsurprisingly, errored in partition {partition}") } - - fn statistics(&self) -> Result { - Ok(Statistics::new_unknown(&self.schema())) - } } /// A mock execution plan that simply returns the provided statistics @@ -627,10 +623,6 @@ impl ExecutionPlan for BlockingExec { _refs: Arc::clone(&self.refs), })) } - - fn statistics(&self) -> Result { - unimplemented!() - } } /// A [`RecordBatchStream`] that is pending forever. @@ -764,10 +756,6 @@ impl ExecutionPlan for PanicExec { ready: false, })) } - - fn statistics(&self) -> Result { - unimplemented!() - } } /// A [`RecordBatchStream`] that yields every other batch and panics diff --git a/datafusion/physical-plan/src/unnest.rs b/datafusion/physical-plan/src/unnest.rs index ed64735e5ad6..30f109953cbb 100644 --- a/datafusion/physical-plan/src/unnest.rs +++ b/datafusion/physical-plan/src/unnest.rs @@ -25,7 +25,7 @@ use super::DisplayAs; use crate::{ expressions::Column, DisplayFormatType, Distribution, EquivalenceProperties, ExecutionPlan, Partitioning, PhysicalExpr, PhysicalSortExpr, RecordBatchStream, - SendableRecordBatchStream, Statistics, + SendableRecordBatchStream, }; use arrow::array::{ @@ -159,10 +159,6 @@ impl ExecutionPlan for UnnestExec { unnest_time: 0, })) } - - fn statistics(&self) -> Result { - Ok(Statistics::new_unknown(&self.schema())) - } } /// A stream that issues [RecordBatch]es with unnested column data. From ea5fadea091602ce745aacba7e6197fd6cfc45d9 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 24 Oct 2023 11:35:30 -0400 Subject: [PATCH 04/32] Update substrait requirement from 0.17.0 to 0.18.0 (#7916) Updates the requirements on [substrait](https://github.com/substrait-io/substrait-rs) to permit the latest version. - [Release notes](https://github.com/substrait-io/substrait-rs/releases) - [Changelog](https://github.com/substrait-io/substrait-rs/blob/main/CHANGELOG.md) - [Commits](https://github.com/substrait-io/substrait-rs/compare/v0.17.0...v0.18.0) --- updated-dependencies: - dependency-name: substrait dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- datafusion/substrait/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/substrait/Cargo.toml b/datafusion/substrait/Cargo.toml index 44a13cdcac37..7c4ff868cfcd 100644 --- a/datafusion/substrait/Cargo.toml +++ b/datafusion/substrait/Cargo.toml @@ -35,7 +35,7 @@ itertools = "0.11" object_store = "0.7.0" prost = "0.12" prost-types = "0.12" -substrait = "0.17.0" +substrait = "0.18.0" tokio = "1.17" [features] From 08c1b69a4ec9e5ddeaefd2810c0624ce9d9a3b9b Mon Sep 17 00:00:00 2001 From: Chih Wang Date: Tue, 24 Oct 2023 23:37:08 +0800 Subject: [PATCH 05/32] Remove unnecessary clone in datafusion_proto (#7921) --- datafusion/proto/src/logical_plan/mod.rs | 8 ++++---- datafusion/proto/src/physical_plan/mod.rs | 19 +++++++------------ 2 files changed, 11 insertions(+), 16 deletions(-) diff --git a/datafusion/proto/src/logical_plan/mod.rs b/datafusion/proto/src/logical_plan/mod.rs index f0999871f568..df76fbb81396 100644 --- a/datafusion/proto/src/logical_plan/mod.rs +++ b/datafusion/proto/src/logical_plan/mod.rs @@ -363,7 +363,7 @@ impl AsLogicalPlan for LogicalPlanNode { .collect::, _>>()?; let options = ListingOptions::new(file_format) - .with_file_extension(scan.file_extension.clone()) + .with_file_extension(&scan.file_extension) .with_table_partition_cols( scan.table_partition_cols .iter() @@ -458,7 +458,7 @@ impl AsLogicalPlan for LogicalPlanNode { let input: LogicalPlan = into_logical_plan!(repartition.input, ctx, extension_codec)?; use protobuf::repartition_node::PartitionMethod; - let pb_partition_method = repartition.partition_method.clone().ok_or_else(|| { + let pb_partition_method = repartition.partition_method.as_ref().ok_or_else(|| { DataFusionError::Internal(String::from( "Protobuf deserialization error, RepartitionNode was missing required field 'partition_method'", )) @@ -473,10 +473,10 @@ impl AsLogicalPlan for LogicalPlanNode { .iter() .map(|expr| from_proto::parse_expr(expr, ctx)) .collect::, _>>()?, - partition_count as usize, + *partition_count as usize, ), PartitionMethod::RoundRobin(partition_count) => { - Partitioning::RoundRobinBatch(partition_count as usize) + Partitioning::RoundRobinBatch(*partition_count as usize) } }; diff --git a/datafusion/proto/src/physical_plan/mod.rs b/datafusion/proto/src/physical_plan/mod.rs index 08010a3151ee..ef870d8ac20b 100644 --- a/datafusion/proto/src/physical_plan/mod.rs +++ b/datafusion/proto/src/physical_plan/mod.rs @@ -394,17 +394,12 @@ impl AsExecutionPlan for PhysicalPlanNode { vec![] }; - let input_schema = hash_agg - .input_schema - .as_ref() - .ok_or_else(|| { - DataFusionError::Internal( - "input_schema in AggregateNode is missing.".to_owned(), - ) - })? 
- .clone(); - let physical_schema: SchemaRef = - SchemaRef::new((&input_schema).try_into()?); + let input_schema = hash_agg.input_schema.as_ref().ok_or_else(|| { + DataFusionError::Internal( + "input_schema in AggregateNode is missing.".to_owned(), + ) + })?; + let physical_schema: SchemaRef = SchemaRef::new(input_schema.try_into()?); let physical_filter_expr = hash_agg .filter_expr @@ -489,7 +484,7 @@ impl AsExecutionPlan for PhysicalPlanNode { physical_filter_expr, physical_order_by_expr, input, - Arc::new((&input_schema).try_into()?), + Arc::new(input_schema.try_into()?), )?)) } PhysicalPlanType::HashJoin(hashjoin) => { From 10eabd6822821d537e1148d63e44919607d46b87 Mon Sep 17 00:00:00 2001 From: Mustafa Akur <106137913+mustafasrepo@users.noreply.github.com> Date: Tue, 24 Oct 2023 23:14:57 +0300 Subject: [PATCH 06/32] [MINOR]: Simplify code, change requirement from PhysicalSortExpr to PhysicalSortRequirement (#7913) * simplify code, change requirement from PhysicalSortExpr to PhysicalSortRequirement * Remove unnecessary result --- .../enforce_distribution.rs | 11 ++----- .../src/physical_optimizer/enforce_sorting.rs | 16 +++++----- .../replace_with_order_preserving_variants.rs | 18 +++++------ .../src/physical_optimizer/sort_pushdown.rs | 31 +++++++++---------- .../core/src/physical_optimizer/utils.rs | 15 +++++---- 5 files changed, 41 insertions(+), 50 deletions(-) diff --git a/datafusion/core/src/physical_optimizer/enforce_distribution.rs b/datafusion/core/src/physical_optimizer/enforce_distribution.rs index 89036e9f8ccc..9cd7eff4722b 100644 --- a/datafusion/core/src/physical_optimizer/enforce_distribution.rs +++ b/datafusion/core/src/physical_optimizer/enforce_distribution.rs @@ -55,9 +55,7 @@ use datafusion_physical_expr::expressions::{Column, NoOp}; use datafusion_physical_expr::utils::{ map_columns_before_projection, ordering_satisfy_requirement_concrete, }; -use datafusion_physical_expr::{ - expr_list_eq_strict_order, PhysicalExpr, PhysicalSortRequirement, -}; +use datafusion_physical_expr::{expr_list_eq_strict_order, PhysicalExpr}; use datafusion_physical_plan::unbounded_output; use datafusion_physical_plan::windows::{get_best_fitting_window, BoundedWindowAggExec}; @@ -1374,10 +1372,7 @@ fn ensure_distribution( // make sure ordering requirements are still satisfied after. 
if ordering_satisfied { // Make sure to satisfy ordering requirement: - let sort_expr = PhysicalSortRequirement::to_sort_exprs( - required_input_ordering.clone(), - ); - add_sort_above(&mut child, sort_expr, None)?; + add_sort_above(&mut child, required_input_ordering, None); } } // Stop tracking distribution changing operators @@ -1715,7 +1710,7 @@ mod tests { use datafusion_physical_expr::expressions::{BinaryExpr, Literal}; use datafusion_physical_expr::{ expressions, expressions::binary, expressions::lit, expressions::Column, - LexOrdering, PhysicalExpr, PhysicalSortExpr, + LexOrdering, PhysicalExpr, PhysicalSortExpr, PhysicalSortRequirement, }; /// Models operators like BoundedWindowExec that require an input diff --git a/datafusion/core/src/physical_optimizer/enforce_sorting.rs b/datafusion/core/src/physical_optimizer/enforce_sorting.rs index 92db3bbd053e..913dae07faa1 100644 --- a/datafusion/core/src/physical_optimizer/enforce_sorting.rs +++ b/datafusion/core/src/physical_optimizer/enforce_sorting.rs @@ -399,7 +399,11 @@ fn parallelize_sorts( let mut prev_layer = plan.clone(); update_child_to_remove_coalesce(&mut prev_layer, &mut coalesce_onwards[0])?; let (sort_exprs, fetch) = get_sort_exprs(&plan)?; - add_sort_above(&mut prev_layer, sort_exprs.to_vec(), fetch)?; + add_sort_above( + &mut prev_layer, + &PhysicalSortRequirement::from_sort_exprs(sort_exprs), + fetch, + ); let spm = SortPreservingMergeExec::new(sort_exprs.to_vec(), prev_layer) .with_fetch(fetch); return Ok(Transformed::Yes(PlanWithCorrespondingCoalescePartitions { @@ -456,9 +460,7 @@ fn ensure_sorting( ) { // Make sure we preserve the ordering requirements: update_child_to_remove_unnecessary_sort(child, sort_onwards, &plan)?; - let sort_expr = - PhysicalSortRequirement::to_sort_exprs(required_ordering); - add_sort_above(child, sort_expr, None)?; + add_sort_above(child, &required_ordering, None); if is_sort(child) { *sort_onwards = Some(ExecTree::new(child.clone(), idx, vec![])); } else { @@ -468,8 +470,7 @@ fn ensure_sorting( } (Some(required), None) => { // Ordering requirement is not met, we should add a `SortExec` to the plan. 
- let sort_expr = PhysicalSortRequirement::to_sort_exprs(required); - add_sort_above(child, sort_expr, None)?; + add_sort_above(child, &required, None); *sort_onwards = Some(ExecTree::new(child.clone(), idx, vec![])); } (None, Some(_)) => { @@ -603,9 +604,8 @@ fn analyze_window_sort_removal( .required_input_ordering() .swap_remove(0) .unwrap_or_default(); - let sort_expr = PhysicalSortRequirement::to_sort_exprs(reqs); // Satisfy the ordering requirement so that the window can run: - add_sort_above(&mut window_child, sort_expr, None)?; + add_sort_above(&mut window_child, &reqs, None); let uses_bounded_memory = window_expr.iter().all(|e| e.uses_bounded_memory()); let new_window = if uses_bounded_memory { diff --git a/datafusion/core/src/physical_optimizer/replace_with_order_preserving_variants.rs b/datafusion/core/src/physical_optimizer/replace_with_order_preserving_variants.rs index fb1a50e18d6f..fb75c083a70a 100644 --- a/datafusion/core/src/physical_optimizer/replace_with_order_preserving_variants.rs +++ b/datafusion/core/src/physical_optimizer/replace_with_order_preserving_variants.rs @@ -71,14 +71,15 @@ impl OrderPreservationContext { // ordering, (or that can maintain ordering with the replacement of // its variant) let plan = item.plan; + let children = plan.children(); let ordering_onwards = item.ordering_onwards; - if plan.children().is_empty() { + if children.is_empty() { // Plan has no children, there is nothing to propagate. None } else if ordering_onwards[0].is_none() && ((is_repartition(&plan) && !plan.maintains_input_order()[0]) || (is_coalesce_partitions(&plan) - && plan.children()[0].output_ordering().is_some())) + && children[0].output_ordering().is_some())) { Some(ExecTree::new(plan, idx, vec![])) } else { @@ -175,19 +176,18 @@ fn get_updated_plan( // When a `RepartitionExec` doesn't preserve ordering, replace it with // a `SortPreservingRepartitionExec` if appropriate: if is_repartition(&plan) && !plan.maintains_input_order()[0] && is_spr_better { - let child = plan.children()[0].clone(); - plan = Arc::new( - RepartitionExec::try_new(child, plan.output_partitioning())? 
- .with_preserve_order(true), - ) as _ + let child = plan.children().swap_remove(0); + let repartition = RepartitionExec::try_new(child, plan.output_partitioning())?; + plan = Arc::new(repartition.with_preserve_order(true)) as _ } // When the input of a `CoalescePartitionsExec` has an ordering, replace it // with a `SortPreservingMergeExec` if appropriate: + let mut children = plan.children(); if is_coalesce_partitions(&plan) - && plan.children()[0].output_ordering().is_some() + && children[0].output_ordering().is_some() && is_spm_better { - let child = plan.children()[0].clone(); + let child = children.swap_remove(0); plan = Arc::new(SortPreservingMergeExec::new( child.output_ordering().unwrap_or(&[]).to_vec(), child, diff --git a/datafusion/core/src/physical_optimizer/sort_pushdown.rs b/datafusion/core/src/physical_optimizer/sort_pushdown.rs index 9b81ad3efb50..808c4a3dadae 100644 --- a/datafusion/core/src/physical_optimizer/sort_pushdown.rs +++ b/datafusion/core/src/physical_optimizer/sort_pushdown.rs @@ -139,23 +139,21 @@ pub(crate) fn pushdown_sorts( || plan.ordering_equivalence_properties(), ) { // If the current plan is a SortExec, modify it to satisfy parent requirements: - let parent_required_expr = PhysicalSortRequirement::to_sort_exprs( - parent_required.ok_or_else(err)?.iter().cloned(), - ); + let parent_required_expr = parent_required.ok_or_else(err)?; new_plan = sort_exec.input().clone(); - add_sort_above(&mut new_plan, parent_required_expr, sort_exec.fetch())?; + add_sort_above(&mut new_plan, parent_required_expr, sort_exec.fetch()); }; let required_ordering = new_plan .output_ordering() .map(PhysicalSortRequirement::from_sort_exprs); // Since new_plan is a SortExec, we can safely get the 0th index. - let child = &new_plan.children()[0]; + let child = new_plan.children().swap_remove(0); if let Some(adjusted) = - pushdown_requirement_to_children(child, required_ordering.as_deref())? + pushdown_requirement_to_children(&child, required_ordering.as_deref())? { // Can push down requirements Ok(Transformed::Yes(SortPushDown { - plan: child.clone(), + plan: child, required_ordering: None, adjusted_request_ordering: adjusted, })) @@ -180,17 +178,15 @@ pub(crate) fn pushdown_sorts( // Can not satisfy the parent requirements, check whether the requirements can be pushed down: if let Some(adjusted) = pushdown_requirement_to_children(plan, parent_required)? 
{ Ok(Transformed::Yes(SortPushDown { - plan: plan.clone(), + plan: requirements.plan, required_ordering: None, adjusted_request_ordering: adjusted, })) } else { // Can not push down requirements, add new SortExec: - let parent_required_expr = PhysicalSortRequirement::to_sort_exprs( - parent_required.ok_or_else(err)?.iter().cloned(), - ); - let mut new_plan = plan.clone(); - add_sort_above(&mut new_plan, parent_required_expr, None)?; + let parent_required_expr = parent_required.ok_or_else(err)?; + let mut new_plan = requirements.plan; + add_sort_above(&mut new_plan, parent_required_expr, None); Ok(Transformed::Yes(SortPushDown::init(new_plan))) } } @@ -206,7 +202,7 @@ fn pushdown_requirement_to_children( if is_window(plan) { let required_input_ordering = plan.required_input_ordering(); let request_child = required_input_ordering[0].as_deref(); - let child_plan = plan.children()[0].clone(); + let child_plan = plan.children().swap_remove(0); match determine_children_requirement(parent_required, request_child, child_plan) { RequirementsCompatibility::Satisfy => { Ok(Some(vec![request_child.map(|r| r.to_vec())])) @@ -355,16 +351,17 @@ fn try_pushdown_requirements_to_join( || smj.ordering_equivalence_properties(), ) .then(|| { - let required_input_ordering = smj.required_input_ordering(); + let mut required_input_ordering = smj.required_input_ordering(); let new_req = Some(PhysicalSortRequirement::from_sort_exprs(&sort_expr)); match push_side { JoinSide::Left => { - vec![new_req, required_input_ordering[1].clone()] + required_input_ordering[0] = new_req; } JoinSide::Right => { - vec![required_input_ordering[0].clone(), new_req] + required_input_ordering[1] = new_req; } } + required_input_ordering })) } diff --git a/datafusion/core/src/physical_optimizer/utils.rs b/datafusion/core/src/physical_optimizer/utils.rs index 0d6c85f9f22b..403af4b16ec7 100644 --- a/datafusion/core/src/physical_optimizer/utils.rs +++ b/datafusion/core/src/physical_optimizer/utils.rs @@ -21,7 +21,6 @@ use std::fmt; use std::fmt::Formatter; use std::sync::Arc; -use crate::error::Result; use crate::physical_plan::coalesce_partitions::CoalescePartitionsExec; use crate::physical_plan::limit::{GlobalLimitExec, LocalLimitExec}; use crate::physical_plan::repartition::RepartitionExec; @@ -31,8 +30,8 @@ use crate::physical_plan::union::UnionExec; use crate::physical_plan::windows::{BoundedWindowAggExec, WindowAggExec}; use crate::physical_plan::{displayable, ExecutionPlan}; -use datafusion_physical_expr::utils::ordering_satisfy; -use datafusion_physical_expr::PhysicalSortExpr; +use datafusion_physical_expr::utils::ordering_satisfy_requirement; +use datafusion_physical_expr::PhysicalSortRequirement; /// This object implements a tree that we use while keeping track of paths /// leading to [`SortExec`]s. @@ -101,16 +100,17 @@ pub(crate) fn get_children_exectrees( /// given ordering requirements while preserving the original partitioning. pub fn add_sort_above( node: &mut Arc, - sort_expr: Vec, + sort_requirement: &[PhysicalSortRequirement], fetch: Option, -) -> Result<()> { +) { // If the ordering requirement is already satisfied, do not add a sort. 
- if !ordering_satisfy( + if !ordering_satisfy_requirement( node.output_ordering(), - Some(&sort_expr), + Some(sort_requirement), || node.equivalence_properties(), || node.ordering_equivalence_properties(), ) { + let sort_expr = PhysicalSortRequirement::to_sort_exprs(sort_requirement.to_vec()); let new_sort = SortExec::new(sort_expr, node.clone()).with_fetch(fetch); *node = Arc::new(if node.output_partitioning().partition_count() > 1 { @@ -119,7 +119,6 @@ pub fn add_sort_above( new_sort }) as _ } - Ok(()) } /// Checks whether the given operator is a limit; From 8068d7f0779bc9d5fddff35de33df8779f67cd0c Mon Sep 17 00:00:00 2001 From: Mustafa Akur <106137913+mustafasrepo@users.noreply.github.com> Date: Tue, 24 Oct 2023 23:42:33 +0300 Subject: [PATCH 07/32] [Minor] Move combine_join util to under equivalence.rs (#7917) * Move combine join to equivalences.rs file * Resolve linter errors * Simplifications, do not return Error in add_offset --- .../src/physical_optimizer/sort_pushdown.rs | 2 +- datafusion/physical-expr/src/equivalence.rs | 338 +++++++++++++++++- datafusion/physical-expr/src/lib.rs | 8 +- .../physical-plan/src/joins/cross_join.rs | 4 +- .../physical-plan/src/joins/hash_join.rs | 19 +- .../src/joins/nested_loop_join.rs | 10 +- .../src/joins/sort_merge_join.rs | 11 +- .../src/joins/symmetric_hash_join.rs | 5 +- datafusion/physical-plan/src/joins/utils.rs | 338 +----------------- 9 files changed, 368 insertions(+), 367 deletions(-) diff --git a/datafusion/core/src/physical_optimizer/sort_pushdown.rs b/datafusion/core/src/physical_optimizer/sort_pushdown.rs index 808c4a3dadae..a99399592f15 100644 --- a/datafusion/core/src/physical_optimizer/sort_pushdown.rs +++ b/datafusion/core/src/physical_optimizer/sort_pushdown.rs @@ -343,7 +343,7 @@ fn try_pushdown_requirements_to_join( smj.left().schema().fields.len(), &smj.maintains_input_order(), Some(SortMergeJoinExec::probe_side(&smj.join_type())), - )?; + ); Ok(ordering_satisfy_requirement( new_output_ordering.as_deref(), parent_required, diff --git a/datafusion/physical-expr/src/equivalence.rs b/datafusion/physical-expr/src/equivalence.rs index 369c139aa30b..4fce6854138d 100644 --- a/datafusion/physical-expr/src/equivalence.rs +++ b/datafusion/physical-expr/src/equivalence.rs @@ -26,7 +26,7 @@ use arrow::datatypes::SchemaRef; use arrow_schema::Fields; use datafusion_common::tree_node::{Transformed, TreeNode}; -use datafusion_common::Result; +use datafusion_common::{JoinSide, JoinType}; use itertools::izip; use std::collections::{HashMap, HashSet}; use std::hash::Hash; @@ -463,14 +463,14 @@ impl OrderingEquivalentClass { } /// Adds `offset` value to the index of each expression inside `self.head` and `self.others`. - pub fn add_offset(&self, offset: usize) -> Result { - let head = add_offset_to_lex_ordering(self.head(), offset)?; + pub fn add_offset(&self, offset: usize) -> OrderingEquivalentClass { + let head = add_offset_to_lex_ordering(self.head(), offset); let others = self .others() .iter() .map(|ordering| add_offset_to_lex_ordering(ordering, offset)) - .collect::>>()?; - Ok(OrderingEquivalentClass::new(head, others)) + .collect::>(); + OrderingEquivalentClass::new(head, others) } /// This function normalizes `OrderingEquivalenceProperties` according to `eq_properties`. @@ -885,6 +885,239 @@ fn req_satisfied(given: LexOrderingRef, req: &[PhysicalSortRequirement]) -> bool true } +/// Combine equivalence properties of the given join inputs. 
+pub fn combine_join_equivalence_properties( + join_type: JoinType, + left_properties: EquivalenceProperties, + right_properties: EquivalenceProperties, + left_columns_len: usize, + on: &[(Column, Column)], + schema: SchemaRef, +) -> EquivalenceProperties { + let mut new_properties = EquivalenceProperties::new(schema); + match join_type { + JoinType::Inner | JoinType::Left | JoinType::Full | JoinType::Right => { + new_properties.extend(left_properties.classes().to_vec()); + let new_right_properties = right_properties + .classes() + .iter() + .map(|prop| { + let new_head = Column::new( + prop.head().name(), + left_columns_len + prop.head().index(), + ); + let new_others = prop + .others() + .iter() + .map(|col| { + Column::new(col.name(), left_columns_len + col.index()) + }) + .collect::>(); + EquivalentClass::new(new_head, new_others) + }) + .collect::>(); + + new_properties.extend(new_right_properties); + } + JoinType::LeftSemi | JoinType::LeftAnti => { + new_properties.extend(left_properties.classes().to_vec()) + } + JoinType::RightSemi | JoinType::RightAnti => { + new_properties.extend(right_properties.classes().to_vec()) + } + } + + if join_type == JoinType::Inner { + on.iter().for_each(|(column1, column2)| { + let new_column2 = + Column::new(column2.name(), left_columns_len + column2.index()); + new_properties.add_equal_conditions((column1, &new_column2)) + }) + } + new_properties +} + +/// Calculate equivalence properties for the given cross join operation. +pub fn cross_join_equivalence_properties( + left_properties: EquivalenceProperties, + right_properties: EquivalenceProperties, + left_columns_len: usize, + schema: SchemaRef, +) -> EquivalenceProperties { + let mut new_properties = EquivalenceProperties::new(schema); + new_properties.extend(left_properties.classes().to_vec()); + let new_right_properties = right_properties + .classes() + .iter() + .map(|prop| { + let new_head = + Column::new(prop.head().name(), left_columns_len + prop.head().index()); + let new_others = prop + .others() + .iter() + .map(|col| Column::new(col.name(), left_columns_len + col.index())) + .collect::>(); + EquivalentClass::new(new_head, new_others) + }) + .collect::>(); + new_properties.extend(new_right_properties); + new_properties +} + +/// Update right table ordering equivalences so that: +/// - They point to valid indices at the output of the join schema, and +/// - They are normalized with respect to equivalence columns. +/// +/// To do so, we increment column indices by the size of the left table when +/// join schema consists of a combination of left and right schema (Inner, +/// Left, Full, Right joins). Then, we normalize the sort expressions of +/// ordering equivalences one by one. We make sure that each expression in the +/// ordering equivalence is either: +/// - The head of the one of the equivalent classes, or +/// - Doesn't have an equivalent column. +/// +/// This way; once we normalize an expression according to equivalence properties, +/// it can thereafter safely be used for ordering equivalence normalization. +fn get_updated_right_ordering_equivalent_class( + join_type: &JoinType, + right_oeq_class: &OrderingEquivalentClass, + left_columns_len: usize, + join_eq_properties: &EquivalenceProperties, +) -> OrderingEquivalentClass { + match join_type { + // In these modes, indices of the right schema should be offset by + // the left table size. 
+ JoinType::Inner | JoinType::Left | JoinType::Full | JoinType::Right => { + let right_oeq_class = right_oeq_class.add_offset(left_columns_len); + return right_oeq_class + .normalize_with_equivalence_properties(join_eq_properties); + } + _ => {} + }; + right_oeq_class.normalize_with_equivalence_properties(join_eq_properties) +} + +/// Calculate ordering equivalence properties for the given join operation. +pub fn combine_join_ordering_equivalence_properties( + join_type: &JoinType, + left_oeq_properties: &OrderingEquivalenceProperties, + right_oeq_properties: &OrderingEquivalenceProperties, + schema: SchemaRef, + maintains_input_order: &[bool], + probe_side: Option, + join_eq_properties: EquivalenceProperties, +) -> OrderingEquivalenceProperties { + let mut new_properties = OrderingEquivalenceProperties::new(schema); + let left_columns_len = left_oeq_properties.schema().fields().len(); + // All joins have 2 children + assert_eq!(maintains_input_order.len(), 2); + let left_maintains = maintains_input_order[0]; + let right_maintains = maintains_input_order[1]; + match (left_maintains, right_maintains) { + (true, true) => { + unreachable!("Cannot maintain ordering of both sides"); + } + (true, false) => { + // In this special case, right side ordering can be prefixed with left side ordering. + if let ( + Some(JoinSide::Left), + JoinType::Inner, + Some(left_oeq_class), + Some(right_oeq_class), + ) = ( + probe_side, + join_type, + left_oeq_properties.oeq_class(), + right_oeq_properties.oeq_class(), + ) { + let updated_right_oeq = get_updated_right_ordering_equivalent_class( + join_type, + right_oeq_class, + left_columns_len, + &join_eq_properties, + ); + + // Right side ordering equivalence properties should be prepended with + // those of the left side while constructing output ordering equivalence + // properties since stream side is the left side. + // + // If the right table ordering equivalences contain `b ASC`, and the output + // ordering of the left table is `a ASC`, then the ordering equivalence `b ASC` + // for the right table should be converted to `a ASC, b ASC` before it is added + // to the ordering equivalences of the join. + let mut orderings = vec![]; + for left_ordering in left_oeq_class.iter() { + for right_ordering in updated_right_oeq.iter() { + let mut ordering = left_ordering.to_vec(); + ordering.extend(right_ordering.to_vec()); + let ordering_normalized = + join_eq_properties.normalize_sort_exprs(&ordering); + orderings.push(ordering_normalized); + } + } + if !orderings.is_empty() { + let head = orderings.swap_remove(0); + let new_oeq_class = OrderingEquivalentClass::new(head, orderings); + new_properties.extend(Some(new_oeq_class)); + } + } else { + new_properties.extend(left_oeq_properties.oeq_class().cloned()); + } + } + (false, true) => { + let updated_right_oeq = + right_oeq_properties.oeq_class().map(|right_oeq_class| { + get_updated_right_ordering_equivalent_class( + join_type, + right_oeq_class, + left_columns_len, + &join_eq_properties, + ) + }); + // In this special case, left side ordering can be prefixed with right side ordering. + if let ( + Some(JoinSide::Right), + JoinType::Inner, + Some(left_oeq_class), + Some(right_oeg_class), + ) = ( + probe_side, + join_type, + left_oeq_properties.oeq_class(), + &updated_right_oeq, + ) { + // Left side ordering equivalence properties should be prepended with + // those of the right side while constructing output ordering equivalence + // properties since stream side is the right side. 
+ // + // If the right table ordering equivalences contain `b ASC`, and the output + // ordering of the left table is `a ASC`, then the ordering equivalence `b ASC` + // for the right table should be converted to `a ASC, b ASC` before it is added + // to the ordering equivalences of the join. + let mut orderings = vec![]; + for right_ordering in right_oeg_class.iter() { + for left_ordering in left_oeq_class.iter() { + let mut ordering = right_ordering.to_vec(); + ordering.extend(left_ordering.to_vec()); + let ordering_normalized = + join_eq_properties.normalize_sort_exprs(&ordering); + orderings.push(ordering_normalized); + } + } + if !orderings.is_empty() { + let head = orderings.swap_remove(0); + let new_oeq_class = OrderingEquivalentClass::new(head, orderings); + new_properties.extend(Some(new_oeq_class)); + } + } else { + new_properties.extend(updated_right_oeq); + } + } + (false, false) => {} + } + new_properties +} + /// This function searches for the slice `section` inside the slice `given`. /// It returns each range where `section` is compatible with the corresponding /// slice in `given`. @@ -935,10 +1168,10 @@ fn prune_sort_reqs_with_constants( /// Adds the `offset` value to `Column` indices inside `expr`. This function is /// generally used during the update of the right table schema in join operations. -pub(crate) fn add_offset_to_expr( +pub fn add_offset_to_expr( expr: Arc, offset: usize, -) -> Result> { +) -> Arc { expr.transform_down(&|e| match e.as_any().downcast_ref::() { Some(col) => Ok(Transformed::Yes(Arc::new(Column::new( col.name(), @@ -946,17 +1179,20 @@ pub(crate) fn add_offset_to_expr( )))), None => Ok(Transformed::No(e)), }) + .unwrap() + // Note that we can safely unwrap here since our transform always returns + // an `Ok` value. } /// Adds the `offset` value to `Column` indices inside `sort_expr.expr`. 
pub(crate) fn add_offset_to_sort_expr( sort_expr: &PhysicalSortExpr, offset: usize, -) -> Result { - Ok(PhysicalSortExpr { - expr: add_offset_to_expr(sort_expr.expr.clone(), offset)?, +) -> PhysicalSortExpr { + PhysicalSortExpr { + expr: add_offset_to_expr(sort_expr.expr.clone(), offset), options: sort_expr.options, - }) + } } /// Adds the `offset` value to `Column` indices for each `sort_expr.expr` @@ -964,7 +1200,7 @@ pub(crate) fn add_offset_to_sort_expr( pub fn add_offset_to_lex_ordering( sort_exprs: LexOrderingRef, offset: usize, -) -> Result { +) -> LexOrdering { sort_exprs .iter() .map(|sort_expr| add_offset_to_sort_expr(sort_expr, offset)) @@ -1131,4 +1367,82 @@ mod tests { } Ok(()) } + + #[test] + fn test_get_updated_right_ordering_equivalence_properties() -> Result<()> { + let join_type = JoinType::Inner; + + let options = SortOptions::default(); + let right_oeq_class = OrderingEquivalentClass::new( + vec![ + PhysicalSortExpr { + expr: Arc::new(Column::new("x", 0)), + options, + }, + PhysicalSortExpr { + expr: Arc::new(Column::new("y", 1)), + options, + }, + ], + vec![vec![ + PhysicalSortExpr { + expr: Arc::new(Column::new("z", 2)), + options, + }, + PhysicalSortExpr { + expr: Arc::new(Column::new("w", 3)), + options, + }, + ]], + ); + + let left_columns_len = 4; + + let fields: Fields = ["a", "b", "c", "d", "x", "y", "z", "w"] + .into_iter() + .map(|name| Field::new(name, DataType::Int32, true)) + .collect(); + + let mut join_eq_properties = + EquivalenceProperties::new(Arc::new(Schema::new(fields))); + join_eq_properties + .add_equal_conditions((&Column::new("a", 0), &Column::new("x", 4))); + join_eq_properties + .add_equal_conditions((&Column::new("d", 3), &Column::new("w", 7))); + + let result = get_updated_right_ordering_equivalent_class( + &join_type, + &right_oeq_class, + left_columns_len, + &join_eq_properties, + ); + + let expected = OrderingEquivalentClass::new( + vec![ + PhysicalSortExpr { + expr: Arc::new(Column::new("a", 0)), + options, + }, + PhysicalSortExpr { + expr: Arc::new(Column::new("y", 5)), + options, + }, + ], + vec![vec![ + PhysicalSortExpr { + expr: Arc::new(Column::new("z", 6)), + options, + }, + PhysicalSortExpr { + expr: Arc::new(Column::new("d", 3)), + options, + }, + ]], + ); + + assert_eq!(result.head(), expected.head()); + assert_eq!(result.others(), expected.others()); + + Ok(()) + } } diff --git a/datafusion/physical-expr/src/lib.rs b/datafusion/physical-expr/src/lib.rs index e670380e59d2..977542bd8e66 100644 --- a/datafusion/physical-expr/src/lib.rs +++ b/datafusion/physical-expr/src/lib.rs @@ -54,10 +54,10 @@ pub use aggregate::groups_accumulator::{ pub use aggregate::AggregateExpr; pub use analysis::{analyze, AnalysisContext, ExprBoundaries}; pub use equivalence::{ - add_offset_to_lex_ordering, ordering_equivalence_properties_helper, - project_equivalence_properties, project_ordering_equivalence_properties, - EquivalenceProperties, EquivalentClass, OrderingEquivalenceProperties, - OrderingEquivalentClass, + add_offset_to_expr, add_offset_to_lex_ordering, + ordering_equivalence_properties_helper, project_equivalence_properties, + project_ordering_equivalence_properties, EquivalenceProperties, EquivalentClass, + OrderingEquivalenceProperties, OrderingEquivalentClass, }; pub use partitioning::{Distribution, Partitioning}; diff --git a/datafusion/physical-plan/src/joins/cross_join.rs b/datafusion/physical-plan/src/joins/cross_join.rs index 1ffd9ad1c18a..c9c92e8c11ae 100644 --- a/datafusion/physical-plan/src/joins/cross_join.rs +++ 
b/datafusion/physical-plan/src/joins/cross_join.rs @@ -21,8 +21,7 @@ use std::{any::Any, sync::Arc, task::Poll}; use super::utils::{ - adjust_right_output_partitioning, cross_join_equivalence_properties, - BuildProbeJoinMetrics, OnceAsync, OnceFut, + adjust_right_output_partitioning, BuildProbeJoinMetrics, OnceAsync, OnceFut, }; use crate::metrics::{ExecutionPlanMetricsSet, MetricsSet}; use crate::DisplayAs; @@ -41,6 +40,7 @@ use datafusion_execution::memory_pool::{MemoryConsumer, MemoryReservation}; use datafusion_execution::TaskContext; use async_trait::async_trait; +use datafusion_physical_expr::equivalence::cross_join_equivalence_properties; use futures::{ready, StreamExt}; use futures::{Stream, TryStreamExt}; diff --git a/datafusion/physical-plan/src/joins/hash_join.rs b/datafusion/physical-plan/src/joins/hash_join.rs index 2ffa1f61a23b..9aa776fe054c 100644 --- a/datafusion/physical-plan/src/joins/hash_join.rs +++ b/datafusion/physical-plan/src/joins/hash_join.rs @@ -26,8 +26,8 @@ use std::{any::Any, usize, vec}; use crate::joins::utils::{ adjust_indices_by_join_type, apply_join_filter_to_indices, build_batch_from_indices, - calculate_join_output_ordering, combine_join_ordering_equivalence_properties, - get_final_indices_from_bit_map, need_produce_result_in_final, + calculate_join_output_ordering, get_final_indices_from_bit_map, + need_produce_result_in_final, }; use crate::DisplayAs; use crate::{ @@ -39,9 +39,8 @@ use crate::{ joins::hash_join_utils::{JoinHashMap, JoinHashMapType}, joins::utils::{ adjust_right_output_partitioning, build_join_schema, check_join_is_valid, - combine_join_equivalence_properties, estimate_join_statistics, - partitioned_join_output_partitioning, BuildProbeJoinMetrics, ColumnIndex, - JoinFilter, JoinOn, + estimate_join_statistics, partitioned_join_output_partitioning, + BuildProbeJoinMetrics, ColumnIndex, JoinFilter, JoinOn, }, metrics::{ExecutionPlanMetricsSet, MetricsSet}, DisplayFormatType, Distribution, EquivalenceProperties, ExecutionPlan, Partitioning, @@ -72,6 +71,9 @@ use datafusion_physical_expr::OrderingEquivalenceProperties; use ahash::RandomState; use arrow::compute::kernels::cmp::{eq, not_distinct}; +use datafusion_physical_expr::equivalence::{ + combine_join_equivalence_properties, combine_join_ordering_equivalence_properties, +}; use futures::{ready, Stream, StreamExt, TryStreamExt}; type JoinLeftData = (JoinHashMap, RecordBatch, MemoryReservation); @@ -146,7 +148,7 @@ impl HashJoinExec { left_schema.fields.len(), &Self::maintains_input_order(*join_type), Some(Self::probe_side()), - )?; + ); Ok(HashJoinExec { left, @@ -380,14 +382,13 @@ impl ExecutionPlan for HashJoinExec { fn ordering_equivalence_properties(&self) -> OrderingEquivalenceProperties { combine_join_ordering_equivalence_properties( &self.join_type, - &self.left, - &self.right, + &self.left.ordering_equivalence_properties(), + &self.right.ordering_equivalence_properties(), self.schema(), &self.maintains_input_order(), Some(Self::probe_side()), self.equivalence_properties(), ) - .unwrap() } fn children(&self) -> Vec> { diff --git a/datafusion/physical-plan/src/joins/nested_loop_join.rs b/datafusion/physical-plan/src/joins/nested_loop_join.rs index 25cb374e941f..a113066e39d1 100644 --- a/datafusion/physical-plan/src/joins/nested_loop_join.rs +++ b/datafusion/physical-plan/src/joins/nested_loop_join.rs @@ -27,11 +27,10 @@ use std::task::Poll; use crate::coalesce_batches::concat_batches; use crate::joins::utils::{ append_right_indices, apply_join_filter_to_indices, 
build_batch_from_indices, - build_join_schema, check_join_is_valid, combine_join_equivalence_properties, - estimate_join_statistics, get_anti_indices, get_anti_u64_indices, - get_final_indices_from_bit_map, get_semi_indices, get_semi_u64_indices, - partitioned_join_output_partitioning, BuildProbeJoinMetrics, ColumnIndex, JoinFilter, - OnceAsync, OnceFut, + build_join_schema, check_join_is_valid, estimate_join_statistics, get_anti_indices, + get_anti_u64_indices, get_final_indices_from_bit_map, get_semi_indices, + get_semi_u64_indices, partitioned_join_output_partitioning, BuildProbeJoinMetrics, + ColumnIndex, JoinFilter, OnceAsync, OnceFut, }; use crate::metrics::{ExecutionPlanMetricsSet, MetricsSet}; use crate::{ @@ -51,6 +50,7 @@ use datafusion_execution::TaskContext; use datafusion_expr::JoinType; use datafusion_physical_expr::{EquivalenceProperties, PhysicalSortExpr}; +use datafusion_physical_expr::equivalence::combine_join_equivalence_properties; use futures::{ready, Stream, StreamExt, TryStreamExt}; /// Data of the inner table side diff --git a/datafusion/physical-plan/src/joins/sort_merge_join.rs b/datafusion/physical-plan/src/joins/sort_merge_join.rs index 98fe751b22ef..759149a64d9f 100644 --- a/datafusion/physical-plan/src/joins/sort_merge_join.rs +++ b/datafusion/physical-plan/src/joins/sort_merge_join.rs @@ -33,7 +33,6 @@ use std::task::{Context, Poll}; use crate::expressions::{Column, PhysicalSortExpr}; use crate::joins::utils::{ build_join_schema, calculate_join_output_ordering, check_join_is_valid, - combine_join_equivalence_properties, combine_join_ordering_equivalence_properties, estimate_join_statistics, partitioned_join_output_partitioning, JoinOn, }; use crate::metrics::{ExecutionPlanMetricsSet, MetricBuilder, MetricsSet}; @@ -55,6 +54,9 @@ use datafusion_execution::memory_pool::{MemoryConsumer, MemoryReservation}; use datafusion_execution::TaskContext; use datafusion_physical_expr::{OrderingEquivalenceProperties, PhysicalSortRequirement}; +use datafusion_physical_expr::equivalence::{ + combine_join_equivalence_properties, combine_join_ordering_equivalence_properties, +}; use futures::{Stream, StreamExt}; /// join execution plan executes partitions in parallel and combines them into a set of @@ -140,7 +142,7 @@ impl SortMergeJoinExec { left_schema.fields.len(), &Self::maintains_input_order(join_type), Some(Self::probe_side(&join_type)), - )?; + ); let schema = Arc::new(build_join_schema(&left_schema, &right_schema, &join_type).0); @@ -297,14 +299,13 @@ impl ExecutionPlan for SortMergeJoinExec { fn ordering_equivalence_properties(&self) -> OrderingEquivalenceProperties { combine_join_ordering_equivalence_properties( &self.join_type, - &self.left, - &self.right, + &self.left.ordering_equivalence_properties(), + &self.right.ordering_equivalence_properties(), self.schema(), &self.maintains_input_order(), Some(Self::probe_side(&self.join_type)), self.equivalence_properties(), ) - .unwrap() } fn children(&self) -> Vec> { diff --git a/datafusion/physical-plan/src/joins/symmetric_hash_join.rs b/datafusion/physical-plan/src/joins/symmetric_hash_join.rs index 3450331133bd..00d43aead434 100644 --- a/datafusion/physical-plan/src/joins/symmetric_hash_join.rs +++ b/datafusion/physical-plan/src/joins/symmetric_hash_join.rs @@ -42,8 +42,8 @@ use crate::joins::hash_join_utils::{ }; use crate::joins::utils::{ build_batch_from_indices, build_join_schema, check_join_is_valid, - combine_join_equivalence_properties, partitioned_join_output_partitioning, - prepare_sorted_exprs, ColumnIndex, 
JoinFilter, JoinOn, + partitioned_join_output_partitioning, prepare_sorted_exprs, ColumnIndex, JoinFilter, + JoinOn, }; use crate::{ expressions::{Column, PhysicalSortExpr}, @@ -66,6 +66,7 @@ use datafusion_execution::TaskContext; use datafusion_physical_expr::intervals::ExprIntervalGraph; use ahash::RandomState; +use datafusion_physical_expr::equivalence::combine_join_equivalence_properties; use futures::stream::{select, BoxStream}; use futures::{Stream, StreamExt}; use hashbrown::HashSet; diff --git a/datafusion/physical-plan/src/joins/utils.rs b/datafusion/physical-plan/src/joins/utils.rs index afde986c0bb6..cf150ddf575f 100644 --- a/datafusion/physical-plan/src/joins/utils.rs +++ b/datafusion/physical-plan/src/joins/utils.rs @@ -25,10 +25,7 @@ use std::usize; use crate::joins::hash_join_utils::{build_filter_input_order, SortedFilterExpr}; use crate::metrics::{self, ExecutionPlanMetricsSet, MetricBuilder}; -use crate::{ - ColumnStatistics, EquivalenceProperties, ExecutionPlan, Partitioning, SchemaRef, - Statistics, -}; +use crate::{ColumnStatistics, ExecutionPlan, Partitioning, Statistics}; use arrow::array::{ downcast_array, new_null_array, Array, BooleanBufferBuilder, UInt32Array, @@ -39,18 +36,16 @@ use arrow::datatypes::{Field, Schema, SchemaBuilder}; use arrow::record_batch::{RecordBatch, RecordBatchOptions}; use datafusion_common::cast::as_boolean_array; use datafusion_common::stats::Precision; -use datafusion_common::tree_node::{Transformed, TreeNode}; use datafusion_common::{ - exec_err, plan_datafusion_err, plan_err, DataFusionError, JoinSide, JoinType, Result, + plan_datafusion_err, plan_err, DataFusionError, JoinSide, JoinType, Result, SharedResult, }; use datafusion_physical_expr::expressions::Column; use datafusion_physical_expr::intervals::{ExprIntervalGraph, Interval, IntervalBound}; use datafusion_physical_expr::utils::merge_vectors; use datafusion_physical_expr::{ - add_offset_to_lex_ordering, EquivalentClass, LexOrdering, LexOrderingRef, - OrderingEquivalenceProperties, OrderingEquivalentClass, PhysicalExpr, - PhysicalSortExpr, + add_offset_to_expr, add_offset_to_lex_ordering, LexOrdering, LexOrderingRef, + PhysicalExpr, PhysicalSortExpr, }; use futures::future::{BoxFuture, Shared}; @@ -137,16 +132,7 @@ pub fn adjust_right_output_partitioning( Partitioning::Hash(exprs, size) => { let new_exprs = exprs .into_iter() - .map(|expr| { - expr.transform_down(&|e| match e.as_any().downcast_ref::() { - Some(col) => Ok(Transformed::Yes(Arc::new(Column::new( - col.name(), - left_columns_len + col.index(), - )))), - None => Ok(Transformed::No(e)), - }) - .unwrap() - }) + .map(|expr| add_offset_to_expr(expr, left_columns_len)) .collect::>(); Partitioning::Hash(new_exprs, size) } @@ -182,7 +168,7 @@ pub fn calculate_join_output_ordering( left_columns_len: usize, maintains_input_order: &[bool], probe_side: Option, -) -> Result> { +) -> Option { // All joins have 2 children: assert_eq!(maintains_input_order.len(), 2); let left_maintains = maintains_input_order[0]; @@ -191,13 +177,13 @@ pub fn calculate_join_output_ordering( // In the case below, right ordering should be offseted with the left // side length, since we append the right table to the left table. JoinType::Inner | JoinType::Left | JoinType::Right | JoinType::Full => { - add_offset_to_lex_ordering(right_ordering, left_columns_len)? 
+ add_offset_to_lex_ordering(right_ordering, left_columns_len) } _ => right_ordering.to_vec(), }; let output_ordering = match (left_maintains, right_maintains) { (true, true) => { - return exec_err!("Cannot maintain ordering of both sides"); + unreachable!("Cannot maintain ordering of both sides"); } (true, false) => { // Special case, we can prefix ordering of right side with the ordering of left side. @@ -226,233 +212,9 @@ pub fn calculate_join_output_ordering( } } // Doesn't maintain ordering, output ordering is None. - (false, false) => return Ok(None), - }; - Ok((!output_ordering.is_empty()).then_some(output_ordering)) -} - -/// Combine equivalence properties of the given join inputs. -pub fn combine_join_equivalence_properties( - join_type: JoinType, - left_properties: EquivalenceProperties, - right_properties: EquivalenceProperties, - left_columns_len: usize, - on: &[(Column, Column)], - schema: SchemaRef, -) -> EquivalenceProperties { - let mut new_properties = EquivalenceProperties::new(schema); - match join_type { - JoinType::Inner | JoinType::Left | JoinType::Full | JoinType::Right => { - new_properties.extend(left_properties.classes().to_vec()); - let new_right_properties = right_properties - .classes() - .iter() - .map(|prop| { - let new_head = Column::new( - prop.head().name(), - left_columns_len + prop.head().index(), - ); - let new_others = prop - .others() - .iter() - .map(|col| { - Column::new(col.name(), left_columns_len + col.index()) - }) - .collect::>(); - EquivalentClass::new(new_head, new_others) - }) - .collect::>(); - - new_properties.extend(new_right_properties); - } - JoinType::LeftSemi | JoinType::LeftAnti => { - new_properties.extend(left_properties.classes().to_vec()) - } - JoinType::RightSemi | JoinType::RightAnti => { - new_properties.extend(right_properties.classes().to_vec()) - } - } - - if join_type == JoinType::Inner { - on.iter().for_each(|(column1, column2)| { - let new_column2 = - Column::new(column2.name(), left_columns_len + column2.index()); - new_properties.add_equal_conditions((column1, &new_column2)) - }) - } - new_properties -} - -/// Calculate equivalence properties for the given cross join operation. -pub fn cross_join_equivalence_properties( - left_properties: EquivalenceProperties, - right_properties: EquivalenceProperties, - left_columns_len: usize, - schema: SchemaRef, -) -> EquivalenceProperties { - let mut new_properties = EquivalenceProperties::new(schema); - new_properties.extend(left_properties.classes().to_vec()); - let new_right_properties = right_properties - .classes() - .iter() - .map(|prop| { - let new_head = - Column::new(prop.head().name(), left_columns_len + prop.head().index()); - let new_others = prop - .others() - .iter() - .map(|col| Column::new(col.name(), left_columns_len + col.index())) - .collect::>(); - EquivalentClass::new(new_head, new_others) - }) - .collect::>(); - new_properties.extend(new_right_properties); - new_properties -} - -/// Update right table ordering equivalences so that: -/// - They point to valid indices at the output of the join schema, and -/// - They are normalized with respect to equivalence columns. -/// -/// To do so, we increment column indices by the size of the left table when -/// join schema consists of a combination of left and right schema (Inner, -/// Left, Full, Right joins). Then, we normalize the sort expressions of -/// ordering equivalences one by one. 
We make sure that each expression in the -/// ordering equivalence is either: -/// - The head of the one of the equivalent classes, or -/// - Doesn't have an equivalent column. -/// -/// This way; once we normalize an expression according to equivalence properties, -/// it can thereafter safely be used for ordering equivalence normalization. -fn get_updated_right_ordering_equivalent_class( - join_type: &JoinType, - right_oeq_class: &OrderingEquivalentClass, - left_columns_len: usize, - join_eq_properties: &EquivalenceProperties, -) -> Result { - match join_type { - // In these modes, indices of the right schema should be offset by - // the left table size. - JoinType::Inner | JoinType::Left | JoinType::Full | JoinType::Right => { - let right_oeq_class = right_oeq_class.add_offset(left_columns_len)?; - return Ok( - right_oeq_class.normalize_with_equivalence_properties(join_eq_properties) - ); - } - _ => {} + (false, false) => return None, }; - Ok(right_oeq_class.normalize_with_equivalence_properties(join_eq_properties)) -} - -/// Calculate ordering equivalence properties for the given join operation. -pub fn combine_join_ordering_equivalence_properties( - join_type: &JoinType, - left: &Arc, - right: &Arc, - schema: SchemaRef, - maintains_input_order: &[bool], - probe_side: Option, - join_eq_properties: EquivalenceProperties, -) -> Result { - let mut new_properties = OrderingEquivalenceProperties::new(schema); - let left_columns_len = left.schema().fields.len(); - let left_oeq_properties = left.ordering_equivalence_properties(); - let right_oeq_properties = right.ordering_equivalence_properties(); - // All joins have 2 children - assert_eq!(maintains_input_order.len(), 2); - let left_maintains = maintains_input_order[0]; - let right_maintains = maintains_input_order[1]; - match (left_maintains, right_maintains) { - (true, true) => return plan_err!("Cannot maintain ordering of both sides"), - (true, false) => { - new_properties.extend(left_oeq_properties.oeq_class().cloned()); - // In this special case, right side ordering can be prefixed with left side ordering. - if let ( - Some(JoinSide::Left), - // right side have an ordering - Some(_), - JoinType::Inner, - Some(oeq_class), - ) = ( - probe_side, - right.output_ordering(), - join_type, - right_oeq_properties.oeq_class(), - ) { - let left_output_ordering = left.output_ordering().unwrap_or(&[]); - - let updated_right_oeq = get_updated_right_ordering_equivalent_class( - join_type, - oeq_class, - left_columns_len, - &join_eq_properties, - )?; - - // Right side ordering equivalence properties should be prepended with - // those of the left side while constructing output ordering equivalence - // properties since stream side is the left side. - // - // If the right table ordering equivalences contain `b ASC`, and the output - // ordering of the left table is `a ASC`, then the ordering equivalence `b ASC` - // for the right table should be converted to `a ASC, b ASC` before it is added - // to the ordering equivalences of the join. 
- let updated_right_oeq_class = updated_right_oeq - .prefix_ordering_equivalent_class_with_existing_ordering( - left_output_ordering, - &join_eq_properties, - ); - new_properties.extend(Some(updated_right_oeq_class)); - } - } - (false, true) => { - let updated_right_oeq = right_oeq_properties - .oeq_class() - .map(|right_oeq_class| { - get_updated_right_ordering_equivalent_class( - join_type, - right_oeq_class, - left_columns_len, - &join_eq_properties, - ) - }) - .transpose()?; - new_properties.extend(updated_right_oeq); - // In this special case, left side ordering can be prefixed with right side ordering. - if let ( - Some(JoinSide::Right), - // left side have an ordering - Some(_), - JoinType::Inner, - Some(left_oeq_class), - ) = ( - probe_side, - left.output_ordering(), - join_type, - left_oeq_properties.oeq_class(), - ) { - let right_output_ordering = right.output_ordering().unwrap_or(&[]); - let right_output_ordering = - add_offset_to_lex_ordering(right_output_ordering, left_columns_len)?; - - // Left side ordering equivalence properties should be prepended with - // those of the right side while constructing output ordering equivalence - // properties since stream side is the right side. - // - // If the right table ordering equivalences contain `b ASC`, and the output - // ordering of the left table is `a ASC`, then the ordering equivalence `b ASC` - // for the right table should be converted to `a ASC, b ASC` before it is added - // to the ordering equivalences of the join. - let updated_left_oeq_class = left_oeq_class - .prefix_ordering_equivalent_class_with_existing_ordering( - &right_output_ordering, - &join_eq_properties, - ); - new_properties.extend(Some(updated_left_oeq_class)); - } - } - (false, false) => {} - } - Ok(new_properties) + (!output_ordering.is_empty()).then_some(output_ordering) } /// Information about the index and placement (left or right) of the columns @@ -1930,84 +1692,6 @@ mod tests { Ok(()) } - #[test] - fn test_get_updated_right_ordering_equivalence_properties() -> Result<()> { - let join_type = JoinType::Inner; - - let options = SortOptions::default(); - let right_oeq_class = OrderingEquivalentClass::new( - vec![ - PhysicalSortExpr { - expr: Arc::new(Column::new("x", 0)), - options, - }, - PhysicalSortExpr { - expr: Arc::new(Column::new("y", 1)), - options, - }, - ], - vec![vec![ - PhysicalSortExpr { - expr: Arc::new(Column::new("z", 2)), - options, - }, - PhysicalSortExpr { - expr: Arc::new(Column::new("w", 3)), - options, - }, - ]], - ); - - let left_columns_len = 4; - - let fields: Fields = ["a", "b", "c", "d", "x", "y", "z", "w"] - .into_iter() - .map(|name| Field::new(name, DataType::Int32, true)) - .collect(); - - let mut join_eq_properties = - EquivalenceProperties::new(Arc::new(Schema::new(fields))); - join_eq_properties - .add_equal_conditions((&Column::new("a", 0), &Column::new("x", 4))); - join_eq_properties - .add_equal_conditions((&Column::new("d", 3), &Column::new("w", 7))); - - let result = get_updated_right_ordering_equivalent_class( - &join_type, - &right_oeq_class, - left_columns_len, - &join_eq_properties, - )?; - - let expected = OrderingEquivalentClass::new( - vec![ - PhysicalSortExpr { - expr: Arc::new(Column::new("a", 0)), - options, - }, - PhysicalSortExpr { - expr: Arc::new(Column::new("y", 5)), - options, - }, - ], - vec![vec![ - PhysicalSortExpr { - expr: Arc::new(Column::new("z", 6)), - options, - }, - PhysicalSortExpr { - expr: Arc::new(Column::new("d", 3)), - options, - }, - ]], - ); - - assert_eq!(result.head(), 
expected.head()); - assert_eq!(result.others(), expected.others()); - - Ok(()) - } - #[test] fn test_calculate_join_output_ordering() -> Result<()> { let options = SortOptions::default(); @@ -2100,7 +1784,7 @@ mod tests { left_columns_len, maintains_input_order, probe_side - )?, + ), expected[i] ); } From af2cda928140f85d369e26931756a0cbc127e9ec Mon Sep 17 00:00:00 2001 From: Huaijin Date: Wed, 25 Oct 2023 15:56:13 +0800 Subject: [PATCH 08/32] support scan empty projection (#7920) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix * modify push_down_filter for empty projection * clean code * fix ci * fix * Update datafusion/core/tests/custom_sources_cases/provider_filter_pushdown.rs Co-authored-by: Daniël Heres * remove LogicalPlan::Values in push_down_projection --------- Co-authored-by: Daniël Heres --- .../physical_plan/file_scan_config.rs | 10 +- .../provider_filter_pushdown.rs | 14 +- datafusion/core/tests/sql/explain_analyze.rs | 2 +- .../optimizer/src/push_down_projection.rs | 168 +----------------- .../physical-plan/src/joins/cross_join.rs | 4 +- datafusion/sqllogictest/test_files/avro.slt | 4 +- datafusion/sqllogictest/test_files/json.slt | 4 +- .../sqllogictest/test_files/subquery.slt | 6 +- 8 files changed, 31 insertions(+), 181 deletions(-) diff --git a/datafusion/core/src/datasource/physical_plan/file_scan_config.rs b/datafusion/core/src/datasource/physical_plan/file_scan_config.rs index d8a9697b2bf7..3efb0df9df7c 100644 --- a/datafusion/core/src/datasource/physical_plan/file_scan_config.rs +++ b/datafusion/core/src/datasource/physical_plan/file_scan_config.rs @@ -36,7 +36,7 @@ use crate::{ use arrow::array::{ArrayData, BufferBuilder}; use arrow::buffer::Buffer; use arrow::datatypes::{ArrowNativeType, UInt16Type}; -use arrow_array::{ArrayRef, DictionaryArray, RecordBatch}; +use arrow_array::{ArrayRef, DictionaryArray, RecordBatch, RecordBatchOptions}; use arrow_schema::{DataType, Field, Schema, SchemaRef}; use datafusion_common::stats::Precision; use datafusion_common::{exec_err, ColumnStatistics, Statistics}; @@ -339,7 +339,13 @@ impl PartitionColumnProjector { ), ) } - RecordBatch::try_new(Arc::clone(&self.projected_schema), cols).map_err(Into::into) + + RecordBatch::try_new_with_options( + Arc::clone(&self.projected_schema), + cols, + &RecordBatchOptions::new().with_row_count(Some(file_batch.num_rows())), + ) + .map_err(Into::into) } } diff --git a/datafusion/core/tests/custom_sources_cases/provider_filter_pushdown.rs b/datafusion/core/tests/custom_sources_cases/provider_filter_pushdown.rs index 4679ca6d07df..e374abd6e891 100644 --- a/datafusion/core/tests/custom_sources_cases/provider_filter_pushdown.rs +++ b/datafusion/core/tests/custom_sources_cases/provider_filter_pushdown.rs @@ -149,10 +149,12 @@ impl TableProvider for CustomProvider { async fn scan( &self, _state: &SessionState, - _: Option<&Vec>, + projection: Option<&Vec>, filters: &[Expr], _: Option, ) -> Result> { + let empty = Vec::new(); + let projection = projection.unwrap_or(&empty); match &filters[0] { Expr::BinaryExpr(BinaryExpr { right, .. 
}) => { let int_value = match &**right { @@ -182,7 +184,10 @@ impl TableProvider for CustomProvider { }; Ok(Arc::new(CustomPlan { - schema: self.zero_batch.schema(), + schema: match projection.is_empty() { + true => Arc::new(Schema::empty()), + false => self.zero_batch.schema(), + }, batches: match int_value { 0 => vec![self.zero_batch.clone()], 1 => vec![self.one_batch.clone()], @@ -191,7 +196,10 @@ impl TableProvider for CustomProvider { })) } _ => Ok(Arc::new(CustomPlan { - schema: self.zero_batch.schema(), + schema: match projection.is_empty() { + true => Arc::new(Schema::empty()), + false => self.zero_batch.schema(), + }, batches: vec![], })), } diff --git a/datafusion/core/tests/sql/explain_analyze.rs b/datafusion/core/tests/sql/explain_analyze.rs index 7238369f832a..2436e82f3ce9 100644 --- a/datafusion/core/tests/sql/explain_analyze.rs +++ b/datafusion/core/tests/sql/explain_analyze.rs @@ -788,7 +788,7 @@ async fn explain_logical_plan_only() { "logical_plan", "Aggregate: groupBy=[[]], aggr=[[COUNT(UInt8(1)) AS COUNT(*)]]\ \n SubqueryAlias: t\ - \n Projection: column2\ + \n Projection: \ \n Values: (Utf8(\"a\"), Int64(1), Int64(100)), (Utf8(\"a\"), Int64(2), Int64(150))" ]]; assert_eq!(expected, actual); diff --git a/datafusion/optimizer/src/push_down_projection.rs b/datafusion/optimizer/src/push_down_projection.rs index 839f6b5bb8f6..e7fdaa8b0b5e 100644 --- a/datafusion/optimizer/src/push_down_projection.rs +++ b/datafusion/optimizer/src/push_down_projection.rs @@ -23,7 +23,6 @@ use crate::merge_projection::merge_projection; use crate::optimizer::ApplyOrder; use crate::push_down_filter::replace_cols_by_name; use crate::{OptimizerConfig, OptimizerRule}; -use arrow::datatypes::DataType; use arrow::error::Result as ArrowResult; use datafusion_common::ScalarValue::UInt8; use datafusion_common::{ @@ -149,10 +148,6 @@ impl OptimizerRule for PushDownProjection { { let mut used_columns: HashSet = HashSet::new(); if projection_is_empty { - let field = find_small_field(scan.projected_schema.fields()).ok_or( - DataFusionError::Internal("Scan with empty schema".to_string()), - )?; - used_columns.insert(field.qualified_column()); push_down_scan(&used_columns, scan, true)? } else { for expr in projection.expr.iter() { @@ -163,17 +158,6 @@ impl OptimizerRule for PushDownProjection { plan.with_new_inputs(&[new_scan])? } } - LogicalPlan::Values(values) if projection_is_empty => { - let field = find_small_field(values.schema.fields()).ok_or( - DataFusionError::Internal("Values with empty schema".to_string()), - )?; - let column = Expr::Column(field.qualified_column()); - - LogicalPlan::Projection(Projection::try_new( - vec![column], - Arc::new(child_plan.clone()), - )?) - } LogicalPlan::Union(union) => { let mut required_columns = HashSet::new(); exprlist_to_columns(&projection.expr, &mut required_columns)?; @@ -429,87 +413,6 @@ pub fn collect_projection_expr(projection: &Projection) -> HashMap .collect::>() } -/// Accumulate the memory size of a data type measured in bits. -/// -/// Types with a variable size get assigned with a fixed size which is greater than most -/// primitive types. -/// -/// While traversing nested types, `nesting` is incremented on every level. 
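// Illustrative sketch, not part of the patch hunks: the `nested_size`/`find_small_field`
// workaround removed below existed to pick a "cheap" column whenever a projection was
// empty. With the scan changes above, a batch may carry zero columns as long as its row
// count is preserved explicitly. A minimal sketch of that idea, assuming only the
// arrow-array / arrow-schema APIs already used in file_scan_config.rs above;
// `zero_column_batch` is a name invented for this example.
use std::sync::Arc;

use arrow_array::{RecordBatch, RecordBatchOptions};
use arrow_schema::{ArrowError, Schema};

fn zero_column_batch(num_rows: usize) -> Result<RecordBatch, ArrowError> {
    // No columns are materialized, but the explicit row count keeps operators
    // such as COUNT(*) working against the projected (empty) schema.
    let options = RecordBatchOptions::new().with_row_count(Some(num_rows));
    RecordBatch::try_new_with_options(Arc::new(Schema::empty()), vec![], &options)
}

fn main() -> Result<(), ArrowError> {
    // A 5-row batch with no columns still reports 5 rows.
    assert_eq!(zero_column_batch(5)?.num_rows(), 5);
    Ok(())
}
// (end of sketch)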
-fn nested_size(data_type: &DataType, nesting: &mut usize) -> usize { - use DataType::*; - if data_type.is_primitive() { - return data_type.primitive_width().unwrap_or(1) * 8; - } - - if data_type.is_nested() { - *nesting += 1; - } - - match data_type { - Null => 0, - Boolean => 1, - Binary | Utf8 => 128, - LargeBinary | LargeUtf8 => 256, - FixedSizeBinary(bytes) => (*bytes * 8) as usize, - // primitive types - Int8 - | Int16 - | Int32 - | Int64 - | UInt8 - | UInt16 - | UInt32 - | UInt64 - | Float16 - | Float32 - | Float64 - | Timestamp(_, _) - | Date32 - | Date64 - | Time32(_) - | Time64(_) - | Duration(_) - | Interval(_) - | Dictionary(_, _) - | Decimal128(_, _) - | Decimal256(_, _) => data_type.primitive_width().unwrap_or(1) * 8, - // nested types - List(f) => nested_size(f.data_type(), nesting), - FixedSizeList(_, s) => (s * 8) as usize, - LargeList(f) => nested_size(f.data_type(), nesting), - Struct(fields) => fields - .iter() - .map(|f| nested_size(f.data_type(), nesting)) - .sum(), - Union(fields, _) => fields - .iter() - .map(|(_, f)| nested_size(f.data_type(), nesting)) - .sum(), - Map(field, _) => nested_size(field.data_type(), nesting), - RunEndEncoded(run_ends, values) => { - nested_size(run_ends.data_type(), nesting) - + nested_size(values.data_type(), nesting) - } - } -} - -/// Find a field with a presumable small memory footprint based on its data type's memory size -/// and the level of nesting. -fn find_small_field(fields: &[DFField]) -> Option { - fields - .iter() - .map(|f| { - let nesting = &mut 0; - let size = nested_size(f.data_type(), nesting); - (*nesting, size) - }) - .enumerate() - .min_by(|(_, (nesting_a, size_a)), (_, (nesting_b, size_b))| { - nesting_a.cmp(nesting_b).then(size_a.cmp(size_b)) - }) - .map(|(i, _)| fields[i].clone()) -} - /// Get the projection exprs from columns in the order of the schema fn get_expr(columns: &HashSet, schema: &DFSchemaRef) -> Result> { let expr = schema @@ -640,7 +543,7 @@ mod tests { use crate::optimizer::Optimizer; use crate::test::*; use crate::OptimizerContext; - use arrow::datatypes::{DataType, Field, Schema, TimeUnit}; + use arrow::datatypes::{DataType, Field, Schema}; use datafusion_common::DFSchema; use datafusion_expr::builder::table_scan_with_filters; use datafusion_expr::expr; @@ -1232,73 +1135,4 @@ mod tests { .unwrap_or(optimized_plan); Ok(optimized_plan) } - - #[test] - fn test_nested_size() { - use DataType::*; - let nesting = &mut 0; - assert_eq!(nested_size(&Null, nesting), 0); - assert_eq!(*nesting, 0); - assert_eq!(nested_size(&Boolean, nesting), 1); - assert_eq!(*nesting, 0); - assert_eq!(nested_size(&UInt8, nesting), 8); - assert_eq!(*nesting, 0); - assert_eq!(nested_size(&Int64, nesting), 64); - assert_eq!(*nesting, 0); - assert_eq!(nested_size(&Decimal256(5, 2), nesting), 256); - assert_eq!(*nesting, 0); - assert_eq!( - nested_size(&List(Arc::new(Field::new("A", Int64, true))), nesting), - 64 - ); - assert_eq!(*nesting, 1); - *nesting = 0; - assert_eq!( - nested_size( - &List(Arc::new(Field::new( - "A", - List(Arc::new(Field::new("AA", Int64, true))), - true - ))), - nesting - ), - 64 - ); - assert_eq!(*nesting, 2); - } - - #[test] - fn test_find_small_field() { - use DataType::*; - let int32 = DFField::from(Field::new("a", Int32, false)); - let bin = DFField::from(Field::new("b", Binary, false)); - let list_i64 = DFField::from(Field::new( - "c", - List(Arc::new(Field::new("c_1", Int64, true))), - false, - )); - let time_s = DFField::from(Field::new("d", Time32(TimeUnit::Second), false)); - - assert_eq!( - 
find_small_field(&[ - int32.clone(), - bin.clone(), - list_i64.clone(), - time_s.clone() - ]), - Some(int32.clone()) - ); - assert_eq!( - find_small_field(&[bin.clone(), list_i64.clone(), time_s.clone()]), - Some(time_s.clone()) - ); - assert_eq!( - find_small_field(&[time_s.clone(), int32.clone()]), - Some(time_s.clone()) - ); - assert_eq!( - find_small_field(&[bin.clone(), list_i64.clone()]), - Some(bin.clone()) - ); - } } diff --git a/datafusion/physical-plan/src/joins/cross_join.rs b/datafusion/physical-plan/src/joins/cross_join.rs index c9c92e8c11ae..d8c8064e2ac1 100644 --- a/datafusion/physical-plan/src/joins/cross_join.rs +++ b/datafusion/physical-plan/src/joins/cross_join.rs @@ -34,6 +34,7 @@ use crate::{ use arrow::datatypes::{Fields, Schema, SchemaRef}; use arrow::record_batch::RecordBatch; +use arrow_array::RecordBatchOptions; use datafusion_common::stats::Precision; use datafusion_common::{plan_err, DataFusionError, Result, ScalarValue}; use datafusion_execution::memory_pool::{MemoryConsumer, MemoryReservation}; @@ -347,13 +348,14 @@ fn build_batch( }) .collect::>>()?; - RecordBatch::try_new( + RecordBatch::try_new_with_options( Arc::new(schema.clone()), arrays .iter() .chain(batch.columns().iter()) .cloned() .collect(), + &RecordBatchOptions::new().with_row_count(Some(batch.num_rows())), ) .map_err(Into::into) } diff --git a/datafusion/sqllogictest/test_files/avro.slt b/datafusion/sqllogictest/test_files/avro.slt index bd2ba706663c..3f21274c009f 100644 --- a/datafusion/sqllogictest/test_files/avro.slt +++ b/datafusion/sqllogictest/test_files/avro.slt @@ -253,10 +253,10 @@ EXPLAIN SELECT count(*) from alltypes_plain ---- logical_plan Aggregate: groupBy=[[]], aggr=[[COUNT(UInt8(1)) AS COUNT(*)]] ---TableScan: alltypes_plain projection=[bool_col] +--TableScan: alltypes_plain projection=[] physical_plan AggregateExec: mode=Final, gby=[], aggr=[COUNT(*)] --CoalescePartitionsExec ----AggregateExec: mode=Partial, gby=[], aggr=[COUNT(*)] ------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 ---------AvroExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/avro/alltypes_plain.avro]]}, projection=[bool_col] +--------AvroExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/avro/alltypes_plain.avro]]} diff --git a/datafusion/sqllogictest/test_files/json.slt b/datafusion/sqllogictest/test_files/json.slt index a06565f42cdb..c0d5e895f0f2 100644 --- a/datafusion/sqllogictest/test_files/json.slt +++ b/datafusion/sqllogictest/test_files/json.slt @@ -50,13 +50,13 @@ EXPLAIN SELECT count(*) from json_test ---- logical_plan Aggregate: groupBy=[[]], aggr=[[COUNT(UInt8(1)) AS COUNT(*)]] ---TableScan: json_test projection=[c] +--TableScan: json_test projection=[] physical_plan AggregateExec: mode=Final, gby=[], aggr=[COUNT(*)] --CoalescePartitionsExec ----AggregateExec: mode=Partial, gby=[], aggr=[COUNT(*)] ------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 ---------JsonExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/2.json]]}, projection=[c] +--------JsonExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/2.json]]} query ? 
SELECT mycol FROM single_nan diff --git a/datafusion/sqllogictest/test_files/subquery.slt b/datafusion/sqllogictest/test_files/subquery.slt index 7cbb848f3333..822a70bb5bad 100644 --- a/datafusion/sqllogictest/test_files/subquery.slt +++ b/datafusion/sqllogictest/test_files/subquery.slt @@ -695,7 +695,7 @@ logical_plan Projection: __scalar_sq_1.COUNT(*) AS b --SubqueryAlias: __scalar_sq_1 ----Aggregate: groupBy=[[]], aggr=[[COUNT(UInt8(1)) AS COUNT(*)]] -------TableScan: t1 projection=[t1_id] +------TableScan: t1 projection=[] #simple_uncorrelated_scalar_subquery2 query TT @@ -706,10 +706,10 @@ Projection: __scalar_sq_1.COUNT(*) AS b, __scalar_sq_2.COUNT(Int64(1)) AS COUNT( --Left Join: ----SubqueryAlias: __scalar_sq_1 ------Aggregate: groupBy=[[]], aggr=[[COUNT(UInt8(1)) AS COUNT(*)]] ---------TableScan: t1 projection=[t1_id] +--------TableScan: t1 projection=[] ----SubqueryAlias: __scalar_sq_2 ------Aggregate: groupBy=[[]], aggr=[[COUNT(Int64(1))]] ---------TableScan: t2 projection=[t2_id] +--------TableScan: t2 projection=[] query II select (select count(*) from t1) as b, (select count(1) from t2) From b16cd934698e8c1731f16873fcab17ffae04cc5b Mon Sep 17 00:00:00 2001 From: Mustafa Akur <106137913+mustafasrepo@users.noreply.github.com> Date: Wed, 25 Oct 2023 16:43:39 +0300 Subject: [PATCH 09/32] Cleanup logical optimizer rules. (#7919) * Initial commit * Address todos * Update comments * Simplifications * Minor simplifications * Address reviews * Add TableScan constructor * Minor changes * make try_new_with_schema method of Aggregate private * Use projection try_new instead of try_new_schema * Simplifications, add comment * Review changes * Improve comments * Move get_wider_type to type_coercion module * Clean up type coercion file --------- Co-authored-by: berkaysynnada Co-authored-by: Mehmet Ozan Kabak --- datafusion/common/src/dfschema.rs | 8 + .../common/src/functional_dependencies.rs | 17 ++ datafusion/core/tests/sql/group_by.rs | 14 +- datafusion/expr/src/built_in_function.rs | 45 +++-- datafusion/expr/src/logical_plan/builder.rs | 183 +++++------------- datafusion/expr/src/logical_plan/plan.rs | 134 ++++++++----- datafusion/expr/src/tree_node/expr.rs | 22 ++- datafusion/expr/src/type_coercion/binary.rs | 147 ++++++++------ .../optimizer/src/common_subexpr_eliminate.rs | 36 ++-- datafusion/optimizer/src/merge_projection.rs | 11 +- .../optimizer/src/push_down_projection.rs | 58 +++--- .../src/replace_distinct_aggregate.rs | 10 +- .../src/single_distinct_to_groupby.rs | 104 ++++++---- .../physical-expr/src/array_expressions.rs | 39 ++-- datafusion/sqllogictest/test_files/array.slt | 6 +- .../sqllogictest/test_files/tpch/q16.slt.part | 20 +- 16 files changed, 451 insertions(+), 403 deletions(-) diff --git a/datafusion/common/src/dfschema.rs b/datafusion/common/src/dfschema.rs index b1aee41978c2..e16acbfedc81 100644 --- a/datafusion/common/src/dfschema.rs +++ b/datafusion/common/src/dfschema.rs @@ -444,6 +444,14 @@ impl DFSchema { .zip(iter2) .all(|((t1, f1), (t2, f2))| t1 == t2 && Self::field_is_semantically_equal(f1, f2)) } + ( + DataType::Decimal128(_l_precision, _l_scale), + DataType::Decimal128(_r_precision, _r_scale), + ) => true, + ( + DataType::Decimal256(_l_precision, _l_scale), + DataType::Decimal256(_r_precision, _r_scale), + ) => true, _ => dt1 == dt2, } } diff --git a/datafusion/common/src/functional_dependencies.rs b/datafusion/common/src/functional_dependencies.rs index 869709bc8dfc..fbddcddab4bc 100644 --- a/datafusion/common/src/functional_dependencies.rs +++ 
b/datafusion/common/src/functional_dependencies.rs @@ -558,4 +558,21 @@ mod tests { assert_eq!(iter.next(), Some(&Constraint::Unique(vec![20]))); assert_eq!(iter.next(), None); } + + #[test] + fn test_get_updated_id_keys() { + let fund_dependencies = + FunctionalDependencies::new(vec![FunctionalDependence::new( + vec![1], + vec![0, 1, 2], + true, + )]); + let res = fund_dependencies.project_functional_dependencies(&[1, 2], 2); + let expected = FunctionalDependencies::new(vec![FunctionalDependence::new( + vec![0], + vec![0, 1], + true, + )]); + assert_eq!(res, expected); + } } diff --git a/datafusion/core/tests/sql/group_by.rs b/datafusion/core/tests/sql/group_by.rs index 7c7703b69683..58f0ac21d951 100644 --- a/datafusion/core/tests/sql/group_by.rs +++ b/datafusion/core/tests/sql/group_by.rs @@ -231,13 +231,13 @@ async fn group_by_dictionary() { .expect("ran plan correctly"); let expected = [ - "+-------+------------------------+", - "| t.val | COUNT(DISTINCT t.dict) |", - "+-------+------------------------+", - "| 1 | 2 |", - "| 2 | 2 |", - "| 4 | 1 |", - "+-------+------------------------+", + "+-----+------------------------+", + "| val | COUNT(DISTINCT t.dict) |", + "+-----+------------------------+", + "| 1 | 2 |", + "| 2 | 2 |", + "| 4 | 1 |", + "+-----+------------------------+", ]; assert_batches_sorted_eq!(expected, &results); } diff --git a/datafusion/expr/src/built_in_function.rs b/datafusion/expr/src/built_in_function.rs index 350067a42186..16554133d828 100644 --- a/datafusion/expr/src/built_in_function.rs +++ b/datafusion/expr/src/built_in_function.rs @@ -17,21 +17,26 @@ //! Built-in functions module contains all the built-in functions definitions. +use std::cmp::Ordering; +use std::collections::HashMap; +use std::fmt; +use std::str::FromStr; +use std::sync::{Arc, OnceLock}; + use crate::nullif::SUPPORTED_NULLIF_TYPES; use crate::signature::TIMEZONE_WILDCARD; +use crate::type_coercion::binary::get_wider_type; use crate::type_coercion::functions::data_types; use crate::{ conditional_expressions, struct_expressions, utils, FuncMonotonicity, Signature, TypeSignature, Volatility, }; + use arrow::datatypes::{DataType, Field, Fields, IntervalUnit, TimeUnit}; use datafusion_common::{ internal_err, plan_datafusion_err, plan_err, DataFusionError, Result, }; -use std::collections::HashMap; -use std::fmt; -use std::str::FromStr; -use std::sync::{Arc, OnceLock}; + use strum::IntoEnumIterator; use strum_macros::EnumIter; @@ -468,18 +473,14 @@ impl BuiltinScalarFunction { /// * `List(Int64)` has dimension 2 /// * `List(List(Int64))` has dimension 3 /// * etc. 
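// Illustrative sketch, not part of the patch hunks: the refactor below replaces a
// `loop`/`match` with an iterative `while let` walk over nested `List` element types.
// A minimal standalone version, assuming only the arrow-schema crate;
// `list_dimension` is a name invented for this example.
use std::sync::Arc;

use arrow_schema::{DataType, Field};

// A scalar counts as dimension 1, so `List(Int64)` is 2 and `List(List(Int64))` is 3.
fn list_dimension(data_type: &DataType) -> u64 {
    let mut dims = 1;
    let mut current = data_type;
    while let DataType::List(field) = current {
        current = field.data_type();
        dims += 1;
    }
    dims
}

fn main() {
    let list_of_int =
        DataType::List(Arc::new(Field::new("item", DataType::Int64, true)));
    let nested =
        DataType::List(Arc::new(Field::new("item", list_of_int.clone(), true)));
    assert_eq!(list_dimension(&DataType::Int64), 1);
    assert_eq!(list_dimension(&list_of_int), 2);
    assert_eq!(list_dimension(&nested), 3);
}
// (end of sketch)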
- fn return_dimension(self, input_expr_type: DataType) -> u64 { - let mut res: u64 = 1; + fn return_dimension(self, input_expr_type: &DataType) -> u64 { + let mut result: u64 = 1; let mut current_data_type = input_expr_type; - loop { - match current_data_type { - DataType::List(field) => { - current_data_type = field.data_type().clone(); - res += 1; - } - _ => return res, - } + while let DataType::List(field) = current_data_type { + current_data_type = field.data_type(); + result += 1; } + result } /// Returns the output [`DataType`] of this function @@ -538,11 +539,17 @@ impl BuiltinScalarFunction { match input_expr_type { List(field) => { if !field.data_type().equals_datatype(&Null) { - let dims = self.return_dimension(input_expr_type.clone()); - if max_dims < dims { - max_dims = dims; - expr_type = input_expr_type.clone(); - } + let dims = self.return_dimension(input_expr_type); + expr_type = match max_dims.cmp(&dims) { + Ordering::Greater => expr_type, + Ordering::Equal => { + get_wider_type(&expr_type, input_expr_type)? + } + Ordering::Less => { + max_dims = dims; + input_expr_type.clone() + } + }; } } _ => { diff --git a/datafusion/expr/src/logical_plan/builder.rs b/datafusion/expr/src/logical_plan/builder.rs index cd50dbe79cfd..9ce1d203d1c9 100644 --- a/datafusion/expr/src/logical_plan/builder.rs +++ b/datafusion/expr/src/logical_plan/builder.rs @@ -17,6 +17,13 @@ //! This module provides a builder for creating LogicalPlans +use std::any::Any; +use std::cmp::Ordering; +use std::collections::{HashMap, HashSet}; +use std::convert::TryFrom; +use std::iter::zip; +use std::sync::Arc; + use crate::dml::{CopyOptions, CopyTo}; use crate::expr::Alias; use crate::expr_rewriter::{ @@ -24,38 +31,29 @@ use crate::expr_rewriter::{ normalize_col_with_schemas_and_ambiguity_check, normalize_cols, rewrite_sort_cols_by_aggs, }; +use crate::logical_plan::{ + Aggregate, Analyze, CrossJoin, Distinct, EmptyRelation, Explain, Filter, Join, + JoinConstraint, JoinType, Limit, LogicalPlan, Partitioning, PlanType, Prepare, + Projection, Repartition, Sort, SubqueryAlias, TableScan, Union, Unnest, Values, + Window, +}; use crate::type_coercion::binary::comparison_coercion; -use crate::utils::{columnize_expr, compare_sort_expr}; -use crate::{ - and, binary_expr, DmlStatement, Operator, TableProviderFilterPushDown, WriteOp, +use crate::utils::{ + can_hash, columnize_expr, compare_sort_expr, expand_qualified_wildcard, + expand_wildcard, find_valid_equijoin_key_pair, group_window_expr_by_sort_keys, }; use crate::{ - logical_plan::{ - Aggregate, Analyze, CrossJoin, Distinct, EmptyRelation, Explain, Filter, Join, - JoinConstraint, JoinType, Limit, LogicalPlan, Partitioning, PlanType, Prepare, - Projection, Repartition, Sort, SubqueryAlias, TableScan, Union, Unnest, Values, - Window, - }, - utils::{ - can_hash, expand_qualified_wildcard, expand_wildcard, - find_valid_equijoin_key_pair, group_window_expr_by_sort_keys, - }, - Expr, ExprSchemable, TableSource, + and, binary_expr, DmlStatement, Expr, ExprSchemable, Operator, + TableProviderFilterPushDown, TableSource, WriteOp, }; + use arrow::datatypes::{DataType, Schema, SchemaRef}; -use datafusion_common::UnnestOptions; +use datafusion_common::display::ToStringifiedPlan; use datafusion_common::{ - display::ToStringifiedPlan, Column, DFField, DFSchema, DFSchemaRef, DataFusionError, - FileType, FunctionalDependencies, OwnedTableReference, Result, ScalarValue, - TableReference, ToDFSchema, + plan_datafusion_err, plan_err, Column, DFField, DFSchema, DFSchemaRef, + 
DataFusionError, FileType, OwnedTableReference, Result, ScalarValue, TableReference, + ToDFSchema, UnnestOptions, }; -use datafusion_common::{plan_datafusion_err, plan_err}; -use std::any::Any; -use std::cmp::Ordering; -use std::collections::{HashMap, HashSet}; -use std::convert::TryFrom; -use std::iter::zip; -use std::sync::Arc; /// Default table name for unnamed table pub const UNNAMED_TABLE: &str = "?table?"; @@ -283,53 +281,9 @@ impl LogicalPlanBuilder { projection: Option>, filters: Vec, ) -> Result { - let table_name = table_name.into(); - - if table_name.table().is_empty() { - return plan_err!("table_name cannot be empty"); - } - - let schema = table_source.schema(); - let func_dependencies = FunctionalDependencies::new_from_constraints( - table_source.constraints(), - schema.fields.len(), - ); - - let projected_schema = projection - .as_ref() - .map(|p| { - let projected_func_dependencies = - func_dependencies.project_functional_dependencies(p, p.len()); - DFSchema::new_with_metadata( - p.iter() - .map(|i| { - DFField::from_qualified( - table_name.clone(), - schema.field(*i).clone(), - ) - }) - .collect(), - schema.metadata().clone(), - ) - .map(|df_schema| { - df_schema.with_functional_dependencies(projected_func_dependencies) - }) - }) - .unwrap_or_else(|| { - DFSchema::try_from_qualified_schema(table_name.clone(), &schema).map( - |df_schema| df_schema.with_functional_dependencies(func_dependencies), - ) - })?; - - let table_scan = LogicalPlan::TableScan(TableScan { - table_name, - source: table_source, - projected_schema: Arc::new(projected_schema), - projection, - filters, - fetch: None, - }); - Ok(Self::from(table_scan)) + TableScan::try_new(table_name, table_source, projection, filters, None) + .map(LogicalPlan::TableScan) + .map(Self::from) } /// Wrap a plan in a window @@ -374,7 +328,7 @@ impl LogicalPlanBuilder { self, expr: impl IntoIterator>, ) -> Result { - Ok(Self::from(project(self.plan, expr)?)) + project(self.plan, expr).map(Self::from) } /// Select the given column indices @@ -390,10 +344,9 @@ impl LogicalPlanBuilder { /// Apply a filter pub fn filter(self, expr: impl Into) -> Result { let expr = normalize_col(expr.into(), &self.plan)?; - Ok(Self::from(LogicalPlan::Filter(Filter::try_new( - expr, - Arc::new(self.plan), - )?))) + Filter::try_new(expr, Arc::new(self.plan)) + .map(LogicalPlan::Filter) + .map(Self::from) } /// Make a builder for a prepare logical plan from the builder's plan @@ -421,7 +374,7 @@ impl LogicalPlanBuilder { /// Apply an alias pub fn alias(self, alias: impl Into) -> Result { - Ok(Self::from(subquery_alias(self.plan, alias)?)) + subquery_alias(self.plan, alias).map(Self::from) } /// Add missing sort columns to all downstream projection @@ -476,7 +429,7 @@ impl LogicalPlanBuilder { Self::ambiguous_distinct_check(&missing_exprs, missing_cols, &expr)?; } expr.extend(missing_exprs); - Ok(project((*input).clone(), expr)?) 
+ project((*input).clone(), expr) } _ => { let is_distinct = @@ -583,15 +536,14 @@ impl LogicalPlanBuilder { fetch: None, }); - Ok(Self::from(LogicalPlan::Projection(Projection::try_new( - new_expr, - Arc::new(sort_plan), - )?))) + Projection::try_new(new_expr, Arc::new(sort_plan)) + .map(LogicalPlan::Projection) + .map(Self::from) } /// Apply a union, preserving duplicate rows pub fn union(self, plan: LogicalPlan) -> Result { - Ok(Self::from(union(self.plan, plan)?)) + union(self.plan, plan).map(Self::from) } /// Apply a union, removing duplicate rows @@ -941,11 +893,9 @@ impl LogicalPlanBuilder { ) -> Result { let group_expr = normalize_cols(group_expr, &self.plan)?; let aggr_expr = normalize_cols(aggr_expr, &self.plan)?; - Ok(Self::from(LogicalPlan::Aggregate(Aggregate::try_new( - Arc::new(self.plan), - group_expr, - aggr_expr, - )?))) + Aggregate::try_new(Arc::new(self.plan), group_expr, aggr_expr) + .map(LogicalPlan::Aggregate) + .map(Self::from) } /// Create an expression to represent the explanation of the plan @@ -1203,8 +1153,8 @@ pub fn build_join_schema( ); let mut metadata = left.metadata().clone(); metadata.extend(right.metadata().clone()); - Ok(DFSchema::new_with_metadata(fields, metadata)? - .with_functional_dependencies(func_dependencies)) + DFSchema::new_with_metadata(fields, metadata) + .map(|schema| schema.with_functional_dependencies(func_dependencies)) } /// Errors if one or more expressions have equal names. @@ -1251,9 +1201,8 @@ pub fn project_with_column_index( }) .collect::>(); - Ok(LogicalPlan::Projection(Projection::try_new_with_schema( - alias_expr, input, schema, - )?)) + Projection::try_new_with_schema(alias_expr, input, schema) + .map(LogicalPlan::Projection) } /// Union two logical plans. @@ -1349,10 +1298,7 @@ pub fn project( } validate_unique_names("Projections", projected_expr.iter())?; - Ok(LogicalPlan::Projection(Projection::try_new( - projected_expr, - Arc::new(plan.clone()), - )?)) + Projection::try_new(projected_expr, Arc::new(plan)).map(LogicalPlan::Projection) } /// Create a SubqueryAlias to wrap a LogicalPlan. @@ -1360,9 +1306,7 @@ pub fn subquery_alias( plan: LogicalPlan, alias: impl Into, ) -> Result { - Ok(LogicalPlan::SubqueryAlias(SubqueryAlias::try_new( - plan, alias, - )?)) + SubqueryAlias::try_new(plan, alias).map(LogicalPlan::SubqueryAlias) } /// Create a LogicalPlanBuilder representing a scan of a table with the provided name and schema. @@ -1525,11 +1469,11 @@ pub fn unnest_with_options( }) .collect::>(); - let schema = Arc::new( - DFSchema::new_with_metadata(fields, input_schema.metadata().clone())? 
- // We can use the existing functional dependencies: - .with_functional_dependencies(input_schema.functional_dependencies().clone()), - ); + let metadata = input_schema.metadata().clone(); + let df_schema = DFSchema::new_with_metadata(fields, metadata)?; + // We can use the existing functional dependencies: + let deps = input_schema.functional_dependencies().clone(); + let schema = Arc::new(df_schema.with_functional_dependencies(deps)); Ok(LogicalPlan::Unnest(Unnest { input: Arc::new(input), @@ -1541,16 +1485,12 @@ pub fn unnest_with_options( #[cfg(test)] mod tests { - use crate::logical_plan::StringifiedPlan; - use crate::{col, in_subquery, lit, scalar_subquery, sum}; - use crate::{expr, expr_fn::exists}; - use super::*; + use crate::logical_plan::StringifiedPlan; + use crate::{col, expr, expr_fn::exists, in_subquery, lit, scalar_subquery, sum}; use arrow::datatypes::{DataType, Field}; - use datafusion_common::{ - FunctionalDependence, OwnedTableReference, SchemaError, TableReference, - }; + use datafusion_common::{OwnedTableReference, SchemaError, TableReference}; #[test] fn plan_builder_simple() -> Result<()> { @@ -2051,21 +1991,4 @@ mod tests { Ok(()) } - - #[test] - fn test_get_updated_id_keys() { - let fund_dependencies = - FunctionalDependencies::new(vec![FunctionalDependence::new( - vec![1], - vec![0, 1, 2], - true, - )]); - let res = fund_dependencies.project_functional_dependencies(&[1, 2], 2); - let expected = FunctionalDependencies::new(vec![FunctionalDependence::new( - vec![0], - vec![0, 1], - true, - )]); - assert_eq!(res, expected); - } } diff --git a/datafusion/expr/src/logical_plan/plan.rs b/datafusion/expr/src/logical_plan/plan.rs index 1c526c7b4030..d62ac8926328 100644 --- a/datafusion/expr/src/logical_plan/plan.rs +++ b/datafusion/expr/src/logical_plan/plan.rs @@ -17,6 +17,13 @@ //! Logical plan types +use std::collections::{HashMap, HashSet}; +use std::fmt::{self, Debug, Display, Formatter}; +use std::hash::{Hash, Hasher}; +use std::sync::Arc; + +use super::dml::CopyTo; +use super::DdlStatement; use crate::dml::CopyOptions; use crate::expr::{Alias, Exists, InSubquery, Placeholder}; use crate::expr_rewriter::create_col_from_scalar_expr; @@ -28,15 +35,11 @@ use crate::utils::{ grouping_set_expr_count, grouping_set_to_exprlist, inspect_expr_pre, }; use crate::{ - build_join_schema, Expr, ExprSchemable, TableProviderFilterPushDown, TableSource, -}; -use crate::{ - expr_vec_fmt, BinaryExpr, CreateMemoryTable, CreateView, LogicalPlanBuilder, Operator, + build_join_schema, expr_vec_fmt, BinaryExpr, CreateMemoryTable, CreateView, Expr, + ExprSchemable, LogicalPlanBuilder, Operator, TableProviderFilterPushDown, + TableSource, }; -use super::dml::CopyTo; -use super::DdlStatement; - use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; use datafusion_common::tree_node::{ RewriteRecursion, Transformed, TreeNode, TreeNodeRewriter, TreeNodeVisitor, @@ -51,11 +54,6 @@ use datafusion_common::{ pub use datafusion_common::display::{PlanType, StringifiedPlan, ToStringifiedPlan}; pub use datafusion_common::{JoinConstraint, JoinType}; -use std::collections::{HashMap, HashSet}; -use std::fmt::{self, Debug, Display, Formatter}; -use std::hash::{Hash, Hasher}; -use std::sync::Arc; - /// A LogicalPlan represents the different types of relational /// operators (such as Projection, Filter, etc) and can be created by /// the SQL query planner and the DataFrame API. @@ -531,11 +529,11 @@ impl LogicalPlan { // so we don't need to recompute Schema. 
match &self { LogicalPlan::Projection(projection) => { - Ok(LogicalPlan::Projection(Projection::try_new_with_schema( - projection.expr.to_vec(), - Arc::new(inputs[0].clone()), - projection.schema.clone(), - )?)) + // Schema of the projection may change + // when its input changes. Hence we should use + // `try_new` method instead of `try_new_with_schema`. + Projection::try_new(projection.expr.to_vec(), Arc::new(inputs[0].clone())) + .map(LogicalPlan::Projection) } LogicalPlan::Window(Window { window_expr, @@ -549,14 +547,16 @@ impl LogicalPlan { LogicalPlan::Aggregate(Aggregate { group_expr, aggr_expr, - schema, .. - }) => Ok(LogicalPlan::Aggregate(Aggregate::try_new_with_schema( + }) => Aggregate::try_new( + // Schema of the aggregate may change + // when its input changes. Hence we should use + // `try_new` method instead of `try_new_with_schema`. Arc::new(inputs[0].clone()), group_expr.to_vec(), aggr_expr.to_vec(), - schema.clone(), - )?)), + ) + .map(LogicalPlan::Aggregate), _ => self.with_new_exprs(self.expressions(), inputs), } } @@ -590,12 +590,11 @@ impl LogicalPlan { inputs: &[LogicalPlan], ) -> Result { match self { - LogicalPlan::Projection(Projection { schema, .. }) => { - Ok(LogicalPlan::Projection(Projection::try_new_with_schema( - expr, - Arc::new(inputs[0].clone()), - schema.clone(), - )?)) + // Since expr may be different than the previous expr, schema of the projection + // may change. We need to use try_new method instead of try_new_with_schema method. + LogicalPlan::Projection(Projection { .. }) => { + Projection::try_new(expr, Arc::new(inputs[0].clone())) + .map(LogicalPlan::Projection) } LogicalPlan::Dml(DmlStatement { table_name, @@ -672,10 +671,8 @@ impl LogicalPlan { let mut remove_aliases = RemoveAliases {}; let predicate = predicate.rewrite(&mut remove_aliases)?; - Ok(LogicalPlan::Filter(Filter::try_new( - predicate, - Arc::new(inputs[0].clone()), - )?)) + Filter::try_new(predicate, Arc::new(inputs[0].clone())) + .map(LogicalPlan::Filter) } LogicalPlan::Repartition(Repartition { partitioning_scheme, @@ -710,18 +707,12 @@ impl LogicalPlan { schema: schema.clone(), })) } - LogicalPlan::Aggregate(Aggregate { - group_expr, schema, .. - }) => { + LogicalPlan::Aggregate(Aggregate { group_expr, .. }) => { // group exprs are the first expressions let agg_expr = expr.split_off(group_expr.len()); - Ok(LogicalPlan::Aggregate(Aggregate::try_new_with_schema( - Arc::new(inputs[0].clone()), - expr, - agg_expr, - schema.clone(), - )?)) + Aggregate::try_new(Arc::new(inputs[0].clone()), expr, agg_expr) + .map(LogicalPlan::Aggregate) } LogicalPlan::Sort(Sort { fetch, .. }) => Ok(LogicalPlan::Sort(Sort { expr, @@ -790,10 +781,8 @@ impl LogicalPlan { })) } LogicalPlan::SubqueryAlias(SubqueryAlias { alias, .. }) => { - Ok(LogicalPlan::SubqueryAlias(SubqueryAlias::try_new( - inputs[0].clone(), - alias.clone(), - )?)) + SubqueryAlias::try_new(inputs[0].clone(), alias.clone()) + .map(LogicalPlan::SubqueryAlias) } LogicalPlan::Limit(Limit { skip, fetch, .. }) => { Ok(LogicalPlan::Limit(Limit { @@ -1953,6 +1942,63 @@ impl Hash for TableScan { } } +impl TableScan { + /// Initialize TableScan with appropriate schema from the given + /// arguments. 
+ pub fn try_new( + table_name: impl Into, + table_source: Arc, + projection: Option>, + filters: Vec, + fetch: Option, + ) -> Result { + let table_name = table_name.into(); + + if table_name.table().is_empty() { + return plan_err!("table_name cannot be empty"); + } + let schema = table_source.schema(); + let func_dependencies = FunctionalDependencies::new_from_constraints( + table_source.constraints(), + schema.fields.len(), + ); + let projected_schema = projection + .as_ref() + .map(|p| { + let projected_func_dependencies = + func_dependencies.project_functional_dependencies(p, p.len()); + DFSchema::new_with_metadata( + p.iter() + .map(|i| { + DFField::from_qualified( + table_name.clone(), + schema.field(*i).clone(), + ) + }) + .collect(), + schema.metadata().clone(), + ) + .map(|df_schema| { + df_schema.with_functional_dependencies(projected_func_dependencies) + }) + }) + .unwrap_or_else(|| { + DFSchema::try_from_qualified_schema(table_name.clone(), &schema).map( + |df_schema| df_schema.with_functional_dependencies(func_dependencies), + ) + })?; + let projected_schema = Arc::new(projected_schema); + Ok(Self { + table_name, + source: table_source, + projection, + projected_schema, + filters, + fetch, + }) + } +} + /// Apply Cross Join to two logical plans #[derive(Clone, PartialEq, Eq, Hash)] pub struct CrossJoin { diff --git a/datafusion/expr/src/tree_node/expr.rs b/datafusion/expr/src/tree_node/expr.rs index f74cc164a7a5..764dcffbced9 100644 --- a/datafusion/expr/src/tree_node/expr.rs +++ b/datafusion/expr/src/tree_node/expr.rs @@ -22,9 +22,10 @@ use crate::expr::{ GetIndexedField, GroupingSet, InList, InSubquery, Like, Placeholder, ScalarFunction, ScalarUDF, Sort, TryCast, WindowFunction, }; -use crate::Expr; -use datafusion_common::tree_node::VisitRecursion; -use datafusion_common::{tree_node::TreeNode, Result}; +use crate::{Expr, GetFieldAccess}; + +use datafusion_common::tree_node::{TreeNode, VisitRecursion}; +use datafusion_common::Result; impl TreeNode for Expr { fn apply_children(&self, op: &mut F) -> Result @@ -47,8 +48,19 @@ impl TreeNode for Expr { | Expr::TryCast(TryCast { expr, .. }) | Expr::Sort(Sort { expr, .. }) | Expr::InSubquery(InSubquery{ expr, .. }) => vec![expr.as_ref().clone()], - Expr::GetIndexedField(GetIndexedField { expr, .. }) => { - vec![expr.as_ref().clone()] + Expr::GetIndexedField(GetIndexedField { expr, field }) => { + let expr = expr.as_ref().clone(); + match field { + GetFieldAccess::ListIndex {key} => { + vec![key.as_ref().clone(), expr] + }, + GetFieldAccess::ListRange {start, stop} => { + vec![start.as_ref().clone(), stop.as_ref().clone(), expr] + } + GetFieldAccess::NamedStructField {name: _name} => { + vec![expr] + } + } } Expr::GroupingSet(GroupingSet::Rollup(exprs)) | Expr::GroupingSet(GroupingSet::Cube(exprs)) => exprs.clone(), diff --git a/datafusion/expr/src/type_coercion/binary.rs b/datafusion/expr/src/type_coercion/binary.rs index a854373e880d..cf93d15e23f0 100644 --- a/datafusion/expr/src/type_coercion/binary.rs +++ b/datafusion/expr/src/type_coercion/binary.rs @@ -17,17 +17,20 @@ //! 
Coercion rules for matching argument types for binary operators +use std::sync::Arc; + +use crate::Operator; + use arrow::array::{new_empty_array, Array}; use arrow::compute::can_cast_types; use arrow::datatypes::{ - DataType, TimeUnit, DECIMAL128_MAX_PRECISION, DECIMAL128_MAX_SCALE, + DataType, Field, TimeUnit, DECIMAL128_MAX_PRECISION, DECIMAL128_MAX_SCALE, DECIMAL256_MAX_PRECISION, DECIMAL256_MAX_SCALE, }; -use datafusion_common::{plan_datafusion_err, Result}; -use datafusion_common::{plan_err, DataFusionError}; - -use crate::Operator; +use datafusion_common::{ + exec_datafusion_err, plan_datafusion_err, plan_err, DataFusionError, Result, +}; /// The type signature of an instantiation of binary operator expression such as /// `lhs + rhs` @@ -65,83 +68,75 @@ impl Signature { /// Returns a [`Signature`] for applying `op` to arguments of type `lhs` and `rhs` fn signature(lhs: &DataType, op: &Operator, rhs: &DataType) -> Result { + use arrow::datatypes::DataType::*; + use Operator::*; match op { - Operator::Eq | - Operator::NotEq | - Operator::Lt | - Operator::LtEq | - Operator::Gt | - Operator::GtEq | - Operator::IsDistinctFrom | - Operator::IsNotDistinctFrom => { + Eq | + NotEq | + Lt | + LtEq | + Gt | + GtEq | + IsDistinctFrom | + IsNotDistinctFrom => { comparison_coercion(lhs, rhs).map(Signature::comparison).ok_or_else(|| { plan_datafusion_err!( "Cannot infer common argument type for comparison operation {lhs} {op} {rhs}" ) }) } - Operator::And | Operator::Or => match (lhs, rhs) { - // logical binary boolean operators can only be evaluated in bools or nulls - (DataType::Boolean, DataType::Boolean) - | (DataType::Null, DataType::Null) - | (DataType::Boolean, DataType::Null) - | (DataType::Null, DataType::Boolean) => Ok(Signature::uniform(DataType::Boolean)), - _ => plan_err!( + And | Or => if matches!((lhs, rhs), (Boolean | Null, Boolean | Null)) { + // Logical binary boolean operators can only be evaluated for + // boolean or null arguments. + Ok(Signature::uniform(DataType::Boolean)) + } else { + plan_err!( "Cannot infer common argument type for logical boolean operation {lhs} {op} {rhs}" - ), - }, - Operator::RegexMatch | - Operator::RegexIMatch | - Operator::RegexNotMatch | - Operator::RegexNotIMatch => { + ) + } + RegexMatch | RegexIMatch | RegexNotMatch | RegexNotIMatch => { regex_coercion(lhs, rhs).map(Signature::comparison).ok_or_else(|| { plan_datafusion_err!( "Cannot infer common argument type for regex operation {lhs} {op} {rhs}" ) }) } - Operator::BitwiseAnd - | Operator::BitwiseOr - | Operator::BitwiseXor - | Operator::BitwiseShiftRight - | Operator::BitwiseShiftLeft => { + BitwiseAnd | BitwiseOr | BitwiseXor | BitwiseShiftRight | BitwiseShiftLeft => { bitwise_coercion(lhs, rhs).map(Signature::uniform).ok_or_else(|| { plan_datafusion_err!( "Cannot infer common type for bitwise operation {lhs} {op} {rhs}" ) }) } - Operator::StringConcat => { + StringConcat => { string_concat_coercion(lhs, rhs).map(Signature::uniform).ok_or_else(|| { plan_datafusion_err!( "Cannot infer common string type for string concat operation {lhs} {op} {rhs}" ) }) } - Operator::AtArrow - | Operator::ArrowAt => { - array_coercion(lhs, rhs).map(Signature::uniform).ok_or_else(|| { + AtArrow | ArrowAt => { + // ArrowAt and AtArrow check for whether one array ic contained in another. + // The result type is boolean. Signature::comparison defines this signature. 
+ // Operation has nothing to do with comparison + array_coercion(lhs, rhs).map(Signature::comparison).ok_or_else(|| { plan_datafusion_err!( "Cannot infer common array type for arrow operation {lhs} {op} {rhs}" ) }) } - Operator::Plus | - Operator::Minus | - Operator::Multiply | - Operator::Divide| - Operator::Modulo => { + Plus | Minus | Multiply | Divide | Modulo => { let get_result = |lhs, rhs| { use arrow::compute::kernels::numeric::*; let l = new_empty_array(lhs); let r = new_empty_array(rhs); let result = match op { - Operator::Plus => add_wrapping(&l, &r), - Operator::Minus => sub_wrapping(&l, &r), - Operator::Multiply => mul_wrapping(&l, &r), - Operator::Divide => div(&l, &r), - Operator::Modulo => rem(&l, &r), + Plus => add_wrapping(&l, &r), + Minus => sub_wrapping(&l, &r), + Multiply => mul_wrapping(&l, &r), + Divide => div(&l, &r), + Modulo => rem(&l, &r), _ => unreachable!(), }; result.map(|x| x.data_type().clone()) @@ -228,7 +223,7 @@ fn math_decimal_coercion( (Null, dec_type @ Decimal128(_, _)) | (dec_type @ Decimal128(_, _), Null) => { Some((dec_type.clone(), dec_type.clone())) } - (Decimal128(_, _), Decimal128(_, _)) => { + (Decimal128(_, _), Decimal128(_, _)) | (Decimal256(_, _), Decimal256(_, _)) => { Some((lhs_type.clone(), rhs_type.clone())) } // Unlike with comparison we don't coerce to a decimal in the case of floating point @@ -239,9 +234,6 @@ fn math_decimal_coercion( (Int8 | Int16 | Int32 | Int64, Decimal128(_, _)) => { Some((coerce_numeric_type_to_decimal(lhs_type)?, rhs_type.clone())) } - (Decimal256(_, _), Decimal256(_, _)) => { - Some((lhs_type.clone(), rhs_type.clone())) - } (Decimal256(_, _), Int8 | Int16 | Int32 | Int64) => Some(( lhs_type.clone(), coerce_numeric_type_to_decimal256(rhs_type)?, @@ -473,6 +465,54 @@ fn get_wider_decimal_type( } } +/// Returns the wider type among arguments `lhs` and `rhs`. +/// The wider type is the type that can safely represent values from both types +/// without information loss. Returns an Error if types are incompatible. +pub fn get_wider_type(lhs: &DataType, rhs: &DataType) -> Result { + use arrow::datatypes::DataType::*; + Ok(match (lhs, rhs) { + (lhs, rhs) if lhs == rhs => lhs.clone(), + // Right UInt is larger than left UInt. + (UInt8, UInt16 | UInt32 | UInt64) | (UInt16, UInt32 | UInt64) | (UInt32, UInt64) | + // Right Int is larger than left Int. + (Int8, Int16 | Int32 | Int64) | (Int16, Int32 | Int64) | (Int32, Int64) | + // Right Float is larger than left Float. + (Float16, Float32 | Float64) | (Float32, Float64) | + // Right String is larger than left String. + (Utf8, LargeUtf8) | + // Any right type is wider than a left hand side Null. + (Null, _) => rhs.clone(), + // Left UInt is larger than right UInt. + (UInt16 | UInt32 | UInt64, UInt8) | (UInt32 | UInt64, UInt16) | (UInt64, UInt32) | + // Left Int is larger than right Int. + (Int16 | Int32 | Int64, Int8) | (Int32 | Int64, Int16) | (Int64, Int32) | + // Left Float is larger than right Float. + (Float32 | Float64, Float16) | (Float64, Float32) | + // Left String is larget than right String. + (LargeUtf8, Utf8) | + // Any left type is wider than a right hand side Null. + (_, Null) => lhs.clone(), + (List(lhs_field), List(rhs_field)) => { + let field_type = + get_wider_type(lhs_field.data_type(), rhs_field.data_type())?; + if lhs_field.name() != rhs_field.name() { + return Err(exec_datafusion_err!( + "There is no wider type that can represent both {lhs} and {rhs}." 
+ )); + } + assert_eq!(lhs_field.name(), rhs_field.name()); + let field_name = lhs_field.name(); + let nullable = lhs_field.is_nullable() | rhs_field.is_nullable(); + List(Arc::new(Field::new(field_name, field_type, nullable))) + } + (_, _) => { + return Err(exec_datafusion_err!( + "There is no wider type that can represent both {lhs} and {rhs}." + )); + } + }) +} + /// Convert the numeric data type to the decimal data type. /// Now, we just support the signed integer type and floating-point type. fn coerce_numeric_type_to_decimal(numeric_type: &DataType) -> Option { @@ -808,14 +848,11 @@ fn null_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option { #[cfg(test)] mod tests { - use arrow::datatypes::DataType; - - use datafusion_common::assert_contains; - use datafusion_common::Result; - + use super::*; use crate::Operator; - use super::*; + use arrow::datatypes::DataType; + use datafusion_common::{assert_contains, Result}; #[test] fn test_coercion_error() -> Result<()> { diff --git a/datafusion/optimizer/src/common_subexpr_eliminate.rs b/datafusion/optimizer/src/common_subexpr_eliminate.rs index c6b138f8ca36..68a6a5607a1d 100644 --- a/datafusion/optimizer/src/common_subexpr_eliminate.rs +++ b/datafusion/optimizer/src/common_subexpr_eliminate.rs @@ -20,6 +20,8 @@ use std::collections::{BTreeSet, HashMap}; use std::sync::Arc; +use crate::{utils, OptimizerConfig, OptimizerRule}; + use arrow::datatypes::DataType; use datafusion_common::tree_node::{ RewriteRecursion, TreeNode, TreeNodeRewriter, TreeNodeVisitor, VisitRecursion, @@ -28,13 +30,10 @@ use datafusion_common::{ internal_err, Column, DFField, DFSchema, DFSchemaRef, DataFusionError, Result, }; use datafusion_expr::expr::Alias; -use datafusion_expr::{ - col, - logical_plan::{Aggregate, Filter, LogicalPlan, Projection, Sort, Window}, - Expr, ExprSchemable, +use datafusion_expr::logical_plan::{ + Aggregate, Filter, LogicalPlan, Projection, Sort, Window, }; - -use crate::{utils, OptimizerConfig, OptimizerRule}; +use datafusion_expr::{col, Expr, ExprSchemable}; /// A map from expression's identifier to tuple including /// - the expression itself (cloned) @@ -111,12 +110,7 @@ impl CommonSubexprEliminate { projection: &Projection, config: &dyn OptimizerConfig, ) -> Result { - let Projection { - expr, - input, - schema, - .. - } = projection; + let Projection { expr, input, .. } = projection; let input_schema = Arc::clone(input.schema()); let mut expr_set = ExprSet::new(); let arrays = to_arrays(expr, input_schema, &mut expr_set, ExprMask::Normal)?; @@ -124,11 +118,9 @@ impl CommonSubexprEliminate { let (mut new_expr, new_input) = self.rewrite_expr(&[expr], &[&arrays], input, &expr_set, config)?; - Ok(LogicalPlan::Projection(Projection::try_new_with_schema( - pop_expr(&mut new_expr)?, - Arc::new(new_input), - schema.clone(), - )?)) + // Since projection expr changes, schema changes also. Use try_new method. + Projection::try_new(pop_expr(&mut new_expr)?, Arc::new(new_input)) + .map(LogicalPlan::Projection) } fn try_optimize_filter( @@ -201,7 +193,6 @@ impl CommonSubexprEliminate { group_expr, aggr_expr, input, - schema, .. } = aggregate; let mut expr_set = ExprSet::new(); @@ -247,12 +238,9 @@ impl CommonSubexprEliminate { let rewritten = pop_expr(&mut rewritten)?; if affected_id.is_empty() { - Ok(LogicalPlan::Aggregate(Aggregate::try_new_with_schema( - Arc::new(new_input), - new_group_expr, - new_aggr_expr, - schema.clone(), - )?)) + // Since group_epxr changes, schema changes also. Use try_new method. 
+ Aggregate::try_new(Arc::new(new_input), new_group_expr, new_aggr_expr) + .map(LogicalPlan::Aggregate) } else { let mut agg_exprs = vec![]; diff --git a/datafusion/optimizer/src/merge_projection.rs b/datafusion/optimizer/src/merge_projection.rs index 408055b8e7d4..ec040cba6fe4 100644 --- a/datafusion/optimizer/src/merge_projection.rs +++ b/datafusion/optimizer/src/merge_projection.rs @@ -15,14 +15,15 @@ // specific language governing permissions and limitations // under the License. -use crate::optimizer::ApplyOrder; -use datafusion_common::Result; -use datafusion_expr::{Expr, LogicalPlan, Projection}; use std::collections::HashMap; +use crate::optimizer::ApplyOrder; use crate::push_down_filter::replace_cols_by_name; use crate::{OptimizerConfig, OptimizerRule}; +use datafusion_common::Result; +use datafusion_expr::{Expr, LogicalPlan, Projection}; + /// Optimization rule that merge [LogicalPlan::Projection]. #[derive(Default)] pub struct MergeProjection; @@ -84,10 +85,10 @@ pub(super) fn merge_projection( Err(e) => Err(e), }) .collect::>>()?; - let new_plan = LogicalPlan::Projection(Projection::try_new_with_schema( + // Use try_new, since schema changes with changing expressions. + let new_plan = LogicalPlan::Projection(Projection::try_new( new_exprs, child_projection.input.clone(), - parent_projection.schema.clone(), )?); Ok(new_plan) } diff --git a/datafusion/optimizer/src/push_down_projection.rs b/datafusion/optimizer/src/push_down_projection.rs index e7fdaa8b0b5e..b05d811cb481 100644 --- a/datafusion/optimizer/src/push_down_projection.rs +++ b/datafusion/optimizer/src/push_down_projection.rs @@ -18,6 +18,9 @@ //! Projection Push Down optimizer rule ensures that only referenced columns are //! loaded into memory +use std::collections::{BTreeSet, HashMap, HashSet}; +use std::sync::Arc; + use crate::eliminate_project::can_eliminate; use crate::merge_projection::merge_projection; use crate::optimizer::ApplyOrder; @@ -26,20 +29,14 @@ use crate::{OptimizerConfig, OptimizerRule}; use arrow::error::Result as ArrowResult; use datafusion_common::ScalarValue::UInt8; use datafusion_common::{ - plan_err, Column, DFField, DFSchema, DFSchemaRef, DataFusionError, Result, ToDFSchema, + plan_err, Column, DFSchema, DFSchemaRef, DataFusionError, Result, }; use datafusion_expr::expr::{AggregateFunction, Alias}; -use datafusion_expr::utils::exprlist_to_fields; use datafusion_expr::{ logical_plan::{Aggregate, LogicalPlan, Projection, TableScan, Union}, - utils::{expr_to_columns, exprlist_to_columns}, + utils::{expr_to_columns, exprlist_to_columns, exprlist_to_fields}, Expr, LogicalPlanBuilder, SubqueryAlias, }; -use std::collections::HashMap; -use std::{ - collections::{BTreeSet, HashSet}, - sync::Arc, -}; // if projection is empty return projection-new_plan, else return new_plan. 
#[macro_export] @@ -501,24 +498,14 @@ fn push_down_scan( projection.into_iter().collect::>() }; - // create the projected schema - let projected_fields: Vec = projection - .iter() - .map(|i| { - DFField::from_qualified(scan.table_name.clone(), schema.fields()[*i].clone()) - }) - .collect(); - - let projected_schema = projected_fields.to_dfschema_ref()?; - - Ok(LogicalPlan::TableScan(TableScan { - table_name: scan.table_name.clone(), - source: scan.source.clone(), - projection: Some(projection), - projected_schema, - filters: scan.filters.clone(), - fetch: scan.fetch, - })) + TableScan::try_new( + scan.table_name.clone(), + scan.source.clone(), + Some(projection), + scan.filters.clone(), + scan.fetch, + ) + .map(LogicalPlan::TableScan) } fn restrict_outputs( @@ -538,25 +525,24 @@ fn restrict_outputs( #[cfg(test)] mod tests { + use std::collections::HashMap; + use std::vec; + use super::*; use crate::eliminate_project::EliminateProjection; use crate::optimizer::Optimizer; use crate::test::*; use crate::OptimizerContext; use arrow::datatypes::{DataType, Field, Schema}; - use datafusion_common::DFSchema; + use datafusion_common::{DFField, DFSchema}; use datafusion_expr::builder::table_scan_with_filters; - use datafusion_expr::expr; - use datafusion_expr::expr::Cast; - use datafusion_expr::WindowFrame; - use datafusion_expr::WindowFunction; + use datafusion_expr::expr::{self, Cast}; + use datafusion_expr::logical_plan::{ + builder::LogicalPlanBuilder, table_scan, JoinType, + }; use datafusion_expr::{ - col, count, lit, - logical_plan::{builder::LogicalPlanBuilder, table_scan, JoinType}, - max, min, AggregateFunction, Expr, + col, count, lit, max, min, AggregateFunction, Expr, WindowFrame, WindowFunction, }; - use std::collections::HashMap; - use std::vec; #[test] fn aggregate_no_group_by() -> Result<()> { diff --git a/datafusion/optimizer/src/replace_distinct_aggregate.rs b/datafusion/optimizer/src/replace_distinct_aggregate.rs index f58d4b159745..540617b77084 100644 --- a/datafusion/optimizer/src/replace_distinct_aggregate.rs +++ b/datafusion/optimizer/src/replace_distinct_aggregate.rs @@ -15,13 +15,12 @@ // specific language governing permissions and limitations // under the License. -use crate::optimizer::ApplyOrder; +use crate::optimizer::{ApplyOrder, ApplyOrder::BottomUp}; use crate::{OptimizerConfig, OptimizerRule}; + use datafusion_common::Result; use datafusion_expr::utils::expand_wildcard; -use datafusion_expr::Distinct; -use datafusion_expr::{Aggregate, LogicalPlan}; -use ApplyOrder::BottomUp; +use datafusion_expr::{Aggregate, Distinct, LogicalPlan}; /// Optimizer that replaces logical [[Distinct]] with a logical [[Aggregate]] /// @@ -54,11 +53,10 @@ impl OptimizerRule for ReplaceDistinctWithAggregate { match plan { LogicalPlan::Distinct(Distinct { input }) => { let group_expr = expand_wildcard(input.schema(), input, None)?; - let aggregate = LogicalPlan::Aggregate(Aggregate::try_new_with_schema( + let aggregate = LogicalPlan::Aggregate(Aggregate::try_new( input.clone(), group_expr, vec![], - input.schema().clone(), // input schema and aggregate schema are the same in this case )?); Ok(Some(aggregate)) } diff --git a/datafusion/optimizer/src/single_distinct_to_groupby.rs b/datafusion/optimizer/src/single_distinct_to_groupby.rs index a9e65b3e7c77..8e0f93cb5781 100644 --- a/datafusion/optimizer/src/single_distinct_to_groupby.rs +++ b/datafusion/optimizer/src/single_distinct_to_groupby.rs @@ -17,8 +17,11 @@ //! 
single distinct to group by optimizer rule +use std::sync::Arc; + use crate::optimizer::ApplyOrder; use crate::{OptimizerConfig, OptimizerRule}; + use datafusion_common::{DFSchema, Result}; use datafusion_expr::{ col, @@ -27,8 +30,8 @@ use datafusion_expr::{ utils::columnize_expr, Expr, ExprSchemable, }; + use hashbrown::HashSet; -use std::sync::Arc; /// single distinct to group by optimizer rule /// ```text @@ -102,24 +105,50 @@ impl OptimizerRule for SingleDistinctToGroupBy { .. }) => { if is_single_distinct_agg(plan)? && !contains_grouping_set(group_expr) { + let fields = schema.fields(); // alias all original group_by exprs - let mut group_expr_alias = Vec::with_capacity(group_expr.len()); - let mut inner_group_exprs = group_expr + let (mut inner_group_exprs, out_group_expr_with_alias): ( + Vec, + Vec<(Expr, Option)>, + ) = group_expr .iter() .enumerate() .map(|(i, group_expr)| { - let alias_str = format!("group_alias_{i}"); - let alias_expr = group_expr.clone().alias(&alias_str); - group_expr_alias - .push((alias_str, schema.fields()[i].clone())); - alias_expr + if let Expr::Column(_) = group_expr { + // For Column expressions we can use existing expression as is. + (group_expr.clone(), (group_expr.clone(), None)) + } else { + // For complex expression write is as alias, to be able to refer + // if from parent operators successfully. + // Consider plan below. + // + // Aggregate: groupBy=[[group_alias_0]], aggr=[[COUNT(alias1)]] [group_alias_0:Int32, COUNT(alias1):Int64;N]\ + // --Aggregate: groupBy=[[test.a + Int32(1) AS group_alias_0, test.c AS alias1]], aggr=[[]] [group_alias_0:Int32, alias1:UInt32]\ + // ----TableScan: test [a:UInt32, b:UInt32, c:UInt32] + // + // First aggregate(from bottom) refers to `test.a` column. + // Second aggregate refers to the `group_alias_0` column, Which is a valid field in the first aggregate. + // If we were to write plan above as below without alias + // + // Aggregate: groupBy=[[test.a + Int32(1)]], aggr=[[COUNT(alias1)]] [group_alias_0:Int32, COUNT(alias1):Int64;N]\ + // --Aggregate: groupBy=[[test.a + Int32(1), test.c AS alias1]], aggr=[[]] [group_alias_0:Int32, alias1:UInt32]\ + // ----TableScan: test [a:UInt32, b:UInt32, c:UInt32] + // + // Second aggregate refers to the `test.a + Int32(1)` expression However, its input do not have `test.a` expression in it. 
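+                            // Hence, the complex expression is written out under a generated alias in the
+                            // inner aggregate, so the outer aggregate can refer to it as a plain column of its input.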
+ let alias_str = format!("group_alias_{i}"); + let alias_expr = group_expr.clone().alias(&alias_str); + ( + alias_expr, + (col(alias_str), Some(fields[i].qualified_name())), + ) + } }) - .collect::>(); + .unzip(); // and they can be referenced by the alias in the outer aggr plan - let outer_group_exprs = group_expr_alias + let outer_group_exprs = out_group_expr_with_alias .iter() - .map(|(alias, _)| col(alias)) + .map(|(out_group_expr, _)| out_group_expr.clone()) .collect::>(); // replace the distinct arg with alias @@ -181,20 +210,22 @@ impl OptimizerRule for SingleDistinctToGroupBy { // this optimizer has two kinds of alias: // - group_by aggr // - aggr expr - let mut alias_expr: Vec = Vec::new(); - for (alias, original_field) in group_expr_alias { - alias_expr - .push(col(alias).alias(original_field.qualified_name())); - } - for (i, expr) in new_aggr_exprs.iter().enumerate() { - alias_expr.push(columnize_expr( - expr.clone().alias( - schema.clone().fields()[i + group_expr.len()] - .qualified_name(), - ), - &outer_aggr_schema, - )); - } + let group_size = group_expr.len(); + let alias_expr = out_group_expr_with_alias + .into_iter() + .map(|(group_expr, original_field)| { + if let Some(name) = original_field { + group_expr.alias(name) + } else { + group_expr + } + }) + .chain(new_aggr_exprs.iter().enumerate().map(|(idx, expr)| { + let idx = idx + group_size; + let name = fields[idx].qualified_name(); + columnize_expr(expr.clone().alias(name), &outer_aggr_schema) + })) + .collect(); let outer_aggr = LogicalPlan::Aggregate(Aggregate::try_new( Arc::new(inner_agg), @@ -202,13 +233,10 @@ impl OptimizerRule for SingleDistinctToGroupBy { new_aggr_exprs, )?); - Ok(Some(LogicalPlan::Projection( - Projection::try_new_with_schema( - alias_expr, - Arc::new(outer_aggr), - schema.clone(), - )?, - ))) + Ok(Some(LogicalPlan::Projection(Projection::try_new( + alias_expr, + Arc::new(outer_aggr), + )?))) } else { Ok(None) } @@ -362,9 +390,9 @@ mod tests { .build()?; // Should work - let expected = "Projection: group_alias_0 AS test.a, COUNT(alias1) AS COUNT(DISTINCT test.b) [a:UInt32, COUNT(DISTINCT test.b):Int64;N]\ - \n Aggregate: groupBy=[[group_alias_0]], aggr=[[COUNT(alias1)]] [group_alias_0:UInt32, COUNT(alias1):Int64;N]\ - \n Aggregate: groupBy=[[test.a AS group_alias_0, test.b AS alias1]], aggr=[[]] [group_alias_0:UInt32, alias1:UInt32]\ + let expected = "Projection: test.a, COUNT(alias1) AS COUNT(DISTINCT test.b) [a:UInt32, COUNT(DISTINCT test.b):Int64;N]\ + \n Aggregate: groupBy=[[test.a]], aggr=[[COUNT(alias1)]] [a:UInt32, COUNT(alias1):Int64;N]\ + \n Aggregate: groupBy=[[test.a, test.b AS alias1]], aggr=[[]] [a:UInt32, alias1:UInt32]\ \n TableScan: test [a:UInt32, b:UInt32, c:UInt32]"; assert_optimized_plan_equal(&plan, expected) @@ -408,9 +436,9 @@ mod tests { )? 
.build()?; // Should work - let expected = "Projection: group_alias_0 AS test.a, COUNT(alias1) AS COUNT(DISTINCT test.b), MAX(alias1) AS MAX(DISTINCT test.b) [a:UInt32, COUNT(DISTINCT test.b):Int64;N, MAX(DISTINCT test.b):UInt32;N]\ - \n Aggregate: groupBy=[[group_alias_0]], aggr=[[COUNT(alias1), MAX(alias1)]] [group_alias_0:UInt32, COUNT(alias1):Int64;N, MAX(alias1):UInt32;N]\ - \n Aggregate: groupBy=[[test.a AS group_alias_0, test.b AS alias1]], aggr=[[]] [group_alias_0:UInt32, alias1:UInt32]\ + let expected = "Projection: test.a, COUNT(alias1) AS COUNT(DISTINCT test.b), MAX(alias1) AS MAX(DISTINCT test.b) [a:UInt32, COUNT(DISTINCT test.b):Int64;N, MAX(DISTINCT test.b):UInt32;N]\ + \n Aggregate: groupBy=[[test.a]], aggr=[[COUNT(alias1), MAX(alias1)]] [a:UInt32, COUNT(alias1):Int64;N, MAX(alias1):UInt32;N]\ + \n Aggregate: groupBy=[[test.a, test.b AS alias1]], aggr=[[]] [a:UInt32, alias1:UInt32]\ \n TableScan: test [a:UInt32, b:UInt32, c:UInt32]"; assert_optimized_plan_equal(&plan, expected) diff --git a/datafusion/physical-expr/src/array_expressions.rs b/datafusion/physical-expr/src/array_expressions.rs index 067a4cfdffc0..af4612272676 100644 --- a/datafusion/physical-expr/src/array_expressions.rs +++ b/datafusion/physical-expr/src/array_expressions.rs @@ -17,18 +17,22 @@ //! Array expressions +use std::any::type_name; +use std::sync::Arc; + use arrow::array::*; use arrow::buffer::OffsetBuffer; use arrow::compute; use arrow::datatypes::{DataType, Field, UInt64Type}; use arrow_buffer::NullBuffer; -use core::any::type_name; use datafusion_common::cast::{as_generic_string_array, as_int64_array, as_list_array}; -use datafusion_common::{exec_err, internal_err, not_impl_err, plan_err, ScalarValue}; -use datafusion_common::{DataFusionError, Result}; +use datafusion_common::utils::wrap_into_list_array; +use datafusion_common::{ + exec_err, internal_err, not_impl_err, plan_err, DataFusionError, Result, +}; use datafusion_expr::ColumnarValue; + use itertools::Itertools; -use std::sync::Arc; macro_rules! downcast_arg { ($ARG:expr, $ARRAY_TYPE:ident) => {{ @@ -400,34 +404,26 @@ fn array(values: &[ColumnarValue]) -> Result { .iter() .map(|x| match x { ColumnarValue::Array(array) => array.clone(), - ColumnarValue::Scalar(scalar) => scalar.to_array().clone(), + ColumnarValue::Scalar(scalar) => scalar.to_array(), }) .collect(); - let mut data_type = None; + let mut data_type = DataType::Null; for arg in &arrays { let arg_data_type = arg.data_type(); if !arg_data_type.equals_datatype(&DataType::Null) { - data_type = Some(arg_data_type.clone()); + data_type = arg_data_type.clone(); break; - } else { - data_type = Some(DataType::Null); } } match data_type { - // empty array - None => { - let list_arr = ScalarValue::new_list(&[], &DataType::Null); - Ok(Arc::new(list_arr)) - } - // all nulls, set default data type as int32 - Some(DataType::Null) => { - let null_arr = vec![ScalarValue::Int32(None); arrays.len()]; - let list_arr = ScalarValue::new_list(null_arr.as_slice(), &DataType::Int32); - Ok(Arc::new(list_arr)) + // Either an empty array or all nulls: + DataType::Null => { + let array = new_null_array(&DataType::Null, arrays.len()); + Ok(Arc::new(wrap_into_list_array(array))) } - Some(data_type) => Ok(array_array(arrays.as_slice(), data_type)?), + data_type => array_array(arrays.as_slice(), data_type), } } @@ -988,7 +984,8 @@ macro_rules! 
general_repeat_list { /// Array_empty SQL function pub fn array_empty(args: &[ArrayRef]) -> Result { if args[0].as_any().downcast_ref::().is_some() { - return Ok(args[0].clone()); + // Make sure to return Boolean type. + return Ok(Arc::new(BooleanArray::new_null(args[0].len()))); } let array = as_list_array(&args[0])?; diff --git a/datafusion/sqllogictest/test_files/array.slt b/datafusion/sqllogictest/test_files/array.slt index f11bc5206eb4..621cb4a8f4c0 100644 --- a/datafusion/sqllogictest/test_files/array.slt +++ b/datafusion/sqllogictest/test_files/array.slt @@ -2322,7 +2322,7 @@ select 1 || make_array(2, 3, 4), 1.0 || make_array(2.0, 3.0, 4.0), 'h' || make_a ## array containment operator # array containment operator with scalars #1 (at arrow) -query ??????? +query BBBBBBB select make_array(1,2,3) @> make_array(1,3), make_array(1,2,3) @> make_array(1,4), make_array([1,2], [3,4]) @> make_array([1,2]), @@ -2334,7 +2334,7 @@ select make_array(1,2,3) @> make_array(1,3), true false true false false false true # array containment operator with scalars #2 (arrow at) -query ??????? +query BBBBBBB select make_array(1,3) <@ make_array(1,2,3), make_array(1,4) <@ make_array(1,2,3), make_array([1,2]) <@ make_array([1,2], [3,4]), @@ -2465,7 +2465,7 @@ true query B select empty(make_array(NULL)); ---- -true +false # empty scalar function #4 query B diff --git a/datafusion/sqllogictest/test_files/tpch/q16.slt.part b/datafusion/sqllogictest/test_files/tpch/q16.slt.part index fb9d98b76fe3..b93872929fe5 100644 --- a/datafusion/sqllogictest/test_files/tpch/q16.slt.part +++ b/datafusion/sqllogictest/test_files/tpch/q16.slt.part @@ -52,9 +52,9 @@ limit 10; logical_plan Limit: skip=0, fetch=10 --Sort: supplier_cnt DESC NULLS FIRST, part.p_brand ASC NULLS LAST, part.p_type ASC NULLS LAST, part.p_size ASC NULLS LAST, fetch=10 -----Projection: group_alias_0 AS part.p_brand, group_alias_1 AS part.p_type, group_alias_2 AS part.p_size, COUNT(alias1) AS supplier_cnt -------Aggregate: groupBy=[[group_alias_0, group_alias_1, group_alias_2]], aggr=[[COUNT(alias1)]] ---------Aggregate: groupBy=[[part.p_brand AS group_alias_0, part.p_type AS group_alias_1, part.p_size AS group_alias_2, partsupp.ps_suppkey AS alias1]], aggr=[[]] +----Projection: part.p_brand, part.p_type, part.p_size, COUNT(alias1) AS supplier_cnt +------Aggregate: groupBy=[[part.p_brand, part.p_type, part.p_size]], aggr=[[COUNT(alias1)]] +--------Aggregate: groupBy=[[part.p_brand, part.p_type, part.p_size, partsupp.ps_suppkey AS alias1]], aggr=[[]] ----------LeftAnti Join: partsupp.ps_suppkey = __correlated_sq_1.s_suppkey ------------Projection: partsupp.ps_suppkey, part.p_brand, part.p_type, part.p_size --------------Inner Join: partsupp.ps_partkey = part.p_partkey @@ -69,15 +69,15 @@ physical_plan GlobalLimitExec: skip=0, fetch=10 --SortPreservingMergeExec: [supplier_cnt@3 DESC,p_brand@0 ASC NULLS LAST,p_type@1 ASC NULLS LAST,p_size@2 ASC NULLS LAST], fetch=10 ----SortExec: TopK(fetch=10), expr=[supplier_cnt@3 DESC,p_brand@0 ASC NULLS LAST,p_type@1 ASC NULLS LAST,p_size@2 ASC NULLS LAST] -------ProjectionExec: expr=[group_alias_0@0 as part.p_brand, group_alias_1@1 as part.p_type, group_alias_2@2 as part.p_size, COUNT(alias1)@3 as supplier_cnt] ---------AggregateExec: mode=FinalPartitioned, gby=[group_alias_0@0 as group_alias_0, group_alias_1@1 as group_alias_1, group_alias_2@2 as group_alias_2], aggr=[COUNT(alias1)] +------ProjectionExec: expr=[p_brand@0 as p_brand, p_type@1 as p_type, p_size@2 as p_size, COUNT(alias1)@3 as supplier_cnt] 
+--------AggregateExec: mode=FinalPartitioned, gby=[p_brand@0 as p_brand, p_type@1 as p_type, p_size@2 as p_size], aggr=[COUNT(alias1)] ----------CoalesceBatchesExec: target_batch_size=8192 -------------RepartitionExec: partitioning=Hash([group_alias_0@0, group_alias_1@1, group_alias_2@2], 4), input_partitions=4 ---------------AggregateExec: mode=Partial, gby=[group_alias_0@0 as group_alias_0, group_alias_1@1 as group_alias_1, group_alias_2@2 as group_alias_2], aggr=[COUNT(alias1)] -----------------AggregateExec: mode=FinalPartitioned, gby=[group_alias_0@0 as group_alias_0, group_alias_1@1 as group_alias_1, group_alias_2@2 as group_alias_2, alias1@3 as alias1], aggr=[] +------------RepartitionExec: partitioning=Hash([p_brand@0, p_type@1, p_size@2], 4), input_partitions=4 +--------------AggregateExec: mode=Partial, gby=[p_brand@0 as p_brand, p_type@1 as p_type, p_size@2 as p_size], aggr=[COUNT(alias1)] +----------------AggregateExec: mode=FinalPartitioned, gby=[p_brand@0 as p_brand, p_type@1 as p_type, p_size@2 as p_size, alias1@3 as alias1], aggr=[] ------------------CoalesceBatchesExec: target_batch_size=8192 ---------------------RepartitionExec: partitioning=Hash([group_alias_0@0, group_alias_1@1, group_alias_2@2, alias1@3], 4), input_partitions=4 -----------------------AggregateExec: mode=Partial, gby=[p_brand@1 as group_alias_0, p_type@2 as group_alias_1, p_size@3 as group_alias_2, ps_suppkey@0 as alias1], aggr=[] +--------------------RepartitionExec: partitioning=Hash([p_brand@0, p_type@1, p_size@2, alias1@3], 4), input_partitions=4 +----------------------AggregateExec: mode=Partial, gby=[p_brand@1 as p_brand, p_type@2 as p_type, p_size@3 as p_size, ps_suppkey@0 as alias1], aggr=[] ------------------------CoalesceBatchesExec: target_batch_size=8192 --------------------------HashJoinExec: mode=Partitioned, join_type=LeftAnti, on=[(ps_suppkey@0, s_suppkey@0)] ----------------------------CoalesceBatchesExec: target_batch_size=8192 From 148f890e212328399687cadac8bee022d829785d Mon Sep 17 00:00:00 2001 From: Devin D'Angelo Date: Wed, 25 Oct 2023 12:50:03 -0400 Subject: [PATCH 10/32] Parallelize Serialization of Columns within Parquet RowGroups (#7655) * merge main * fixes and cmt * review comments, tuning parameters, updating docs * cargo fmt * reduce default buffer size to 2 and update docs --- datafusion/common/src/config.rs | 32 +- .../src/datasource/file_format/parquet.rs | 541 +++++++++++------- .../test_files/information_schema.slt | 8 +- docs/source/user-guide/configs.md | 136 ++--- 4 files changed, 439 insertions(+), 278 deletions(-) diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs index 6aefa4e05de2..403241fcce58 100644 --- a/datafusion/common/src/config.rs +++ b/datafusion/common/src/config.rs @@ -377,12 +377,32 @@ config_namespace! { pub bloom_filter_ndv: Option, default = None /// Controls whether DataFusion will attempt to speed up writing - /// large parquet files by first writing multiple smaller files - /// and then stitching them together into a single large file. - /// This will result in faster write speeds, but higher memory usage. - /// Also currently unsupported are bloom filters and column indexes - /// when single_file_parallelism is enabled. - pub allow_single_file_parallelism: bool, default = false + /// parquet files by serializing them in parallel. Each column + /// in each row group in each output file are serialized in parallel + /// leveraging a maximum possible core count of n_files*n_row_groups*n_columns. 
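+ /// See maximum_parallel_row_group_writers and maximum_buffered_record_batches_per_stream
+ /// below to trade additional memory for faster writes.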
+ pub allow_single_file_parallelism: bool, default = true + + /// By default parallel parquet writer is tuned for minimum + /// memory usage in a streaming execution plan. You may see + /// a performance benefit when writing large parquet files + /// by increasing maximum_parallel_row_group_writers and + /// maximum_buffered_record_batches_per_stream if your system + /// has idle cores and can tolerate additional memory usage. + /// Boosting these values is likely worthwhile when + /// writing out already in-memory data, such as from a cached + /// data frame. + pub maximum_parallel_row_group_writers: usize, default = 1 + + /// By default parallel parquet writer is tuned for minimum + /// memory usage in a streaming execution plan. You may see + /// a performance benefit when writing large parquet files + /// by increasing maximum_parallel_row_group_writers and + /// maximum_buffered_record_batches_per_stream if your system + /// has idle cores and can tolerate additional memory usage. + /// Boosting these values is likely worthwhile when + /// writing out already in-memory data, such as from a cached + /// data frame. + pub maximum_buffered_record_batches_per_stream: usize, default = 2 } } diff --git a/datafusion/core/src/datasource/file_format/parquet.rs b/datafusion/core/src/datasource/file_format/parquet.rs index a16db9d43213..62867c0e2b38 100644 --- a/datafusion/core/src/datasource/file_format/parquet.rs +++ b/datafusion/core/src/datasource/file_format/parquet.rs @@ -17,11 +17,42 @@ //! Parquet format abstractions +use arrow_array::RecordBatch; +use async_trait::async_trait; +use datafusion_common::stats::Precision; +use datafusion_physical_plan::metrics::MetricsSet; +use parquet::arrow::arrow_writer::{ + compute_leaves, get_column_writers, ArrowColumnChunk, ArrowColumnWriter, + ArrowLeafColumn, +}; +use parquet::file::writer::SerializedFileWriter; use std::any::Any; use std::fmt; use std::fmt::Debug; use std::io::Write; use std::sync::Arc; +use tokio::io::{AsyncWrite, AsyncWriteExt}; +use tokio::sync::mpsc::{self, Receiver, Sender}; +use tokio::task::{JoinHandle, JoinSet}; + +use crate::datasource::file_format::file_compression_type::FileCompressionType; +use crate::datasource::statistics::create_max_min_accs; +use arrow::datatypes::SchemaRef; +use arrow::datatypes::{Fields, Schema}; +use bytes::{BufMut, BytesMut}; +use datafusion_common::{exec_err, not_impl_err, plan_err, DataFusionError, FileType}; +use datafusion_execution::TaskContext; +use datafusion_physical_expr::{PhysicalExpr, PhysicalSortRequirement}; +use futures::{StreamExt, TryStreamExt}; +use hashbrown::HashMap; +use object_store::{ObjectMeta, ObjectStore}; +use parquet::arrow::{ + arrow_to_parquet_schema, parquet_to_arrow_schema, AsyncArrowWriter, +}; +use parquet::file::footer::{decode_footer, decode_metadata}; +use parquet::file::metadata::ParquetMetaData; +use parquet::file::properties::WriterProperties; +use parquet::file::statistics::Statistics as ParquetStatistics; use super::write::demux::start_demuxer_task; use super::write::{create_writer, AbortableWrite, FileWriterMode}; @@ -32,12 +63,11 @@ use crate::arrow::array::{ use crate::arrow::datatypes::DataType; use crate::config::ConfigOptions; -use crate::datasource::file_format::file_compression_type::FileCompressionType; use crate::datasource::get_col_stats; use crate::datasource::physical_plan::{ FileGroupDisplay, FileMeta, FileSinkConfig, ParquetExec, SchemaAdapter, }; -use crate::datasource::statistics::create_max_min_accs; + use crate::error::Result; use 
crate::execution::context::SessionState; use crate::physical_plan::expressions::{MaxAccumulator, MinAccumulator}; @@ -47,29 +77,6 @@ use crate::physical_plan::{ Statistics, }; -use arrow::datatypes::{Fields, Schema, SchemaRef}; -use datafusion_common::stats::Precision; -use datafusion_common::{exec_err, not_impl_err, plan_err, DataFusionError, FileType}; -use datafusion_execution::TaskContext; -use datafusion_physical_expr::{PhysicalExpr, PhysicalSortRequirement}; -use datafusion_physical_plan::metrics::MetricsSet; - -use async_trait::async_trait; -use bytes::{BufMut, BytesMut}; -use futures::{StreamExt, TryStreamExt}; -use hashbrown::HashMap; -use object_store::{ObjectMeta, ObjectStore}; -use parquet::arrow::{parquet_to_arrow_schema, AsyncArrowWriter}; -use parquet::column::writer::ColumnCloseResult; -use parquet::file::footer::{decode_footer, decode_metadata}; -use parquet::file::metadata::ParquetMetaData; -use parquet::file::properties::WriterProperties; -use parquet::file::statistics::Statistics as ParquetStatistics; -use parquet::file::writer::SerializedFileWriter; -use tokio::io::{AsyncWrite, AsyncWriteExt}; -use tokio::sync::mpsc::{UnboundedReceiver, UnboundedSender}; -use tokio::task::{JoinHandle, JoinSet}; - /// The Apache Parquet `FileFormat` implementation /// /// Note it is recommended these are instead configured on the [`ConfigOptions`] @@ -668,37 +675,6 @@ impl ParquetSink { } } } - - /// Creates an object store writer for each output partition - /// This is used when parallelizing individual parquet file writes. - async fn create_object_store_writers( - &self, - num_partitions: usize, - object_store: Arc, - ) -> Result>>> { - let mut writers = Vec::new(); - - for _ in 0..num_partitions { - let file_path = self.config.table_paths[0].prefix(); - let object_meta = ObjectMeta { - location: file_path.clone(), - last_modified: chrono::offset::Utc::now(), - size: 0, - e_tag: None, - }; - writers.push( - create_writer( - FileWriterMode::PutMultipart, - FileCompressionType::UNCOMPRESSED, - object_meta.into(), - object_store.clone(), - ) - .await?, - ); - } - - Ok(writers) - } } #[async_trait] @@ -726,29 +702,8 @@ impl DataSink for ParquetSink { .runtime_env() .object_store(&self.config.object_store_url)?; - let exec_options = &context.session_config().options().execution; - - let allow_single_file_parallelism = - exec_options.parquet.allow_single_file_parallelism; - - // This is a temporary special case until https://github.com/apache/arrow-datafusion/pull/7655 - // can be pulled in. - if allow_single_file_parallelism && self.config.single_file_output { - let object_store_writer = self - .create_object_store_writers(1, object_store) - .await? 
- .remove(0); - - let schema_clone = self.config.output_schema.clone(); - return output_single_parquet_file_parallelized( - object_store_writer, - vec![data], - schema_clone, - parquet_props, - ) - .await - .map(|r| r as u64); - } + let parquet_opts = &context.session_config().options().execution.parquet; + let allow_single_file_parallelism = parquet_opts.allow_single_file_parallelism; let part_col = if !self.config.table_partition_cols.is_empty() { Some(self.config.table_partition_cols.clone()) @@ -756,6 +711,12 @@ impl DataSink for ParquetSink { None }; + let parallel_options = ParallelParquetWriterOptions { + max_parallel_row_groups: parquet_opts.maximum_parallel_row_group_writers, + max_buffered_record_batches_per_stream: parquet_opts + .maximum_buffered_record_batches_per_stream, + }; + let (demux_task, mut file_stream_rx) = start_demuxer_task( data, context, @@ -768,8 +729,35 @@ impl DataSink for ParquetSink { let mut file_write_tasks: JoinSet> = JoinSet::new(); while let Some((path, mut rx)) = file_stream_rx.recv().await { - let mut writer = self - .create_async_arrow_writer( + if !allow_single_file_parallelism { + let mut writer = self + .create_async_arrow_writer( + ObjectMeta { + location: path, + last_modified: chrono::offset::Utc::now(), + size: 0, + e_tag: None, + } + .into(), + object_store.clone(), + parquet_props.clone(), + ) + .await?; + file_write_tasks.spawn(async move { + let mut row_count = 0; + while let Some(batch) = rx.recv().await { + row_count += batch.num_rows(); + writer.write(&batch).await?; + } + writer.close().await?; + Ok(row_count) + }); + } else { + let writer = create_writer( + FileWriterMode::PutMultipart, + // Parquet files as a whole are never compressed, since they + // manage compressed blocks themselves. + FileCompressionType::UNCOMPRESSED, ObjectMeta { location: path, last_modified: chrono::offset::Utc::now(), @@ -778,19 +766,22 @@ impl DataSink for ParquetSink { } .into(), object_store.clone(), - parquet_props.clone(), ) .await?; - - file_write_tasks.spawn(async move { - let mut row_count = 0; - while let Some(batch) = rx.recv().await { - row_count += batch.num_rows(); - writer.write(&batch).await?; - } - writer.close().await?; - Ok(row_count) - }); + let schema = self.get_writer_schema(); + let props = parquet_props.clone(); + let parallel_options_clone = parallel_options.clone(); + file_write_tasks.spawn(async move { + output_single_parquet_file_parallelized( + writer, + rx, + schema, + &props, + parallel_options_clone, + ) + .await + }); + } } let mut row_count = 0; @@ -823,119 +814,228 @@ impl DataSink for ParquetSink { } } -/// This is the return type when joining subtasks which are serializing parquet files -/// into memory buffers. The first part of the tuple is the parquet bytes and the -/// second is how many rows were written into the file. -type ParquetFileSerializedResult = Result<(Vec, usize), DataFusionError>; +/// Consumes a stream of [ArrowLeafColumn] via a channel and serializes them using an [ArrowColumnWriter] +/// Once the channel is exhausted, returns the ArrowColumnWriter. +async fn column_serializer_task( + mut rx: Receiver, + mut writer: ArrowColumnWriter, +) -> Result { + while let Some(col) = rx.recv().await { + writer.write(&col)?; + } + Ok(writer) +} -/// Parallelizes the serialization of a single parquet file, by first serializing N -/// independent RecordBatch streams in parallel to parquet files in memory. 
Another -/// task then stitches these independent files back together and streams this large -/// single parquet file to an ObjectStore in multiple parts. -async fn output_single_parquet_file_parallelized( - mut object_store_writer: AbortableWrite>, - mut data: Vec, - output_schema: Arc, - parquet_props: &WriterProperties, -) -> Result { - let mut row_count = 0; - // TODO decrease parallelism / buffering: - // https://github.com/apache/arrow-datafusion/issues/7591 - let parallelism = data.len(); - let mut join_handles: Vec> = - Vec::with_capacity(parallelism); - for _ in 0..parallelism { - let buffer: Vec = Vec::new(); - let mut writer = parquet::arrow::arrow_writer::ArrowWriter::try_new( - buffer, - output_schema.clone(), - Some(parquet_props.clone()), - )?; - let mut data_stream = data.remove(0); - join_handles.push(tokio::spawn(async move { - let mut inner_row_count = 0; - while let Some(batch) = data_stream.next().await.transpose()? { - inner_row_count += batch.num_rows(); - writer.write(&batch)?; - } - let out = writer.into_inner()?; - Ok((out, inner_row_count)) - })) +type ColumnJoinHandle = JoinHandle>; +type ColSender = Sender; +/// Spawns a parallel serialization task for each column +/// Returns join handles for each columns serialization task along with a send channel +/// to send arrow arrays to each serialization task. +fn spawn_column_parallel_row_group_writer( + schema: Arc, + parquet_props: Arc, + max_buffer_size: usize, +) -> Result<(Vec, Vec)> { + let schema_desc = arrow_to_parquet_schema(&schema)?; + let col_writers = get_column_writers(&schema_desc, &parquet_props, &schema)?; + let num_columns = col_writers.len(); + + let mut col_writer_handles = Vec::with_capacity(num_columns); + let mut col_array_channels = Vec::with_capacity(num_columns); + for writer in col_writers.into_iter() { + // Buffer size of this channel limits the number of arrays queued up for column level serialization + let (send_array, recieve_array) = + mpsc::channel::(max_buffer_size); + col_array_channels.push(send_array); + col_writer_handles + .push(tokio::spawn(column_serializer_task(recieve_array, writer))) } - let mut writer = None; - let endpoints: (UnboundedSender>, UnboundedReceiver>) = - tokio::sync::mpsc::unbounded_channel(); - let (tx, mut rx) = endpoints; - let writer_join_handle: JoinHandle< - Result< - AbortableWrite>, - DataFusionError, - >, - > = tokio::task::spawn(async move { - while let Some(data) = rx.recv().await { - // TODO write incrementally - // https://github.com/apache/arrow-datafusion/issues/7591 - object_store_writer.write_all(data.as_slice()).await?; + Ok((col_writer_handles, col_array_channels)) +} + +/// Settings related to writing parquet files in parallel +#[derive(Clone)] +struct ParallelParquetWriterOptions { + max_parallel_row_groups: usize, + max_buffered_record_batches_per_stream: usize, +} + +/// This is the return type of calling [ArrowColumnWriter].close() on each column +/// i.e. the Vec of encoded columns which can be appended to a row group +type RBStreamSerializeResult = Result<(Vec, usize)>; + +/// Sends the ArrowArrays in passed [RecordBatch] through the channels to their respective +/// parallel column serializers. +async fn send_arrays_to_col_writers( + col_array_channels: &[ColSender], + rb: &RecordBatch, + schema: Arc, +) -> Result<()> { + for (tx, array, field) in col_array_channels + .iter() + .zip(rb.columns()) + .zip(schema.fields()) + .map(|((a, b), c)| (a, b, c)) + { + for c in compute_leaves(field, array)? 
{ + tx.send(c).await.map_err(|_| { + DataFusionError::Internal("Unable to send array to writer!".into()) + })?; } - Ok(object_store_writer) - }); + } + + Ok(()) +} + +/// Spawns a tokio task which joins the parallel column writer tasks, +/// and finalizes the row group. +fn spawn_rg_join_and_finalize_task( + column_writer_handles: Vec>>, + rg_rows: usize, +) -> JoinHandle { + tokio::spawn(async move { + let num_cols = column_writer_handles.len(); + let mut finalized_rg = Vec::with_capacity(num_cols); + for handle in column_writer_handles.into_iter() { + match handle.await { + Ok(r) => { + let w = r?; + finalized_rg.push(w.close()?); + } + Err(e) => { + if e.is_panic() { + std::panic::resume_unwind(e.into_panic()) + } else { + unreachable!() + } + } + } + } + + Ok((finalized_rg, rg_rows)) + }) +} + +/// This task coordinates the serialization of a parquet file in parallel. +/// As the query produces RecordBatches, these are written to a RowGroup +/// via parallel [ArrowColumnWriter] tasks. Once the desired max rows per +/// row group is reached, the parallel tasks are joined on another separate task +/// and sent to a concatenation task. This task immediately continues to work +/// on the next row group in parallel. So, parquet serialization is parallelized +/// accross both columns and row_groups, with a theoretical max number of parallel tasks +/// given by n_columns * num_row_groups. +fn spawn_parquet_parallel_serialization_task( + mut data: Receiver, + serialize_tx: Sender>, + schema: Arc, + writer_props: Arc, + parallel_options: ParallelParquetWriterOptions, +) -> JoinHandle> { + tokio::spawn(async move { + let max_buffer_rb = parallel_options.max_buffered_record_batches_per_stream; + let max_row_group_rows = writer_props.max_row_group_size(); + let (mut column_writer_handles, mut col_array_channels) = + spawn_column_parallel_row_group_writer( + schema.clone(), + writer_props.clone(), + max_buffer_rb, + )?; + let mut current_rg_rows = 0; + + while let Some(rb) = data.recv().await { + if current_rg_rows + rb.num_rows() < max_row_group_rows { + send_arrays_to_col_writers(&col_array_channels, &rb, schema.clone()) + .await?; + current_rg_rows += rb.num_rows(); + } else { + let rows_left = max_row_group_rows - current_rg_rows; + let a = rb.slice(0, rows_left); + send_arrays_to_col_writers(&col_array_channels, &a, schema.clone()) + .await?; + + // Signal the parallel column writers that the RowGroup is done, join and finalize RowGroup + // on a separate task, so that we can immediately start on the next RG before waiting + // for the current one to finish. 
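+ // Dropping the senders closes the per-column channels, letting each
+ // column_serializer_task drain its remaining arrays and return its writer.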
+ drop(col_array_channels); + let finalize_rg_task = spawn_rg_join_and_finalize_task( + column_writer_handles, + max_row_group_rows, + ); + + serialize_tx.send(finalize_rg_task).await.map_err(|_| { + DataFusionError::Internal( + "Unable to send closed RG to concat task!".into(), + ) + })?; + + let b = rb.slice(rows_left, rb.num_rows() - rows_left); + (column_writer_handles, col_array_channels) = + spawn_column_parallel_row_group_writer( + schema.clone(), + writer_props.clone(), + max_buffer_rb, + )?; + send_arrays_to_col_writers(&col_array_channels, &b, schema.clone()) + .await?; + current_rg_rows = b.num_rows(); + } + } + + drop(col_array_channels); + // Handle leftover rows as final rowgroup, which may be smaller than max_row_group_rows + if current_rg_rows > 0 { + let finalize_rg_task = + spawn_rg_join_and_finalize_task(column_writer_handles, current_rg_rows); + + serialize_tx.send(finalize_rg_task).await.map_err(|_| { + DataFusionError::Internal( + "Unable to send closed RG to concat task!".into(), + ) + })?; + } + + Ok(()) + }) +} + +/// Consume RowGroups serialized by other parallel tasks and concatenate them in +/// to the final parquet file, while flushing finalized bytes to an [ObjectStore] +async fn concatenate_parallel_row_groups( + mut serialize_rx: Receiver>, + schema: Arc, + writer_props: Arc, + mut object_store_writer: AbortableWrite>, +) -> Result { let merged_buff = SharedBuffer::new(1048576); - for handle in join_handles { + + let schema_desc = arrow_to_parquet_schema(schema.as_ref())?; + let mut parquet_writer = SerializedFileWriter::new( + merged_buff.clone(), + schema_desc.root_schema_ptr(), + writer_props, + )?; + + let mut row_count = 0; + + while let Some(handle) = serialize_rx.recv().await { let join_result = handle.await; match join_result { Ok(result) => { - let (out, num_rows) = result?; - let reader = bytes::Bytes::from(out); - row_count += num_rows; - //let reader = File::open(buffer)?; - let metadata = parquet::file::footer::parse_metadata(&reader)?; - let schema = metadata.file_metadata().schema(); - writer = match writer { - Some(writer) => Some(writer), - None => Some(SerializedFileWriter::new( - merged_buff.clone(), - Arc::new(schema.clone()), - Arc::new(parquet_props.clone()), - )?), - }; - - match &mut writer{ - Some(w) => { - // Note: cannot use .await within this loop as RowGroupMetaData is not Send - // Instead, use a non-blocking channel to send bytes to separate worker - // which will write to ObjectStore. 
- for rg in metadata.row_groups() { - let mut rg_out = w.next_row_group()?; - for column in rg.columns() { - let result = ColumnCloseResult { - bytes_written: column.compressed_size() as _, - rows_written: rg.num_rows() as _, - metadata: column.clone(), - // TODO need to populate the indexes when writing final file - // see https://github.com/apache/arrow-datafusion/issues/7589 - bloom_filter: None, - column_index: None, - offset_index: None, - }; - rg_out.append_column(&reader, result)?; - let mut buff_to_flush = merged_buff.buffer.try_lock().unwrap(); - if buff_to_flush.len() > 1024000{ - let bytes: Vec = buff_to_flush.drain(..).collect(); - tx.send(bytes).map_err(|_| DataFusionError::Execution("Failed to send bytes to ObjectStore writer".into()))?; - - } - } - rg_out.close()?; - let mut buff_to_flush = merged_buff.buffer.try_lock().unwrap(); - if buff_to_flush.len() > 1024000{ - let bytes: Vec = buff_to_flush.drain(..).collect(); - tx.send(bytes).map_err(|_| DataFusionError::Execution("Failed to send bytes to ObjectStore writer".into()))?; - } - } - }, - None => unreachable!("Parquet writer should always be initialized in first iteration of loop!") + let mut rg_out = parquet_writer.next_row_group()?; + let (serialized_columns, cnt) = result?; + row_count += cnt; + for chunk in serialized_columns { + chunk.append_to_row_group(&mut rg_out)?; + let mut buff_to_flush = merged_buff.buffer.try_lock().unwrap(); + if buff_to_flush.len() > 1024000 { + object_store_writer + .write_all(buff_to_flush.as_slice()) + .await?; + buff_to_flush.clear(); + } } + rg_out.close()?; } Err(e) => { if e.is_panic() { @@ -946,14 +1046,51 @@ async fn output_single_parquet_file_parallelized( } } } - let inner_writer = writer.unwrap().into_inner()?; + + let inner_writer = parquet_writer.into_inner()?; let final_buff = inner_writer.buffer.try_lock().unwrap(); - // Explicitly drop tx to signal to rx we are done sending data - drop(tx); + object_store_writer.write_all(final_buff.as_slice()).await?; + object_store_writer.shutdown().await?; + + Ok(row_count) +} - let mut object_store_writer = match writer_join_handle.await { - Ok(r) => r?, +/// Parallelizes the serialization of a single parquet file, by first serializing N +/// independent RecordBatch streams in parallel to RowGroups in memory. Another +/// task then stitches these independent RowGroups together and streams this large +/// single parquet file to an ObjectStore in multiple parts. 
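+/// Returns the total number of rows written to the final file.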
+async fn output_single_parquet_file_parallelized( + object_store_writer: AbortableWrite>, + data: Receiver, + output_schema: Arc, + parquet_props: &WriterProperties, + parallel_options: ParallelParquetWriterOptions, +) -> Result { + let max_rowgroups = parallel_options.max_parallel_row_groups; + // Buffer size of this channel limits maximum number of RowGroups being worked on in parallel + let (serialize_tx, serialize_rx) = + mpsc::channel::>(max_rowgroups); + + let arc_props = Arc::new(parquet_props.clone()); + let launch_serialization_task = spawn_parquet_parallel_serialization_task( + data, + serialize_tx, + output_schema.clone(), + arc_props.clone(), + parallel_options, + ); + let row_count = concatenate_parallel_row_groups( + serialize_rx, + output_schema.clone(), + arc_props.clone(), + object_store_writer, + ) + .await?; + + match launch_serialization_task.await { + Ok(Ok(_)) => (), + Ok(Err(e)) => return Err(e), Err(e) => { if e.is_panic() { std::panic::resume_unwind(e.into_panic()) @@ -962,8 +1099,6 @@ async fn output_single_parquet_file_parallelized( } } }; - object_store_writer.write_all(final_buff.as_slice()).await?; - object_store_writer.shutdown().await?; Ok(row_count) } diff --git a/datafusion/sqllogictest/test_files/information_schema.slt b/datafusion/sqllogictest/test_files/information_schema.slt index 4a2b6220fd85..ed85f54a39aa 100644 --- a/datafusion/sqllogictest/test_files/information_schema.slt +++ b/datafusion/sqllogictest/test_files/information_schema.slt @@ -153,7 +153,7 @@ datafusion.execution.collect_statistics false datafusion.execution.max_buffered_batches_per_output_file 2 datafusion.execution.meta_fetch_concurrency 32 datafusion.execution.minimum_parallel_output_files 4 -datafusion.execution.parquet.allow_single_file_parallelism false +datafusion.execution.parquet.allow_single_file_parallelism true datafusion.execution.parquet.bloom_filter_enabled false datafusion.execution.parquet.bloom_filter_fpp NULL datafusion.execution.parquet.bloom_filter_ndv NULL @@ -168,6 +168,8 @@ datafusion.execution.parquet.enable_page_index true datafusion.execution.parquet.encoding NULL datafusion.execution.parquet.max_row_group_size 1048576 datafusion.execution.parquet.max_statistics_size NULL +datafusion.execution.parquet.maximum_buffered_record_batches_per_stream 2 +datafusion.execution.parquet.maximum_parallel_row_group_writers 1 datafusion.execution.parquet.metadata_size_hint NULL datafusion.execution.parquet.pruning true datafusion.execution.parquet.pushdown_filters false @@ -223,7 +225,7 @@ datafusion.execution.collect_statistics false Should DataFusion collect statisti datafusion.execution.max_buffered_batches_per_output_file 2 This is the maximum number of RecordBatches buffered for each output file being worked. Higher values can potentially give faster write performance at the cost of higher peak memory consumption datafusion.execution.meta_fetch_concurrency 32 Number of files to read in parallel when inferring schema and statistics datafusion.execution.minimum_parallel_output_files 4 Guarantees a minimum level of output files running in parallel. RecordBatches will be distributed in round robin fashion to each parallel writer. Each writer is closed and a new file opened once soft_max_rows_per_output_file is reached. -datafusion.execution.parquet.allow_single_file_parallelism false Controls whether DataFusion will attempt to speed up writing large parquet files by first writing multiple smaller files and then stitching them together into a single large file. 
This will result in faster write speeds, but higher memory usage. Also currently unsupported are bloom filters and column indexes when single_file_parallelism is enabled. +datafusion.execution.parquet.allow_single_file_parallelism true Controls whether DataFusion will attempt to speed up writing parquet files by serializing them in parallel. Each column in each row group in each output file are serialized in parallel leveraging a maximum possible core count of n_files*n_row_groups*n_columns. datafusion.execution.parquet.bloom_filter_enabled false Sets if bloom filter is enabled for any column datafusion.execution.parquet.bloom_filter_fpp NULL Sets bloom filter false positive probability. If NULL, uses default parquet writer setting datafusion.execution.parquet.bloom_filter_ndv NULL Sets bloom filter number of distinct values. If NULL, uses default parquet writer setting @@ -238,6 +240,8 @@ datafusion.execution.parquet.enable_page_index true If true, reads the Parquet d datafusion.execution.parquet.encoding NULL Sets default encoding for any column Valid values are: plain, plain_dictionary, rle, bit_packed, delta_binary_packed, delta_length_byte_array, delta_byte_array, rle_dictionary, and byte_stream_split. These values are not case sensitive. If NULL, uses default parquet writer setting datafusion.execution.parquet.max_row_group_size 1048576 Sets maximum number of rows in a row group datafusion.execution.parquet.max_statistics_size NULL Sets max statistics size for any column. If NULL, uses default parquet writer setting +datafusion.execution.parquet.maximum_buffered_record_batches_per_stream 2 By default parallel parquet writer is tuned for minimum memory usage in a streaming execution plan. You may see a performance benefit when writing large parquet files by increasing maximum_parallel_row_group_writers and maximum_buffered_record_batches_per_stream if your system has idle cores and can tolerate additional memory usage. Boosting these values is likely worthwhile when writing out already in-memory data, such as from a cached data frame. +datafusion.execution.parquet.maximum_parallel_row_group_writers 1 By default parallel parquet writer is tuned for minimum memory usage in a streaming execution plan. You may see a performance benefit when writing large parquet files by increasing maximum_parallel_row_group_writers and maximum_buffered_record_batches_per_stream if your system has idle cores and can tolerate additional memory usage. Boosting these values is likely worthwhile when writing out already in-memory data, such as from a cached data frame. datafusion.execution.parquet.metadata_size_hint NULL If specified, the parquet reader will try and fetch the last `size_hint` bytes of the parquet file optimistically. 
If not specified, two reads are required: One read to fetch the 8-byte parquet footer and another to fetch the metadata length encoded in the footer datafusion.execution.parquet.pruning true If true, the parquet reader attempts to skip entire row groups based on the predicate in the query and the metadata (min/max values) stored in the parquet file datafusion.execution.parquet.pushdown_filters false If true, filter expressions are be applied during the parquet decoding operation to reduce the number of rows decoded diff --git a/docs/source/user-guide/configs.md b/docs/source/user-guide/configs.md index 3476118ca645..c8ff1b06d609 100644 --- a/docs/source/user-guide/configs.md +++ b/docs/source/user-guide/configs.md @@ -35,70 +35,72 @@ Values are parsed according to the [same rules used in casts from Utf8](https:// If the value in the environment variable cannot be cast to the type of the configuration option, the default value will be used instead and a warning emitted. Environment variables are read during `SessionConfig` initialisation so they must be set beforehand and will not affect running sessions. -| key | default | description | -| ---------------------------------------------------------- | ------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| datafusion.catalog.create_default_catalog_and_schema | true | Whether the default catalog and schema should be created automatically. | -| datafusion.catalog.default_catalog | datafusion | The default catalog name - this impacts what SQL queries use if not specified | -| datafusion.catalog.default_schema | public | The default schema name - this impacts what SQL queries use if not specified | -| datafusion.catalog.information_schema | false | Should DataFusion provide access to `information_schema` virtual tables for displaying schema information | -| datafusion.catalog.location | NULL | Location scanned to load tables for `default` schema | -| datafusion.catalog.format | NULL | Type of `TableProvider` to use when loading `default` schema | -| datafusion.catalog.has_header | false | If the file has a header | -| datafusion.execution.batch_size | 8192 | Default batch size while creating new batches, it's especially useful for buffer-in-memory batches since creating tiny batches would result in too much metadata memory consumption | -| datafusion.execution.coalesce_batches | true | When set to true, record batches will be examined between each operator and small batches will be coalesced into larger batches. This is helpful when there are highly selective filters or joins that could produce tiny output batches. The target batch size is determined by the configuration setting | -| datafusion.execution.collect_statistics | false | Should DataFusion collect statistics after listing files | -| datafusion.execution.target_partitions | 0 | Number of partitions for query execution. Increasing partitions can increase concurrency. 
Defaults to the number of CPU cores on the system | -| datafusion.execution.time_zone | +00:00 | The default time zone Some functions, e.g. `EXTRACT(HOUR from SOME_TIME)`, shift the underlying datetime according to this time zone, and then extract the hour | -| datafusion.execution.parquet.enable_page_index | true | If true, reads the Parquet data page level metadata (the Page Index), if present, to reduce the I/O and number of rows decoded. | -| datafusion.execution.parquet.pruning | true | If true, the parquet reader attempts to skip entire row groups based on the predicate in the query and the metadata (min/max values) stored in the parquet file | -| datafusion.execution.parquet.skip_metadata | true | If true, the parquet reader skip the optional embedded metadata that may be in the file Schema. This setting can help avoid schema conflicts when querying multiple parquet files with schemas containing compatible types but different metadata | -| datafusion.execution.parquet.metadata_size_hint | NULL | If specified, the parquet reader will try and fetch the last `size_hint` bytes of the parquet file optimistically. If not specified, two reads are required: One read to fetch the 8-byte parquet footer and another to fetch the metadata length encoded in the footer | -| datafusion.execution.parquet.pushdown_filters | false | If true, filter expressions are be applied during the parquet decoding operation to reduce the number of rows decoded | -| datafusion.execution.parquet.reorder_filters | false | If true, filter expressions evaluated during the parquet decoding operation will be reordered heuristically to minimize the cost of evaluation. If false, the filters are applied in the same order as written in the query | -| datafusion.execution.parquet.data_pagesize_limit | 1048576 | Sets best effort maximum size of data page in bytes | -| datafusion.execution.parquet.write_batch_size | 1024 | Sets write_batch_size in bytes | -| datafusion.execution.parquet.writer_version | 1.0 | Sets parquet writer version valid values are "1.0" and "2.0" | -| datafusion.execution.parquet.compression | zstd(3) | Sets default parquet compression codec Valid values are: uncompressed, snappy, gzip(level), lzo, brotli(level), lz4, zstd(level), and lz4_raw. These values are not case sensitive. If NULL, uses default parquet writer setting | -| datafusion.execution.parquet.dictionary_enabled | NULL | Sets if dictionary encoding is enabled. If NULL, uses default parquet writer setting | -| datafusion.execution.parquet.dictionary_page_size_limit | 1048576 | Sets best effort maximum dictionary page size, in bytes | -| datafusion.execution.parquet.statistics_enabled | NULL | Sets if statistics are enabled for any column Valid values are: "none", "chunk", and "page" These values are not case sensitive. If NULL, uses default parquet writer setting | -| datafusion.execution.parquet.max_statistics_size | NULL | Sets max statistics size for any column. 
If NULL, uses default parquet writer setting | -| datafusion.execution.parquet.max_row_group_size | 1048576 | Sets maximum number of rows in a row group | -| datafusion.execution.parquet.created_by | datafusion version 32.0.0 | Sets "created by" property | -| datafusion.execution.parquet.column_index_truncate_length | NULL | Sets column index truncate length | -| datafusion.execution.parquet.data_page_row_count_limit | 18446744073709551615 | Sets best effort maximum number of rows in data page | -| datafusion.execution.parquet.encoding | NULL | Sets default encoding for any column Valid values are: plain, plain_dictionary, rle, bit_packed, delta_binary_packed, delta_length_byte_array, delta_byte_array, rle_dictionary, and byte_stream_split. These values are not case sensitive. If NULL, uses default parquet writer setting | -| datafusion.execution.parquet.bloom_filter_enabled | false | Sets if bloom filter is enabled for any column | -| datafusion.execution.parquet.bloom_filter_fpp | NULL | Sets bloom filter false positive probability. If NULL, uses default parquet writer setting | -| datafusion.execution.parquet.bloom_filter_ndv | NULL | Sets bloom filter number of distinct values. If NULL, uses default parquet writer setting | -| datafusion.execution.parquet.allow_single_file_parallelism | false | Controls whether DataFusion will attempt to speed up writing large parquet files by first writing multiple smaller files and then stitching them together into a single large file. This will result in faster write speeds, but higher memory usage. Also currently unsupported are bloom filters and column indexes when single_file_parallelism is enabled. | -| datafusion.execution.aggregate.scalar_update_factor | 10 | Specifies the threshold for using `ScalarValue`s to update accumulators during high-cardinality aggregations for each input batch. The aggregation is considered high-cardinality if the number of affected groups is greater than or equal to `batch_size / scalar_update_factor`. In such cases, `ScalarValue`s are utilized for updating accumulators, rather than the default batch-slice approach. This can lead to performance improvements. By adjusting the `scalar_update_factor`, you can balance the trade-off between more efficient accumulator updates and the number of groups affected. | -| datafusion.execution.planning_concurrency | 0 | Fan-out during initial physical planning. This is mostly use to plan `UNION` children in parallel. Defaults to the number of CPU cores on the system | -| datafusion.execution.sort_spill_reservation_bytes | 10485760 | Specifies the reserved memory for each spillable sort operation to facilitate an in-memory merge. When a sort operation spills to disk, the in-memory data must be sorted and merged before being written to a file. This setting reserves a specific amount of memory for that in-memory sort/merge process. Note: This setting is irrelevant if the sort operation cannot spill (i.e., if there's no `DiskManager` configured). | -| datafusion.execution.sort_in_place_threshold_bytes | 1048576 | When sorting, below what size should data be concatenated and sorted in a single RecordBatch rather than sorted in batches and merged. | -| datafusion.execution.meta_fetch_concurrency | 32 | Number of files to read in parallel when inferring schema and statistics | -| datafusion.execution.minimum_parallel_output_files | 4 | Guarantees a minimum level of output files running in parallel. RecordBatches will be distributed in round robin fashion to each parallel writer. 
Each writer is closed and a new file opened once soft_max_rows_per_output_file is reached. | -| datafusion.execution.soft_max_rows_per_output_file | 50000000 | Target number of rows in output files when writing multiple. This is a soft max, so it can be exceeded slightly. There also will be one file smaller than the limit if the total number of rows written is not roughly divisible by the soft max | -| datafusion.execution.max_buffered_batches_per_output_file | 2 | This is the maximum number of RecordBatches buffered for each output file being worked. Higher values can potentially give faster write performance at the cost of higher peak memory consumption | -| datafusion.optimizer.enable_round_robin_repartition | true | When set to true, the physical plan optimizer will try to add round robin repartitioning to increase parallelism to leverage more CPU cores | -| datafusion.optimizer.enable_topk_aggregation | true | When set to true, the optimizer will attempt to perform limit operations during aggregations, if possible | -| datafusion.optimizer.filter_null_join_keys | false | When set to true, the optimizer will insert filters before a join between a nullable and non-nullable column to filter out nulls on the nullable side. This filter can add additional overhead when the file format does not fully support predicate push down. | -| datafusion.optimizer.repartition_aggregations | true | Should DataFusion repartition data using the aggregate keys to execute aggregates in parallel using the provided `target_partitions` level | -| datafusion.optimizer.repartition_file_min_size | 10485760 | Minimum total files size in bytes to perform file scan repartitioning. | -| datafusion.optimizer.repartition_joins | true | Should DataFusion repartition data using the join keys to execute joins in parallel using the provided `target_partitions` level | -| datafusion.optimizer.allow_symmetric_joins_without_pruning | true | Should DataFusion allow symmetric hash joins for unbounded data sources even when its inputs do not have any ordering or filtering If the flag is not enabled, the SymmetricHashJoin operator will be unable to prune its internal buffers, resulting in certain join types - such as Full, Left, LeftAnti, LeftSemi, Right, RightAnti, and RightSemi - being produced only at the end of the execution. This is not typical in stream processing. Additionally, without proper design for long runner execution, all types of joins may encounter out-of-memory errors. | -| datafusion.optimizer.repartition_file_scans | true | When set to `true`, file groups will be repartitioned to achieve maximum parallelism. Currently Parquet and CSV formats are supported. If set to `true`, all files will be repartitioned evenly (i.e., a single large file might be partitioned into smaller chunks) for parallel scanning. If set to `false`, different files will be read in parallel, but repartitioning won't happen within a single file. | -| datafusion.optimizer.repartition_windows | true | Should DataFusion repartition data using the partitions keys to execute window functions in parallel using the provided `target_partitions` level | -| datafusion.optimizer.repartition_sorts | true | Should DataFusion execute sorts in a per-partition fashion and merge afterwards instead of coalescing first and sorting globally. 
With this flag is enabled, plans in the form below `text "SortExec: [a@0 ASC]", " CoalescePartitionsExec", " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1", ` would turn into the plan below which performs better in multithreaded environments `text "SortPreservingMergeExec: [a@0 ASC]", " SortExec: [a@0 ASC]", " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1", ` | -| datafusion.optimizer.prefer_existing_sort | false | When true, DataFusion will opportunistically remove sorts when the data is already sorted, (i.e. setting `preserve_order` to true on `RepartitionExec` and using `SortPreservingMergeExec`) When false, DataFusion will maximize plan parallelism using `RepartitionExec` even if this requires subsequently resorting data using a `SortExec`. | -| datafusion.optimizer.skip_failed_rules | false | When set to true, the logical plan optimizer will produce warning messages if any optimization rules produce errors and then proceed to the next rule. When set to false, any rules that produce errors will cause the query to fail | -| datafusion.optimizer.max_passes | 3 | Number of times that the optimizer will attempt to optimize the plan | -| datafusion.optimizer.top_down_join_key_reordering | true | When set to true, the physical plan optimizer will run a top down process to reorder the join keys | -| datafusion.optimizer.prefer_hash_join | true | When set to true, the physical plan optimizer will prefer HashJoin over SortMergeJoin. HashJoin can work more efficiently than SortMergeJoin but consumes more memory | -| datafusion.optimizer.hash_join_single_partition_threshold | 1048576 | The maximum estimated size in bytes for one input side of a HashJoin will be collected into a single partition | -| datafusion.explain.logical_plan_only | false | When set to true, the explain statement will only print logical plans | -| datafusion.explain.physical_plan_only | false | When set to true, the explain statement will only print physical plans | -| datafusion.explain.show_statistics | false | When set to true, the explain statement will print operator statistics for physical plans | -| datafusion.sql_parser.parse_float_as_decimal | false | When set to true, SQL parser will parse float as decimal type | -| datafusion.sql_parser.enable_ident_normalization | true | When set to true, SQL parser will normalize ident (convert ident to lowercase when not quoted) | -| datafusion.sql_parser.dialect | generic | Configure the SQL dialect used by DataFusion's parser; supported values include: Generic, MySQL, PostgreSQL, Hive, SQLite, Snowflake, Redshift, MsSQL, ClickHouse, BigQuery, and Ansi. | +| key | default | description | +| ----------------------------------------------------------------------- | ------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| datafusion.catalog.create_default_catalog_and_schema | true | Whether the default catalog and schema should be created automatically. 
| +| datafusion.catalog.default_catalog | datafusion | The default catalog name - this impacts what SQL queries use if not specified | +| datafusion.catalog.default_schema | public | The default schema name - this impacts what SQL queries use if not specified | +| datafusion.catalog.information_schema | false | Should DataFusion provide access to `information_schema` virtual tables for displaying schema information | +| datafusion.catalog.location | NULL | Location scanned to load tables for `default` schema | +| datafusion.catalog.format | NULL | Type of `TableProvider` to use when loading `default` schema | +| datafusion.catalog.has_header | false | If the file has a header | +| datafusion.execution.batch_size | 8192 | Default batch size while creating new batches, it's especially useful for buffer-in-memory batches since creating tiny batches would result in too much metadata memory consumption | +| datafusion.execution.coalesce_batches | true | When set to true, record batches will be examined between each operator and small batches will be coalesced into larger batches. This is helpful when there are highly selective filters or joins that could produce tiny output batches. The target batch size is determined by the configuration setting | +| datafusion.execution.collect_statistics | false | Should DataFusion collect statistics after listing files | +| datafusion.execution.target_partitions | 0 | Number of partitions for query execution. Increasing partitions can increase concurrency. Defaults to the number of CPU cores on the system | +| datafusion.execution.time_zone | +00:00 | The default time zone Some functions, e.g. `EXTRACT(HOUR from SOME_TIME)`, shift the underlying datetime according to this time zone, and then extract the hour | +| datafusion.execution.parquet.enable_page_index | true | If true, reads the Parquet data page level metadata (the Page Index), if present, to reduce the I/O and number of rows decoded. | +| datafusion.execution.parquet.pruning | true | If true, the parquet reader attempts to skip entire row groups based on the predicate in the query and the metadata (min/max values) stored in the parquet file | +| datafusion.execution.parquet.skip_metadata | true | If true, the parquet reader skip the optional embedded metadata that may be in the file Schema. This setting can help avoid schema conflicts when querying multiple parquet files with schemas containing compatible types but different metadata | +| datafusion.execution.parquet.metadata_size_hint | NULL | If specified, the parquet reader will try and fetch the last `size_hint` bytes of the parquet file optimistically. If not specified, two reads are required: One read to fetch the 8-byte parquet footer and another to fetch the metadata length encoded in the footer | +| datafusion.execution.parquet.pushdown_filters | false | If true, filter expressions are be applied during the parquet decoding operation to reduce the number of rows decoded | +| datafusion.execution.parquet.reorder_filters | false | If true, filter expressions evaluated during the parquet decoding operation will be reordered heuristically to minimize the cost of evaluation. 
If false, the filters are applied in the same order as written in the query | +| datafusion.execution.parquet.data_pagesize_limit | 1048576 | Sets best effort maximum size of data page in bytes | +| datafusion.execution.parquet.write_batch_size | 1024 | Sets write_batch_size in bytes | +| datafusion.execution.parquet.writer_version | 1.0 | Sets parquet writer version valid values are "1.0" and "2.0" | +| datafusion.execution.parquet.compression | zstd(3) | Sets default parquet compression codec Valid values are: uncompressed, snappy, gzip(level), lzo, brotli(level), lz4, zstd(level), and lz4_raw. These values are not case sensitive. If NULL, uses default parquet writer setting | +| datafusion.execution.parquet.dictionary_enabled | NULL | Sets if dictionary encoding is enabled. If NULL, uses default parquet writer setting | +| datafusion.execution.parquet.dictionary_page_size_limit | 1048576 | Sets best effort maximum dictionary page size, in bytes | +| datafusion.execution.parquet.statistics_enabled | NULL | Sets if statistics are enabled for any column Valid values are: "none", "chunk", and "page" These values are not case sensitive. If NULL, uses default parquet writer setting | +| datafusion.execution.parquet.max_statistics_size | NULL | Sets max statistics size for any column. If NULL, uses default parquet writer setting | +| datafusion.execution.parquet.max_row_group_size | 1048576 | Sets maximum number of rows in a row group | +| datafusion.execution.parquet.created_by | datafusion version 32.0.0 | Sets "created by" property | +| datafusion.execution.parquet.column_index_truncate_length | NULL | Sets column index truncate length | +| datafusion.execution.parquet.data_page_row_count_limit | 18446744073709551615 | Sets best effort maximum number of rows in data page | +| datafusion.execution.parquet.encoding | NULL | Sets default encoding for any column Valid values are: plain, plain_dictionary, rle, bit_packed, delta_binary_packed, delta_length_byte_array, delta_byte_array, rle_dictionary, and byte_stream_split. These values are not case sensitive. If NULL, uses default parquet writer setting | +| datafusion.execution.parquet.bloom_filter_enabled | false | Sets if bloom filter is enabled for any column | +| datafusion.execution.parquet.bloom_filter_fpp | NULL | Sets bloom filter false positive probability. If NULL, uses default parquet writer setting | +| datafusion.execution.parquet.bloom_filter_ndv | NULL | Sets bloom filter number of distinct values. If NULL, uses default parquet writer setting | +| datafusion.execution.parquet.allow_single_file_parallelism | true | Controls whether DataFusion will attempt to speed up writing parquet files by serializing them in parallel. Each column in each row group in each output file are serialized in parallel leveraging a maximum possible core count of n_files*n_row_groups*n_columns. | +| datafusion.execution.parquet.maximum_parallel_row_group_writers | 1 | By default parallel parquet writer is tuned for minimum memory usage in a streaming execution plan. You may see a performance benefit when writing large parquet files by increasing maximum_parallel_row_group_writers and maximum_buffered_record_batches_per_stream if your system has idle cores and can tolerate additional memory usage. Boosting these values is likely worthwhile when writing out already in-memory data, such as from a cached data frame. 
| +| datafusion.execution.parquet.maximum_buffered_record_batches_per_stream | 2 | By default parallel parquet writer is tuned for minimum memory usage in a streaming execution plan. You may see a performance benefit when writing large parquet files by increasing maximum_parallel_row_group_writers and maximum_buffered_record_batches_per_stream if your system has idle cores and can tolerate additional memory usage. Boosting these values is likely worthwhile when writing out already in-memory data, such as from a cached data frame. | +| datafusion.execution.aggregate.scalar_update_factor | 10 | Specifies the threshold for using `ScalarValue`s to update accumulators during high-cardinality aggregations for each input batch. The aggregation is considered high-cardinality if the number of affected groups is greater than or equal to `batch_size / scalar_update_factor`. In such cases, `ScalarValue`s are utilized for updating accumulators, rather than the default batch-slice approach. This can lead to performance improvements. By adjusting the `scalar_update_factor`, you can balance the trade-off between more efficient accumulator updates and the number of groups affected. | +| datafusion.execution.planning_concurrency | 0 | Fan-out during initial physical planning. This is mostly use to plan `UNION` children in parallel. Defaults to the number of CPU cores on the system | +| datafusion.execution.sort_spill_reservation_bytes | 10485760 | Specifies the reserved memory for each spillable sort operation to facilitate an in-memory merge. When a sort operation spills to disk, the in-memory data must be sorted and merged before being written to a file. This setting reserves a specific amount of memory for that in-memory sort/merge process. Note: This setting is irrelevant if the sort operation cannot spill (i.e., if there's no `DiskManager` configured). | +| datafusion.execution.sort_in_place_threshold_bytes | 1048576 | When sorting, below what size should data be concatenated and sorted in a single RecordBatch rather than sorted in batches and merged. | +| datafusion.execution.meta_fetch_concurrency | 32 | Number of files to read in parallel when inferring schema and statistics | +| datafusion.execution.minimum_parallel_output_files | 4 | Guarantees a minimum level of output files running in parallel. RecordBatches will be distributed in round robin fashion to each parallel writer. Each writer is closed and a new file opened once soft_max_rows_per_output_file is reached. | +| datafusion.execution.soft_max_rows_per_output_file | 50000000 | Target number of rows in output files when writing multiple. This is a soft max, so it can be exceeded slightly. There also will be one file smaller than the limit if the total number of rows written is not roughly divisible by the soft max | +| datafusion.execution.max_buffered_batches_per_output_file | 2 | This is the maximum number of RecordBatches buffered for each output file being worked. 
Higher values can potentially give faster write performance at the cost of higher peak memory consumption | +| datafusion.optimizer.enable_round_robin_repartition | true | When set to true, the physical plan optimizer will try to add round robin repartitioning to increase parallelism to leverage more CPU cores | +| datafusion.optimizer.enable_topk_aggregation | true | When set to true, the optimizer will attempt to perform limit operations during aggregations, if possible | +| datafusion.optimizer.filter_null_join_keys | false | When set to true, the optimizer will insert filters before a join between a nullable and non-nullable column to filter out nulls on the nullable side. This filter can add additional overhead when the file format does not fully support predicate push down. | +| datafusion.optimizer.repartition_aggregations | true | Should DataFusion repartition data using the aggregate keys to execute aggregates in parallel using the provided `target_partitions` level | +| datafusion.optimizer.repartition_file_min_size | 10485760 | Minimum total files size in bytes to perform file scan repartitioning. | +| datafusion.optimizer.repartition_joins | true | Should DataFusion repartition data using the join keys to execute joins in parallel using the provided `target_partitions` level | +| datafusion.optimizer.allow_symmetric_joins_without_pruning | true | Should DataFusion allow symmetric hash joins for unbounded data sources even when its inputs do not have any ordering or filtering If the flag is not enabled, the SymmetricHashJoin operator will be unable to prune its internal buffers, resulting in certain join types - such as Full, Left, LeftAnti, LeftSemi, Right, RightAnti, and RightSemi - being produced only at the end of the execution. This is not typical in stream processing. Additionally, without proper design for long runner execution, all types of joins may encounter out-of-memory errors. | +| datafusion.optimizer.repartition_file_scans | true | When set to `true`, file groups will be repartitioned to achieve maximum parallelism. Currently Parquet and CSV formats are supported. If set to `true`, all files will be repartitioned evenly (i.e., a single large file might be partitioned into smaller chunks) for parallel scanning. If set to `false`, different files will be read in parallel, but repartitioning won't happen within a single file. | +| datafusion.optimizer.repartition_windows | true | Should DataFusion repartition data using the partitions keys to execute window functions in parallel using the provided `target_partitions` level | +| datafusion.optimizer.repartition_sorts | true | Should DataFusion execute sorts in a per-partition fashion and merge afterwards instead of coalescing first and sorting globally. With this flag is enabled, plans in the form below `text "SortExec: [a@0 ASC]", " CoalescePartitionsExec", " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1", ` would turn into the plan below which performs better in multithreaded environments `text "SortPreservingMergeExec: [a@0 ASC]", " SortExec: [a@0 ASC]", " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1", ` | +| datafusion.optimizer.prefer_existing_sort | false | When true, DataFusion will opportunistically remove sorts when the data is already sorted, (i.e. 
setting `preserve_order` to true on `RepartitionExec` and using `SortPreservingMergeExec`) When false, DataFusion will maximize plan parallelism using `RepartitionExec` even if this requires subsequently resorting data using a `SortExec`. | +| datafusion.optimizer.skip_failed_rules | false | When set to true, the logical plan optimizer will produce warning messages if any optimization rules produce errors and then proceed to the next rule. When set to false, any rules that produce errors will cause the query to fail | +| datafusion.optimizer.max_passes | 3 | Number of times that the optimizer will attempt to optimize the plan | +| datafusion.optimizer.top_down_join_key_reordering | true | When set to true, the physical plan optimizer will run a top down process to reorder the join keys | +| datafusion.optimizer.prefer_hash_join | true | When set to true, the physical plan optimizer will prefer HashJoin over SortMergeJoin. HashJoin can work more efficiently than SortMergeJoin but consumes more memory | +| datafusion.optimizer.hash_join_single_partition_threshold | 1048576 | The maximum estimated size in bytes for one input side of a HashJoin will be collected into a single partition | +| datafusion.explain.logical_plan_only | false | When set to true, the explain statement will only print logical plans | +| datafusion.explain.physical_plan_only | false | When set to true, the explain statement will only print physical plans | +| datafusion.explain.show_statistics | false | When set to true, the explain statement will print operator statistics for physical plans | +| datafusion.sql_parser.parse_float_as_decimal | false | When set to true, SQL parser will parse float as decimal type | +| datafusion.sql_parser.enable_ident_normalization | true | When set to true, SQL parser will normalize ident (convert ident to lowercase when not quoted) | +| datafusion.sql_parser.dialect | generic | Configure the SQL dialect used by DataFusion's parser; supported values include: Generic, MySQL, PostgreSQL, Hive, SQLite, Snowflake, Redshift, MsSQL, ClickHouse, BigQuery, and Ansi. | From d190ff16b7b4860f89052a30ca3acecfcde21060 Mon Sep 17 00:00:00 2001 From: Hengfei Yang Date: Thu, 26 Oct 2023 01:09:53 +0800 Subject: [PATCH 11/32] feat: Use bloom filter when reading parquet to skip row groups (#7821) * feat: implement read bloom filter support * test: add unit test for read bloom filter * Simplify bloom filter application * test: add unit test for bloom filter with sql `in` * fix: imrpove bloom filter match express * fix: add more test for bloom filter * ci: rollback dependences * ci: merge main branch * fix: unit tests for bloom filter * ci: cargo clippy * ci: cargo clippy --------- Co-authored-by: Andrew Lamb --- .../src/datasource/physical_plan/parquet.rs | 45 +- .../physical_plan/parquet/row_groups.rs | 572 +++++++++++++++++- .../sqllogictest/test_files/predicates.slt | 40 ++ 3 files changed, 625 insertions(+), 32 deletions(-) diff --git a/datafusion/core/src/datasource/physical_plan/parquet.rs b/datafusion/core/src/datasource/physical_plan/parquet.rs index 6cab27b0846c..3a2459bec817 100644 --- a/datafusion/core/src/datasource/physical_plan/parquet.rs +++ b/datafusion/core/src/datasource/physical_plan/parquet.rs @@ -82,6 +82,9 @@ pub struct ParquetExec { /// Override for `Self::with_enable_page_index`. If None, uses /// values from base_config enable_page_index: Option, + /// Override for `Self::with_enable_bloom_filter`. 
If None, uses + /// values from base_config + enable_bloom_filter: Option, /// Base configuration for this scan base_config: FileScanConfig, projected_statistics: Statistics, @@ -151,6 +154,7 @@ impl ParquetExec { pushdown_filters: None, reorder_filters: None, enable_page_index: None, + enable_bloom_filter: None, base_config, projected_schema, projected_statistics, @@ -244,6 +248,18 @@ impl ParquetExec { .unwrap_or(config_options.execution.parquet.enable_page_index) } + /// If enabled, the reader will read by the bloom filter + pub fn with_enable_bloom_filter(mut self, enable_bloom_filter: bool) -> Self { + self.enable_bloom_filter = Some(enable_bloom_filter); + self + } + + /// Return the value described in [`Self::with_enable_bloom_filter`] + fn enable_bloom_filter(&self, config_options: &ConfigOptions) -> bool { + self.enable_bloom_filter + .unwrap_or(config_options.execution.parquet.bloom_filter_enabled) + } + /// Redistribute files across partitions according to their size /// See comments on `get_file_groups_repartitioned()` for more detail. pub fn get_repartitioned( @@ -373,6 +389,7 @@ impl ExecutionPlan for ParquetExec { pushdown_filters: self.pushdown_filters(config_options), reorder_filters: self.reorder_filters(config_options), enable_page_index: self.enable_page_index(config_options), + enable_bloom_filter: self.enable_bloom_filter(config_options), }; let stream = @@ -406,6 +423,7 @@ struct ParquetOpener { pushdown_filters: bool, reorder_filters: bool, enable_page_index: bool, + enable_bloom_filter: bool, } impl FileOpener for ParquetOpener { @@ -440,6 +458,7 @@ impl FileOpener for ParquetOpener { self.enable_page_index, &self.page_pruning_predicate, ); + let enable_bloom_filter = self.enable_bloom_filter; let limit = self.limit; Ok(Box::pin(async move { @@ -482,16 +501,32 @@ impl FileOpener for ParquetOpener { }; }; - // Row group pruning: attempt to skip entire row_groups + // Row group pruning by statistics: attempt to skip entire row_groups // using metadata on the row groups - let file_metadata = builder.metadata(); - let row_groups = row_groups::prune_row_groups( + let file_metadata = builder.metadata().clone(); + let predicate = pruning_predicate.as_ref().map(|p| p.as_ref()); + let mut row_groups = row_groups::prune_row_groups_by_statistics( file_metadata.row_groups(), file_range, - pruning_predicate.as_ref().map(|p| p.as_ref()), + predicate, &file_metrics, ); + // Bloom filter pruning: if bloom filters are enabled and then attempt to skip entire row_groups + // using bloom filters on the row groups + if enable_bloom_filter && !row_groups.is_empty() { + if let Some(predicate) = predicate { + row_groups = row_groups::prune_row_groups_by_bloom_filters( + &mut builder, + &row_groups, + file_metadata.row_groups(), + predicate, + &file_metrics, + ) + .await; + } + } + // page index pruning: if all data on individual pages can // be ruled using page metadata, rows from other columns // with that range can be skipped as well @@ -567,7 +602,7 @@ impl DefaultParquetFileReaderFactory { } /// Implements [`AsyncFileReader`] for a parquet file in object storage -struct ParquetFileReader { +pub(crate) struct ParquetFileReader { file_metrics: ParquetFileMetrics, inner: ParquetObjectReader, } diff --git a/datafusion/core/src/datasource/physical_plan/parquet/row_groups.rs b/datafusion/core/src/datasource/physical_plan/parquet/row_groups.rs index c6e2c68d0211..91bceed91602 100644 --- a/datafusion/core/src/datasource/physical_plan/parquet/row_groups.rs +++ 
b/datafusion/core/src/datasource/physical_plan/parquet/row_groups.rs @@ -19,24 +19,31 @@ use arrow::{ array::ArrayRef, datatypes::{DataType, Schema}, }; -use datafusion_common::Column; -use datafusion_common::ScalarValue; -use log::debug; - -use parquet::file::{ - metadata::RowGroupMetaData, statistics::Statistics as ParquetStatistics, +use datafusion_common::tree_node::{TreeNode, VisitRecursion}; +use datafusion_common::{Column, DataFusionError, Result, ScalarValue}; +use parquet::{ + arrow::{async_reader::AsyncFileReader, ParquetRecordBatchStreamBuilder}, + bloom_filter::Sbbf, + file::{metadata::RowGroupMetaData, statistics::Statistics as ParquetStatistics}, }; - -use crate::datasource::physical_plan::parquet::{ - from_bytes_to_i128, parquet_to_arrow_decimal_type, +use std::{ + collections::{HashMap, HashSet}, + sync::Arc, }; -use crate::{ - datasource::listing::FileRange, - physical_optimizer::pruning::{PruningPredicate, PruningStatistics}, + +use crate::datasource::{ + listing::FileRange, + physical_plan::parquet::{from_bytes_to_i128, parquet_to_arrow_decimal_type}, }; +use crate::logical_expr::Operator; +use crate::physical_expr::expressions as phys_expr; +use crate::physical_optimizer::pruning::{PruningPredicate, PruningStatistics}; +use crate::physical_plan::PhysicalExpr; use super::ParquetFileMetrics; +/// Prune row groups based on statistics +/// /// Returns a vector of indexes into `groups` which should be scanned. /// /// If an index is NOT present in the returned Vec it means the @@ -44,7 +51,7 @@ use super::ParquetFileMetrics; /// /// If an index IS present in the returned Vec it means the predicate /// did not filter out that row group. -pub(crate) fn prune_row_groups( +pub(crate) fn prune_row_groups_by_statistics( groups: &[RowGroupMetaData], range: Option, predicate: Option<&PruningPredicate>, @@ -81,7 +88,7 @@ pub(crate) fn prune_row_groups( // stats filter array could not be built // return a closure which will not filter out any row groups Err(e) => { - debug!("Error evaluating row group predicate values {e}"); + log::debug!("Error evaluating row group predicate values {e}"); metrics.predicate_evaluation_errors.add(1); } } @@ -92,6 +99,203 @@ pub(crate) fn prune_row_groups( filtered } +/// Prune row groups by bloom filters +/// +/// Returns a vector of indexes into `groups` which should be scanned. +/// +/// If an index is NOT present in the returned Vec it means the +/// predicate filtered all the row group. +/// +/// If an index IS present in the returned Vec it means the predicate +/// did not filter out that row group. 
+pub(crate) async fn prune_row_groups_by_bloom_filters< + T: AsyncFileReader + Send + 'static, +>( + builder: &mut ParquetRecordBatchStreamBuilder, + row_groups: &[usize], + groups: &[RowGroupMetaData], + predicate: &PruningPredicate, + metrics: &ParquetFileMetrics, +) -> Vec { + let bf_predicates = match BloomFilterPruningPredicate::try_new(predicate.orig_expr()) + { + Ok(predicates) => predicates, + Err(_) => { + return row_groups.to_vec(); + } + }; + let mut filtered = Vec::with_capacity(groups.len()); + for idx in row_groups { + let rg_metadata = &groups[*idx]; + // get all columns bloom filter + let mut column_sbbf = + HashMap::with_capacity(bf_predicates.required_columns.len()); + for column_name in bf_predicates.required_columns.iter() { + let column_idx = match rg_metadata + .columns() + .iter() + .enumerate() + .find(|(_, column)| column.column_path().string().eq(column_name)) + { + Some((column_idx, _)) => column_idx, + None => continue, + }; + let bf = match builder + .get_row_group_column_bloom_filter(*idx, column_idx) + .await + { + Ok(bf) => match bf { + Some(bf) => bf, + None => { + continue; + } + }, + Err(e) => { + log::error!("Error evaluating row group predicate values when using BloomFilterPruningPredicate {e}"); + metrics.predicate_evaluation_errors.add(1); + continue; + } + }; + column_sbbf.insert(column_name.to_owned(), bf); + } + if bf_predicates.prune(&column_sbbf) { + metrics.row_groups_pruned.add(1); + continue; + } + filtered.push(*idx); + } + filtered +} + +struct BloomFilterPruningPredicate { + /// Actual pruning predicate + predicate_expr: Option, + /// The statistics required to evaluate this predicate + required_columns: Vec, +} + +impl BloomFilterPruningPredicate { + fn try_new(expr: &Arc) -> Result { + let binary_expr = expr.as_any().downcast_ref::(); + match binary_expr { + Some(binary_expr) => { + let columns = Self::get_predicate_columns(expr); + Ok(Self { + predicate_expr: Some(binary_expr.clone()), + required_columns: columns.into_iter().collect(), + }) + } + None => Err(DataFusionError::Execution( + "BloomFilterPruningPredicate only support binary expr".to_string(), + )), + } + } + + fn prune(&self, column_sbbf: &HashMap) -> bool { + Self::prune_expr_with_bloom_filter(self.predicate_expr.as_ref(), column_sbbf) + } + + /// Return true if the `expr` can be proved not `true` + /// based on the bloom filter. + /// + /// We only checked `BinaryExpr` but it also support `InList`, + /// Because of the `optimizer` will convert `InList` to `BinaryExpr`. 
+ fn prune_expr_with_bloom_filter( + expr: Option<&phys_expr::BinaryExpr>, + column_sbbf: &HashMap, + ) -> bool { + let Some(expr) = expr else { + // unsupported predicate + return false; + }; + match expr.op() { + Operator::And | Operator::Or => { + let left = Self::prune_expr_with_bloom_filter( + expr.left().as_any().downcast_ref::(), + column_sbbf, + ); + let right = Self::prune_expr_with_bloom_filter( + expr.right() + .as_any() + .downcast_ref::(), + column_sbbf, + ); + match expr.op() { + Operator::And => left || right, + Operator::Or => left && right, + _ => false, + } + } + Operator::Eq => { + if let Some((col, val)) = Self::check_expr_is_col_equal_const(expr) { + if let Some(sbbf) = column_sbbf.get(col.name()) { + match val { + ScalarValue::Utf8(Some(v)) => !sbbf.check(&v.as_str()), + ScalarValue::Boolean(Some(v)) => !sbbf.check(&v), + ScalarValue::Float64(Some(v)) => !sbbf.check(&v), + ScalarValue::Float32(Some(v)) => !sbbf.check(&v), + ScalarValue::Int64(Some(v)) => !sbbf.check(&v), + ScalarValue::Int32(Some(v)) => !sbbf.check(&v), + ScalarValue::Int16(Some(v)) => !sbbf.check(&v), + ScalarValue::Int8(Some(v)) => !sbbf.check(&v), + _ => false, + } + } else { + false + } + } else { + false + } + } + _ => false, + } + } + + fn get_predicate_columns(expr: &Arc) -> HashSet { + let mut columns = HashSet::new(); + expr.apply(&mut |expr| { + if let Some(binary_expr) = + expr.as_any().downcast_ref::() + { + if let Some((column, _)) = + Self::check_expr_is_col_equal_const(binary_expr) + { + columns.insert(column.name().to_string()); + } + } + Ok(VisitRecursion::Continue) + }) + // no way to fail as only Ok(VisitRecursion::Continue) is returned + .unwrap(); + + columns + } + + fn check_expr_is_col_equal_const( + exr: &phys_expr::BinaryExpr, + ) -> Option<(phys_expr::Column, ScalarValue)> { + if Operator::Eq.ne(exr.op()) { + return None; + } + + let left_any = exr.left().as_any(); + let right_any = exr.right().as_any(); + if let (Some(col), Some(liter)) = ( + left_any.downcast_ref::(), + right_any.downcast_ref::(), + ) { + return Some((col.clone(), liter.value().clone())); + } + if let (Some(liter), Some(col)) = ( + left_any.downcast_ref::(), + right_any.downcast_ref::(), + ) { + return Some((col.clone(), liter.value().clone())); + } + None + } +} + /// Wraps parquet statistics in a way /// that implements [`PruningStatistics`] struct RowGroupPruningStatistics<'a> { @@ -246,14 +450,20 @@ impl<'a> PruningStatistics for RowGroupPruningStatistics<'a> { #[cfg(test)] mod tests { use super::*; + use crate::datasource::physical_plan::parquet::ParquetFileReader; use crate::physical_plan::metrics::ExecutionPlanMetricsSet; use arrow::datatypes::DataType::Decimal128; use arrow::datatypes::Schema; use arrow::datatypes::{DataType, Field}; - use datafusion_common::ToDFSchema; - use datafusion_expr::{cast, col, lit, Expr}; + use datafusion_common::{config::ConfigOptions, TableReference, ToDFSchema}; + use datafusion_expr::{ + builder::LogicalTableSource, cast, col, lit, AggregateUDF, Expr, ScalarUDF, + TableSource, WindowUDF, + }; use datafusion_physical_expr::execution_props::ExecutionProps; use datafusion_physical_expr::{create_physical_expr, PhysicalExpr}; + use datafusion_sql::planner::ContextProvider; + use parquet::arrow::async_reader::ParquetObjectReader; use parquet::basic::LogicalType; use parquet::data_type::{ByteArray, FixedLenByteArray}; use parquet::{ @@ -329,7 +539,12 @@ mod tests { let metrics = parquet_file_metrics(); assert_eq!( - prune_row_groups(&[rgm1, rgm2], None, 
Some(&pruning_predicate), &metrics), + prune_row_groups_by_statistics( + &[rgm1, rgm2], + None, + Some(&pruning_predicate), + &metrics + ), vec![1] ); } @@ -358,7 +573,12 @@ mod tests { // missing statistics for first row group mean that the result from the predicate expression // is null / undefined so the first row group can't be filtered out assert_eq!( - prune_row_groups(&[rgm1, rgm2], None, Some(&pruning_predicate), &metrics), + prune_row_groups_by_statistics( + &[rgm1, rgm2], + None, + Some(&pruning_predicate), + &metrics + ), vec![0, 1] ); } @@ -400,7 +620,12 @@ mod tests { // the first row group is still filtered out because the predicate expression can be partially evaluated // when conditions are joined using AND assert_eq!( - prune_row_groups(groups, None, Some(&pruning_predicate), &metrics), + prune_row_groups_by_statistics( + groups, + None, + Some(&pruning_predicate), + &metrics + ), vec![1] ); @@ -413,7 +638,12 @@ mod tests { // if conditions in predicate are joined with OR and an unsupported expression is used // this bypasses the entire predicate expression and no row groups are filtered out assert_eq!( - prune_row_groups(groups, None, Some(&pruning_predicate), &metrics), + prune_row_groups_by_statistics( + groups, + None, + Some(&pruning_predicate), + &metrics + ), vec![0, 1] ); } @@ -456,7 +686,12 @@ mod tests { let metrics = parquet_file_metrics(); // First row group was filtered out because it contains no null value on "c2". assert_eq!( - prune_row_groups(&groups, None, Some(&pruning_predicate), &metrics), + prune_row_groups_by_statistics( + &groups, + None, + Some(&pruning_predicate), + &metrics + ), vec![1] ); } @@ -482,7 +717,12 @@ mod tests { // bool = NULL always evaluates to NULL (and thus will not // pass predicates. Ideally these should both be false assert_eq!( - prune_row_groups(&groups, None, Some(&pruning_predicate), &metrics), + prune_row_groups_by_statistics( + &groups, + None, + Some(&pruning_predicate), + &metrics + ), vec![1] ); } @@ -535,7 +775,7 @@ mod tests { ); let metrics = parquet_file_metrics(); assert_eq!( - prune_row_groups( + prune_row_groups_by_statistics( &[rgm1, rgm2, rgm3], None, Some(&pruning_predicate), @@ -598,7 +838,7 @@ mod tests { ); let metrics = parquet_file_metrics(); assert_eq!( - prune_row_groups( + prune_row_groups_by_statistics( &[rgm1, rgm2, rgm3, rgm4], None, Some(&pruning_predicate), @@ -645,7 +885,7 @@ mod tests { ); let metrics = parquet_file_metrics(); assert_eq!( - prune_row_groups( + prune_row_groups_by_statistics( &[rgm1, rgm2, rgm3], None, Some(&pruning_predicate), @@ -715,7 +955,7 @@ mod tests { ); let metrics = parquet_file_metrics(); assert_eq!( - prune_row_groups( + prune_row_groups_by_statistics( &[rgm1, rgm2, rgm3], None, Some(&pruning_predicate), @@ -774,7 +1014,7 @@ mod tests { ); let metrics = parquet_file_metrics(); assert_eq!( - prune_row_groups( + prune_row_groups_by_statistics( &[rgm1, rgm2, rgm3], None, Some(&pruning_predicate), @@ -846,4 +1086,282 @@ mod tests { let execution_props = ExecutionProps::new(); create_physical_expr(expr, &df_schema, schema, &execution_props).unwrap() } + + #[tokio::test] + async fn test_row_group_bloom_filter_pruning_predicate_simple_expr() { + // load parquet file + let testdata = datafusion_common::test_util::parquet_test_data(); + let file_name = "data_index_bloom_encoding_stats.parquet"; + let path = format!("{testdata}/{file_name}"); + let data = bytes::Bytes::from(std::fs::read(path).unwrap()); + + // generate pruning predicate + let schema = 
Schema::new(vec![Field::new("String", DataType::Utf8, false)]); + let expr = col(r#""String""#).eq(lit("Hello_Not_Exists")); + let expr = logical2physical(&expr, &schema); + let pruning_predicate = + PruningPredicate::try_new(expr, Arc::new(schema)).unwrap(); + + let row_groups = vec![0]; + let pruned_row_groups = test_row_group_bloom_filter_pruning_predicate( + file_name, + data, + &pruning_predicate, + &row_groups, + ) + .await + .unwrap(); + assert!(pruned_row_groups.is_empty()); + } + + #[tokio::test] + async fn test_row_group_bloom_filter_pruning_predicate_mutiple_expr() { + // load parquet file + let testdata = datafusion_common::test_util::parquet_test_data(); + let file_name = "data_index_bloom_encoding_stats.parquet"; + let path = format!("{testdata}/{file_name}"); + let data = bytes::Bytes::from(std::fs::read(path).unwrap()); + + // generate pruning predicate + let schema = Schema::new(vec![Field::new("String", DataType::Utf8, false)]); + let expr = lit("1").eq(lit("1")).and( + col(r#""String""#) + .eq(lit("Hello_Not_Exists")) + .or(col(r#""String""#).eq(lit("Hello_Not_Exists2"))), + ); + let expr = logical2physical(&expr, &schema); + let pruning_predicate = + PruningPredicate::try_new(expr, Arc::new(schema)).unwrap(); + + let row_groups = vec![0]; + let pruned_row_groups = test_row_group_bloom_filter_pruning_predicate( + file_name, + data, + &pruning_predicate, + &row_groups, + ) + .await + .unwrap(); + assert!(pruned_row_groups.is_empty()); + } + + #[tokio::test] + async fn test_row_group_bloom_filter_pruning_predicate_sql_in() { + // load parquet file + let testdata = datafusion_common::test_util::parquet_test_data(); + let file_name = "data_index_bloom_encoding_stats.parquet"; + let path = format!("{testdata}/{file_name}"); + let data = bytes::Bytes::from(std::fs::read(path).unwrap()); + + // generate pruning predicate + let schema = Schema::new(vec![ + Field::new("String", DataType::Utf8, false), + Field::new("String3", DataType::Utf8, false), + ]); + let sql = + "SELECT * FROM tbl WHERE \"String\" IN ('Hello_Not_Exists', 'Hello_Not_Exists2')"; + let expr = sql_to_physical_plan(sql).unwrap(); + let pruning_predicate = + PruningPredicate::try_new(expr, Arc::new(schema)).unwrap(); + + let row_groups = vec![0]; + let pruned_row_groups = test_row_group_bloom_filter_pruning_predicate( + file_name, + data, + &pruning_predicate, + &row_groups, + ) + .await + .unwrap(); + assert!(pruned_row_groups.is_empty()); + } + + #[tokio::test] + async fn test_row_group_bloom_filter_pruning_predicate_with_exists_value() { + // load parquet file + let testdata = datafusion_common::test_util::parquet_test_data(); + let file_name = "data_index_bloom_encoding_stats.parquet"; + let path = format!("{testdata}/{file_name}"); + let data = bytes::Bytes::from(std::fs::read(path).unwrap()); + + // generate pruning predicate + let schema = Schema::new(vec![Field::new("String", DataType::Utf8, false)]); + let expr = col(r#""String""#).eq(lit("Hello")); + let expr = logical2physical(&expr, &schema); + let pruning_predicate = + PruningPredicate::try_new(expr, Arc::new(schema)).unwrap(); + + let row_groups = vec![0]; + let pruned_row_groups = test_row_group_bloom_filter_pruning_predicate( + file_name, + data, + &pruning_predicate, + &row_groups, + ) + .await + .unwrap(); + assert_eq!(pruned_row_groups, row_groups); + } + + #[tokio::test] + async fn test_row_group_bloom_filter_pruning_predicate_without_bloom_filter() { + // load parquet file + let testdata = datafusion_common::test_util::parquet_test_data(); + 
let file_name = "alltypes_plain.parquet"; + let path = format!("{testdata}/{file_name}"); + let data = bytes::Bytes::from(std::fs::read(path).unwrap()); + + // generate pruning predicate + let schema = Schema::new(vec![Field::new("string_col", DataType::Utf8, false)]); + let expr = col(r#""string_col""#).eq(lit("0")); + let expr = logical2physical(&expr, &schema); + let pruning_predicate = + PruningPredicate::try_new(expr, Arc::new(schema)).unwrap(); + + let row_groups = vec![0]; + let pruned_row_groups = test_row_group_bloom_filter_pruning_predicate( + file_name, + data, + &pruning_predicate, + &row_groups, + ) + .await + .unwrap(); + assert_eq!(pruned_row_groups, row_groups); + } + + async fn test_row_group_bloom_filter_pruning_predicate( + file_name: &str, + data: bytes::Bytes, + pruning_predicate: &PruningPredicate, + row_groups: &[usize], + ) -> Result> { + use object_store::{ObjectMeta, ObjectStore}; + + let object_meta = ObjectMeta { + location: object_store::path::Path::parse(file_name).expect("creating path"), + last_modified: chrono::DateTime::from(std::time::SystemTime::now()), + size: data.len(), + e_tag: None, + }; + let in_memory = object_store::memory::InMemory::new(); + in_memory + .put(&object_meta.location, data) + .await + .expect("put parquet file into in memory object store"); + + let metrics = ExecutionPlanMetricsSet::new(); + let file_metrics = + ParquetFileMetrics::new(0, object_meta.location.as_ref(), &metrics); + let reader = ParquetFileReader { + inner: ParquetObjectReader::new(Arc::new(in_memory), object_meta), + file_metrics: file_metrics.clone(), + }; + let mut builder = ParquetRecordBatchStreamBuilder::new(reader).await.unwrap(); + + let metadata = builder.metadata().clone(); + let pruned_row_group = prune_row_groups_by_bloom_filters( + &mut builder, + row_groups, + metadata.row_groups(), + pruning_predicate, + &file_metrics, + ) + .await; + + Ok(pruned_row_group) + } + + fn sql_to_physical_plan(sql: &str) -> Result> { + use datafusion_optimizer::{ + analyzer::Analyzer, optimizer::Optimizer, OptimizerConfig, OptimizerContext, + }; + use datafusion_sql::{ + planner::SqlToRel, + sqlparser::{ast::Statement, parser::Parser}, + }; + use sqlparser::dialect::GenericDialect; + + // parse the SQL + let dialect = GenericDialect {}; // or AnsiDialect, or your own dialect ... 
+ let ast: Vec = Parser::parse_sql(&dialect, sql).unwrap(); + let statement = &ast[0]; + + // create a logical query plan + let schema_provider = TestSchemaProvider::new(); + let sql_to_rel = SqlToRel::new(&schema_provider); + let plan = sql_to_rel.sql_statement_to_plan(statement.clone()).unwrap(); + + // hard code the return value of now() + let config = OptimizerContext::new().with_skip_failing_rules(false); + let analyzer = Analyzer::new(); + let optimizer = Optimizer::new(); + // analyze and optimize the logical plan + let plan = analyzer.execute_and_check(&plan, config.options(), |_, _| {})?; + let plan = optimizer.optimize(&plan, &config, |_, _| {})?; + // convert the logical plan into a physical plan + let exprs = plan.expressions(); + let expr = &exprs[0]; + let df_schema = plan.schema().as_ref().to_owned(); + let tb_schema: Schema = df_schema.clone().into(); + let execution_props = ExecutionProps::new(); + create_physical_expr(expr, &df_schema, &tb_schema, &execution_props) + } + + struct TestSchemaProvider { + options: ConfigOptions, + tables: HashMap>, + } + + impl TestSchemaProvider { + pub fn new() -> Self { + let mut tables = HashMap::new(); + tables.insert( + "tbl".to_string(), + create_table_source(vec![Field::new( + "String".to_string(), + DataType::Utf8, + false, + )]), + ); + + Self { + options: Default::default(), + tables, + } + } + } + + impl ContextProvider for TestSchemaProvider { + fn get_table_source(&self, name: TableReference) -> Result> { + match self.tables.get(name.table()) { + Some(table) => Ok(table.clone()), + _ => datafusion_common::plan_err!("Table not found: {}", name.table()), + } + } + + fn get_function_meta(&self, _name: &str) -> Option> { + None + } + + fn get_aggregate_meta(&self, _name: &str) -> Option> { + None + } + + fn get_variable_type(&self, _variable_names: &[String]) -> Option { + None + } + + fn options(&self) -> &ConfigOptions { + &self.options + } + + fn get_window_meta(&self, _name: &str) -> Option> { + None + } + } + + fn create_table_source(fields: Vec) -> Arc { + Arc::new(LogicalTableSource::new(Arc::new(Schema::new(fields)))) + } } diff --git a/datafusion/sqllogictest/test_files/predicates.slt b/datafusion/sqllogictest/test_files/predicates.slt index 937b4c2eccf6..d22b2ff953b7 100644 --- a/datafusion/sqllogictest/test_files/predicates.slt +++ b/datafusion/sqllogictest/test_files/predicates.slt @@ -480,3 +480,43 @@ select * from t where (i & 3) = 1; ######## statement ok DROP TABLE t; + + +######## +# Test query with bloom filter +# Refer to https://github.com/apache/arrow-datafusion/pull/7821#pullrequestreview-1688062599 +######## + +statement ok +CREATE EXTERNAL TABLE data_index_bloom_encoding_stats STORED AS PARQUET LOCATION '../../parquet-testing/data/data_index_bloom_encoding_stats.parquet'; + +statement ok +set datafusion.execution.parquet.bloom_filter_enabled=true; + +query T +SELECT * FROM data_index_bloom_encoding_stats WHERE "String" = 'foo'; + +query T +SELECT * FROM data_index_bloom_encoding_stats WHERE "String" = 'test'; +---- +test + +query T +SELECT * FROM data_index_bloom_encoding_stats WHERE "String" like '%e%'; +---- +Hello +test +are you +the quick +over +the lazy + +statement ok +set datafusion.execution.parquet.bloom_filter_enabled=false; + + +######## +# Clean up after the test +######## +statement ok +DROP TABLE data_index_bloom_encoding_stats; From 48ea4b246b85afb58ebb6c375fd233928db71784 Mon Sep 17 00:00:00 2001 From: Huaijin Date: Thu, 26 Oct 2023 03:30:18 +0800 Subject: [PATCH 12/32] fix: don't push down 
volatile predicates in projection (#7909) * fix: don't push down volatile predicates in projection * Update datafusion/optimizer/src/push_down_filter.rs Co-authored-by: Andrew Lamb * Update datafusion/optimizer/src/push_down_filter.rs Co-authored-by: Andrew Lamb * Update datafusion/optimizer/src/push_down_filter.rs Co-authored-by: Andrew Lamb * add suggestions * fix * fix doc * Update datafusion/optimizer/src/push_down_filter.rs Co-authored-by: Jonah Gao * Update datafusion/optimizer/src/push_down_filter.rs Co-authored-by: Jonah Gao * Update datafusion/optimizer/src/push_down_filter.rs Co-authored-by: Jonah Gao * Update datafusion/optimizer/src/push_down_filter.rs Co-authored-by: Jonah Gao --------- Co-authored-by: Andrew Lamb Co-authored-by: Jonah Gao --- datafusion/optimizer/src/push_down_filter.rs | 194 ++++++++++++++++--- 1 file changed, 167 insertions(+), 27 deletions(-) diff --git a/datafusion/optimizer/src/push_down_filter.rs b/datafusion/optimizer/src/push_down_filter.rs index 4c5cd3ab2855..8c2eb96a48d8 100644 --- a/datafusion/optimizer/src/push_down_filter.rs +++ b/datafusion/optimizer/src/push_down_filter.rs @@ -15,13 +15,14 @@ //! Push Down Filter optimizer rule ensures that filters are applied as early as possible in the plan use crate::optimizer::ApplyOrder; -use crate::utils::{conjunction, split_conjunction}; +use crate::utils::{conjunction, split_conjunction, split_conjunction_owned}; use crate::{utils, OptimizerConfig, OptimizerRule}; use datafusion_common::tree_node::{Transformed, TreeNode, VisitRecursion}; use datafusion_common::{ internal_err, plan_datafusion_err, Column, DFSchema, DataFusionError, Result, }; use datafusion_expr::expr::Alias; +use datafusion_expr::Volatility; use datafusion_expr::{ and, expr_rewriter::replace_col, @@ -652,32 +653,60 @@ impl OptimizerRule for PushDownFilter { child_plan.with_new_inputs(&[new_filter])? } LogicalPlan::Projection(projection) => { - // A projection is filter-commutable, but re-writes all predicate expressions + // A projection is filter-commutable if it do not contain volatile predicates or contain volatile + // predicates that are not used in the filter. However, we should re-writes all predicate expressions. // collect projection. - let replace_map = projection - .schema - .fields() - .iter() - .enumerate() - .map(|(i, field)| { - // strip alias, as they should not be part of filters - let expr = match &projection.expr[i] { - Expr::Alias(Alias { expr, .. }) => expr.as_ref().clone(), - expr => expr.clone(), - }; - - (field.qualified_name(), expr) - }) - .collect::>(); + let (volatile_map, non_volatile_map): (HashMap<_, _>, HashMap<_, _>) = + projection + .schema + .fields() + .iter() + .enumerate() + .map(|(i, field)| { + // strip alias, as they should not be part of filters + let expr = match &projection.expr[i] { + Expr::Alias(Alias { expr, .. }) => expr.as_ref().clone(), + expr => expr.clone(), + }; + + (field.qualified_name(), expr) + }) + .partition(|(_, value)| is_volatile_expression(value)); - // re-write all filters based on this projection - // E.g. 
in `Filter: b\n Projection: a > 1 as b`, we can swap them, but the filter must be "a > 1" - let new_filter = LogicalPlan::Filter(Filter::try_new( - replace_cols_by_name(filter.predicate.clone(), &replace_map)?, - projection.input.clone(), - )?); + let mut push_predicates = vec![]; + let mut keep_predicates = vec![]; + for expr in split_conjunction_owned(filter.predicate.clone()).into_iter() + { + if contain(&expr, &volatile_map) { + keep_predicates.push(expr); + } else { + push_predicates.push(expr); + } + } - child_plan.with_new_inputs(&[new_filter])? + match conjunction(push_predicates) { + Some(expr) => { + // re-write all filters based on this projection + // E.g. in `Filter: b\n Projection: a > 1 as b`, we can swap them, but the filter must be "a > 1" + let new_filter = LogicalPlan::Filter(Filter::try_new( + replace_cols_by_name(expr, &non_volatile_map)?, + projection.input.clone(), + )?); + + match conjunction(keep_predicates) { + None => child_plan.with_new_inputs(&[new_filter])?, + Some(keep_predicate) => { + let child_plan = + child_plan.with_new_inputs(&[new_filter])?; + LogicalPlan::Filter(Filter::try_new( + keep_predicate, + Arc::new(child_plan), + )?) + } + } + } + None => return Ok(None), + } } LogicalPlan::Union(union) => { let mut inputs = Vec::with_capacity(union.inputs.len()); @@ -881,6 +910,42 @@ pub fn replace_cols_by_name( }) } +/// check whether the expression is volatile predicates +fn is_volatile_expression(e: &Expr) -> bool { + let mut is_volatile = false; + e.apply(&mut |expr| { + Ok(match expr { + Expr::ScalarFunction(f) if f.fun.volatility() == Volatility::Volatile => { + is_volatile = true; + VisitRecursion::Stop + } + _ => VisitRecursion::Continue, + }) + }) + .unwrap(); + is_volatile +} + +/// check whether the expression uses the columns in `check_map`. +fn contain(e: &Expr, check_map: &HashMap) -> bool { + let mut is_contain = false; + e.apply(&mut |expr| { + Ok(if let Expr::Column(c) = &expr { + match check_map.get(&c.flat_name()) { + Some(_) => { + is_contain = true; + VisitRecursion::Stop + } + None => VisitRecursion::Continue, + } + } else { + VisitRecursion::Continue + }) + }) + .unwrap(); + is_contain +} + #[cfg(test)] mod tests { use super::*; @@ -893,9 +958,9 @@ mod tests { use datafusion_common::{DFSchema, DFSchemaRef}; use datafusion_expr::logical_plan::table_scan; use datafusion_expr::{ - and, col, in_list, in_subquery, lit, logical_plan::JoinType, or, sum, BinaryExpr, - Expr, Extension, LogicalPlanBuilder, Operator, TableSource, TableType, - UserDefinedLogicalNodeCore, + and, col, in_list, in_subquery, lit, logical_plan::JoinType, or, random, sum, + BinaryExpr, Expr, Extension, LogicalPlanBuilder, Operator, TableSource, + TableType, UserDefinedLogicalNodeCore, }; use std::fmt::{Debug, Formatter}; use std::sync::Arc; @@ -2712,4 +2777,79 @@ Projection: a, b \n TableScan: test2"; assert_optimized_plan_eq(&plan, expected) } + + #[test] + fn test_push_down_volatile_function_in_aggregate() -> Result<()> { + // SELECT t.a, t.r FROM (SELECT a, SUM(b), random()+1 AS r FROM test1 GROUP BY a) AS t WHERE t.a > 5 AND t.r > 0.5; + let table_scan = test_table_scan_with_name("test1")?; + let plan = LogicalPlanBuilder::from(table_scan) + .aggregate(vec![col("a")], vec![sum(col("b"))])? + .project(vec![ + col("a"), + sum(col("b")), + add(random(), lit(1)).alias("r"), + ])? + .alias("t")? + .filter(col("t.a").gt(lit(5)).and(col("t.r").gt(lit(0.5))))? + .project(vec![col("t.a"), col("t.r")])? 
+ .build()?; + + let expected_before = "Projection: t.a, t.r\ + \n Filter: t.a > Int32(5) AND t.r > Float64(0.5)\ + \n SubqueryAlias: t\ + \n Projection: test1.a, SUM(test1.b), random() + Int32(1) AS r\ + \n Aggregate: groupBy=[[test1.a]], aggr=[[SUM(test1.b)]]\ + \n TableScan: test1"; + assert_eq!(format!("{plan:?}"), expected_before); + + let expected_after = "Projection: t.a, t.r\ + \n SubqueryAlias: t\ + \n Filter: r > Float64(0.5)\ + \n Projection: test1.a, SUM(test1.b), random() + Int32(1) AS r\ + \n Aggregate: groupBy=[[test1.a]], aggr=[[SUM(test1.b)]]\ + \n TableScan: test1, full_filters=[test1.a > Int32(5)]"; + assert_optimized_plan_eq(&plan, expected_after) + } + + #[test] + fn test_push_down_volatile_function_in_join() -> Result<()> { + // SELECT t.a, t.r FROM (SELECT test1.a AS a, random() AS r FROM test1 join test2 ON test1.a = test2.a) AS t WHERE t.r > 0.5; + let table_scan = test_table_scan_with_name("test1")?; + let left = LogicalPlanBuilder::from(table_scan).build()?; + let right_table_scan = test_table_scan_with_name("test2")?; + let right = LogicalPlanBuilder::from(right_table_scan).build()?; + let plan = LogicalPlanBuilder::from(left) + .join( + right, + JoinType::Inner, + ( + vec![Column::from_qualified_name("test1.a")], + vec![Column::from_qualified_name("test2.a")], + ), + None, + )? + .project(vec![col("test1.a").alias("a"), random().alias("r")])? + .alias("t")? + .filter(col("t.r").gt(lit(0.8)))? + .project(vec![col("t.a"), col("t.r")])? + .build()?; + + let expected_before = "Projection: t.a, t.r\ + \n Filter: t.r > Float64(0.8)\ + \n SubqueryAlias: t\ + \n Projection: test1.a AS a, random() AS r\ + \n Inner Join: test1.a = test2.a\ + \n TableScan: test1\ + \n TableScan: test2"; + assert_eq!(format!("{plan:?}"), expected_before); + + let expected = "Projection: t.a, t.r\ + \n SubqueryAlias: t\ + \n Filter: r > Float64(0.8)\ + \n Projection: test1.a AS a, random() AS r\ + \n Inner Join: test1.a = test2.a\ + \n TableScan: test1\ + \n TableScan: test2"; + assert_optimized_plan_eq(&plan, expected) + } } From 12a63163a9ed872a8df82f8a23e6c4253f3eb8b9 Mon Sep 17 00:00:00 2001 From: Chih Wang Date: Thu, 26 Oct 2023 05:06:47 +0800 Subject: [PATCH 13/32] Add `parquet` feature flag, enabled by default, and make parquet conditional (#7745) * Make parquet an option by adding multiple cfg attributes without significant code changes. 
* Extract parquet logic into submodule from execution::context * Extract parquet logic into submodule from datafusion_core::dataframe * Extract more logic into submodule from execution::context * Move tests from execution::context * Rename submodules --- datafusion-cli/Cargo.toml | 2 +- datafusion/common/Cargo.toml | 3 +- datafusion/common/src/test_util.rs | 1 + datafusion/core/Cargo.toml | 10 +- .../src/{dataframe.rs => dataframe/mod.rs} | 151 +------- datafusion/core/src/dataframe/parquet.rs | 162 ++++++++ .../file_format/file_compression_type.rs | 18 +- .../core/src/datasource/file_format/mod.rs | 1 + .../src/datasource/file_format/options.rs | 7 +- .../core/src/datasource/listing/table.rs | 13 +- .../src/datasource/listing_table_factory.rs | 8 +- datafusion/core/src/datasource/mod.rs | 1 + .../core/src/datasource/physical_plan/mod.rs | 6 +- datafusion/core/src/execution/context/avro.rs | 83 ++++ datafusion/core/src/execution/context/csv.rs | 143 +++++++ datafusion/core/src/execution/context/json.rs | 69 ++++ .../execution/{context.rs => context/mod.rs} | 363 +----------------- .../core/src/execution/context/parquet.rs | 154 ++++++++ datafusion/core/src/lib.rs | 1 + .../enforce_distribution.rs | 67 +++- datafusion/core/src/physical_planner.rs | 2 + datafusion/core/src/test_util/mod.rs | 77 +++- datafusion/proto/Cargo.toml | 3 +- datafusion/proto/src/logical_plan/mod.rs | 66 ++-- datafusion/proto/src/physical_plan/mod.rs | 222 ++++++----- 25 files changed, 994 insertions(+), 639 deletions(-) rename datafusion/core/src/{dataframe.rs => dataframe/mod.rs} (94%) create mode 100644 datafusion/core/src/dataframe/parquet.rs create mode 100644 datafusion/core/src/execution/context/avro.rs create mode 100644 datafusion/core/src/execution/context/csv.rs create mode 100644 datafusion/core/src/execution/context/json.rs rename datafusion/core/src/execution/{context.rs => context/mod.rs} (87%) create mode 100644 datafusion/core/src/execution/context/parquet.rs diff --git a/datafusion-cli/Cargo.toml b/datafusion-cli/Cargo.toml index 64e094437c5f..7dd9cb8bcb37 100644 --- a/datafusion-cli/Cargo.toml +++ b/datafusion-cli/Cargo.toml @@ -34,7 +34,7 @@ async-trait = "0.1.41" aws-config = "0.55" aws-credential-types = "0.55" clap = { version = "3", features = ["derive", "cargo"] } -datafusion = { path = "../datafusion/core", version = "32.0.0", features = ["avro", "crypto_expressions", "encoding_expressions", "regex_expressions", "unicode_expressions", "compression"] } +datafusion = { path = "../datafusion/core", version = "32.0.0", features = ["avro", "crypto_expressions", "encoding_expressions", "parquet", "regex_expressions", "unicode_expressions", "compression"] } dirs = "4.0.0" env_logger = "0.9" mimalloc = { version = "0.1", default-features = false } diff --git a/datafusion/common/Cargo.toml b/datafusion/common/Cargo.toml index 047c502d5cc2..490fbeacad85 100644 --- a/datafusion/common/Cargo.toml +++ b/datafusion/common/Cargo.toml @@ -35,8 +35,7 @@ path = "src/lib.rs" [features] avro = ["apache-avro"] backtrace = [] -default = ["parquet"] -pyarrow = ["pyo3", "arrow/pyarrow"] +pyarrow = ["pyo3", "arrow/pyarrow", "parquet"] [dependencies] ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] } diff --git a/datafusion/common/src/test_util.rs b/datafusion/common/src/test_util.rs index 60f1df7fd11a..9a4433782157 100644 --- a/datafusion/common/src/test_util.rs +++ b/datafusion/common/src/test_util.rs @@ -180,6 +180,7 @@ pub fn arrow_test_data() -> String { /// let filename = 
format!("{}/binary.parquet", testdata); /// assert!(std::path::PathBuf::from(filename).exists()); /// ``` +#[cfg(feature = "parquet")] pub fn parquet_test_data() -> String { match get_data_dir("PARQUET_TEST_DATA", "../../parquet-testing/data") { Ok(pb) => pb.display().to_string(), diff --git a/datafusion/core/Cargo.toml b/datafusion/core/Cargo.toml index 30e0d005e92e..5f9d28bd620b 100644 --- a/datafusion/core/Cargo.toml +++ b/datafusion/core/Cargo.toml @@ -39,11 +39,12 @@ avro = ["apache-avro", "num-traits", "datafusion-common/avro"] backtrace = ["datafusion-common/backtrace"] compression = ["xz2", "bzip2", "flate2", "zstd", "async-compression"] crypto_expressions = ["datafusion-physical-expr/crypto_expressions", "datafusion-optimizer/crypto_expressions"] -default = ["crypto_expressions", "encoding_expressions", "regex_expressions", "unicode_expressions", "compression"] +default = ["crypto_expressions", "encoding_expressions", "regex_expressions", "unicode_expressions", "compression", "parquet"] encoding_expressions = ["datafusion-physical-expr/encoding_expressions"] # Used for testing ONLY: causes all values to hash to the same value (test for collisions) force_hash_collisions = [] -pyarrow = ["datafusion-common/pyarrow"] +parquet = ["datafusion-common/parquet", "dep:parquet"] +pyarrow = ["datafusion-common/pyarrow", "parquet"] regex_expressions = ["datafusion-physical-expr/regex_expressions", "datafusion-optimizer/regex_expressions"] serde = ["arrow-schema/serde"] simd = ["arrow/simd"] @@ -61,7 +62,7 @@ bytes = "1.4" bzip2 = { version = "0.4.3", optional = true } chrono = { workspace = true } dashmap = "5.4.0" -datafusion-common = { path = "../common", version = "32.0.0", features = ["parquet", "object_store"] } +datafusion-common = { path = "../common", version = "32.0.0", features = ["object_store"], default-features = false } datafusion-execution = { path = "../execution", version = "32.0.0" } datafusion-expr = { path = "../expr", version = "32.0.0" } datafusion-optimizer = { path = "../optimizer", version = "32.0.0", default-features = false } @@ -80,7 +81,7 @@ num-traits = { version = "0.2", optional = true } num_cpus = "1.13.0" object_store = "0.7.0" parking_lot = "0.12" -parquet = { workspace = true } +parquet = { workspace = true, optional = true } percent-encoding = "2.2.0" pin-project-lite = "^0.2.7" rand = "0.8" @@ -93,7 +94,6 @@ uuid = { version = "1.0", features = ["v4"] } xz2 = { version = "0.1", optional = true } zstd = { version = "0.13", optional = true, default-features = false } - [dev-dependencies] async-trait = "0.1.53" bigdecimal = "0.4.1" diff --git a/datafusion/core/src/dataframe.rs b/datafusion/core/src/dataframe/mod.rs similarity index 94% rename from datafusion/core/src/dataframe.rs rename to datafusion/core/src/dataframe/mod.rs index 2e192c2a782e..0a99c331826c 100644 --- a/datafusion/core/src/dataframe.rs +++ b/datafusion/core/src/dataframe/mod.rs @@ -17,6 +17,9 @@ //! [`DataFrame`] API for building and executing query plans. 
+#[cfg(feature = "parquet")] +mod parquet; + use std::any::Any; use std::sync::Arc; @@ -27,15 +30,11 @@ use arrow::datatypes::{DataType, Field}; use async_trait::async_trait; use datafusion_common::file_options::csv_writer::CsvWriterOptions; use datafusion_common::file_options::json_writer::JsonWriterOptions; -use datafusion_common::file_options::parquet_writer::{ - default_builder, ParquetWriterOptions, -}; use datafusion_common::parsers::CompressionTypeVariant; use datafusion_common::{ DataFusionError, FileType, FileTypeWriterOptions, SchemaError, UnnestOptions, }; use datafusion_expr::dml::CopyOptions; -use parquet::file::properties::WriterProperties; use datafusion_common::{Column, DFSchema, ScalarValue}; use datafusion_expr::{ @@ -1067,40 +1066,6 @@ impl DataFrame { DataFrame::new(self.session_state, plan).collect().await } - /// Write a `DataFrame` to a Parquet file. - pub async fn write_parquet( - self, - path: &str, - options: DataFrameWriteOptions, - writer_properties: Option, - ) -> Result, DataFusionError> { - if options.overwrite { - return Err(DataFusionError::NotImplemented( - "Overwrites are not implemented for DataFrame::write_parquet.".to_owned(), - )); - } - match options.compression{ - CompressionTypeVariant::UNCOMPRESSED => (), - _ => return Err(DataFusionError::Configuration("DataFrame::write_parquet method does not support compression set via DataFrameWriteOptions. Set parquet compression via writer_properties instead.".to_owned())) - } - let props = match writer_properties { - Some(props) => props, - None => default_builder(self.session_state.config_options())?.build(), - }; - let file_type_writer_options = - FileTypeWriterOptions::Parquet(ParquetWriterOptions::new(props)); - let copy_options = CopyOptions::WriterOptions(Box::new(file_type_writer_options)); - let plan = LogicalPlanBuilder::copy_to( - self.plan, - path.into(), - FileType::PARQUET, - options.single_file_output, - copy_options, - )? - .build()?; - DataFrame::new(self.session_state, plan).collect().await - } - /// Executes a query and writes the results to a partitioned JSON file. 
pub async fn write_json( self, @@ -1365,19 +1330,12 @@ mod tests { WindowFunction, }; use datafusion_physical_expr::expressions::Column; - use object_store::local::LocalFileSystem; - use parquet::basic::{BrotliLevel, GzipLevel, ZstdLevel}; - use parquet::file::reader::FileReader; - use tempfile::TempDir; - use url::Url; use crate::execution::context::SessionConfig; - use crate::execution::options::{CsvReadOptions, ParquetReadOptions}; use crate::physical_plan::ColumnarValue; use crate::physical_plan::Partitioning; use crate::physical_plan::PhysicalExpr; - use crate::test_util; - use crate::test_util::parquet_test_data; + use crate::test_util::{register_aggregate_csv, test_table, test_table_with_name}; use crate::{assert_batches_sorted_eq, execution::context::SessionContext}; use super::*; @@ -1798,31 +1756,6 @@ mod tests { Ok(ctx.sql(sql).await?.into_unoptimized_plan()) } - async fn test_table_with_name(name: &str) -> Result { - let mut ctx = SessionContext::new(); - register_aggregate_csv(&mut ctx, name).await?; - ctx.table(name).await - } - - async fn test_table() -> Result { - test_table_with_name("aggregate_test_100").await - } - - async fn register_aggregate_csv( - ctx: &mut SessionContext, - table_name: &str, - ) -> Result<()> { - let schema = test_util::aggr_test_schema(); - let testdata = test_util::arrow_test_data(); - ctx.register_csv( - table_name, - &format!("{testdata}/csv/aggregate_test_100.csv"), - CsvReadOptions::new().schema(schema.as_ref()), - ) - .await?; - Ok(()) - } - #[tokio::test] async fn with_column() -> Result<()> { let df = test_table().await?.select_columns(&["c1", "c2", "c3"])?; @@ -2227,33 +2160,6 @@ mod tests { Ok(()) } - #[tokio::test] - async fn filter_pushdown_dataframe() -> Result<()> { - let ctx = SessionContext::new(); - - ctx.register_parquet( - "test", - &format!("{}/alltypes_plain.snappy.parquet", parquet_test_data()), - ParquetReadOptions::default(), - ) - .await?; - - ctx.register_table("t1", ctx.table("test").await?.into_view())?; - - let df = ctx - .table("t1") - .await? - .filter(col("id").eq(lit(1)))? 
- .select_columns(&["bool_col", "int_col"])?; - - let plan = df.explain(false, false)?.collect().await?; - // Filters all the way to Parquet - let formatted = pretty::pretty_format_batches(&plan)?.to_string(); - assert!(formatted.contains("FilterExec: id@0 = 1")); - - Ok(()) - } - #[tokio::test] async fn cast_expr_test() -> Result<()> { let df = test_table() @@ -2538,53 +2444,4 @@ mod tests { Ok(()) } - - #[tokio::test] - async fn write_parquet_with_compression() -> Result<()> { - let test_df = test_table().await?; - - let output_path = "file://local/test.parquet"; - let test_compressions = vec![ - parquet::basic::Compression::SNAPPY, - parquet::basic::Compression::LZ4, - parquet::basic::Compression::LZ4_RAW, - parquet::basic::Compression::GZIP(GzipLevel::default()), - parquet::basic::Compression::BROTLI(BrotliLevel::default()), - parquet::basic::Compression::ZSTD(ZstdLevel::default()), - ]; - for compression in test_compressions.into_iter() { - let df = test_df.clone(); - let tmp_dir = TempDir::new()?; - let local = Arc::new(LocalFileSystem::new_with_prefix(&tmp_dir)?); - let local_url = Url::parse("file://local").unwrap(); - let ctx = &test_df.session_state; - ctx.runtime_env().register_object_store(&local_url, local); - df.write_parquet( - output_path, - DataFrameWriteOptions::new().with_single_file_output(true), - Some( - WriterProperties::builder() - .set_compression(compression) - .build(), - ), - ) - .await?; - - // Check that file actually used the specified compression - let file = std::fs::File::open(tmp_dir.into_path().join("test.parquet"))?; - - let reader = - parquet::file::serialized_reader::SerializedFileReader::new(file) - .unwrap(); - - let parquet_metadata = reader.metadata(); - - let written_compression = - parquet_metadata.row_group(0).column(0).compression(); - - assert_eq!(written_compression, compression); - } - - Ok(()) - } } diff --git a/datafusion/core/src/dataframe/parquet.rs b/datafusion/core/src/dataframe/parquet.rs new file mode 100644 index 000000000000..36ef90c987e3 --- /dev/null +++ b/datafusion/core/src/dataframe/parquet.rs @@ -0,0 +1,162 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use datafusion_common::file_options::parquet_writer::{ + default_builder, ParquetWriterOptions, +}; +use parquet::file::properties::WriterProperties; + +use super::{ + CompressionTypeVariant, CopyOptions, DataFrame, DataFrameWriteOptions, + DataFusionError, FileType, FileTypeWriterOptions, LogicalPlanBuilder, RecordBatch, +}; + +impl DataFrame { + /// Write a `DataFrame` to a Parquet file. 
+ pub async fn write_parquet( + self, + path: &str, + options: DataFrameWriteOptions, + writer_properties: Option, + ) -> Result, DataFusionError> { + if options.overwrite { + return Err(DataFusionError::NotImplemented( + "Overwrites are not implemented for DataFrame::write_parquet.".to_owned(), + )); + } + match options.compression{ + CompressionTypeVariant::UNCOMPRESSED => (), + _ => return Err(DataFusionError::Configuration("DataFrame::write_parquet method does not support compression set via DataFrameWriteOptions. Set parquet compression via writer_properties instead.".to_owned())) + } + let props = match writer_properties { + Some(props) => props, + None => default_builder(self.session_state.config_options())?.build(), + }; + let file_type_writer_options = + FileTypeWriterOptions::Parquet(ParquetWriterOptions::new(props)); + let copy_options = CopyOptions::WriterOptions(Box::new(file_type_writer_options)); + let plan = LogicalPlanBuilder::copy_to( + self.plan, + path.into(), + FileType::PARQUET, + options.single_file_output, + copy_options, + )? + .build()?; + DataFrame::new(self.session_state, plan).collect().await + } +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use object_store::local::LocalFileSystem; + use parquet::basic::{BrotliLevel, GzipLevel, ZstdLevel}; + use parquet::file::reader::FileReader; + use tempfile::TempDir; + use url::Url; + + use datafusion_expr::{col, lit}; + + use crate::arrow::util::pretty; + use crate::execution::context::SessionContext; + use crate::execution::options::ParquetReadOptions; + use crate::test_util; + + use super::super::Result; + use super::*; + + #[tokio::test] + async fn filter_pushdown_dataframe() -> Result<()> { + let ctx = SessionContext::new(); + + ctx.register_parquet( + "test", + &format!( + "{}/alltypes_plain.snappy.parquet", + test_util::parquet_test_data() + ), + ParquetReadOptions::default(), + ) + .await?; + + ctx.register_table("t1", ctx.table("test").await?.into_view())?; + + let df = ctx + .table("t1") + .await? + .filter(col("id").eq(lit(1)))? 
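            // The `id = 1` predicate above should be pushed through the registered view
            // `t1` all the way down to the Parquet scan; the EXPLAIN assertion below
            // checks that the physical plan still contains `FilterExec: id@0 = 1`.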
+ .select_columns(&["bool_col", "int_col"])?; + + let plan = df.explain(false, false)?.collect().await?; + // Filters all the way to Parquet + let formatted = pretty::pretty_format_batches(&plan)?.to_string(); + assert!(formatted.contains("FilterExec: id@0 = 1")); + + Ok(()) + } + + #[tokio::test] + async fn write_parquet_with_compression() -> Result<()> { + let test_df = test_util::test_table().await?; + + let output_path = "file://local/test.parquet"; + let test_compressions = vec![ + parquet::basic::Compression::SNAPPY, + parquet::basic::Compression::LZ4, + parquet::basic::Compression::LZ4_RAW, + parquet::basic::Compression::GZIP(GzipLevel::default()), + parquet::basic::Compression::BROTLI(BrotliLevel::default()), + parquet::basic::Compression::ZSTD(ZstdLevel::default()), + ]; + for compression in test_compressions.into_iter() { + let df = test_df.clone(); + let tmp_dir = TempDir::new()?; + let local = Arc::new(LocalFileSystem::new_with_prefix(&tmp_dir)?); + let local_url = Url::parse("file://local").unwrap(); + let ctx = &test_df.session_state; + ctx.runtime_env().register_object_store(&local_url, local); + df.write_parquet( + output_path, + DataFrameWriteOptions::new().with_single_file_output(true), + Some( + WriterProperties::builder() + .set_compression(compression) + .build(), + ), + ) + .await?; + + // Check that file actually used the specified compression + let file = std::fs::File::open(tmp_dir.into_path().join("test.parquet"))?; + + let reader = + parquet::file::serialized_reader::SerializedFileReader::new(file) + .unwrap(); + + let parquet_metadata = reader.metadata(); + + let written_compression = + parquet_metadata.row_group(0).column(0).compression(); + + assert_eq!(written_compression, compression); + } + + Ok(()) + } +} diff --git a/datafusion/core/src/datasource/file_format/file_compression_type.rs b/datafusion/core/src/datasource/file_format/file_compression_type.rs index bd2868767090..3dac7c293050 100644 --- a/datafusion/core/src/datasource/file_format/file_compression_type.rs +++ b/datafusion/core/src/datasource/file_format/file_compression_type.rs @@ -237,7 +237,14 @@ impl FileTypeExt for FileType { match self { FileType::JSON | FileType::CSV => Ok(format!("{}{}", ext, c.get_ext())), - FileType::PARQUET | FileType::AVRO | FileType::ARROW => match c.variant { + FileType::AVRO | FileType::ARROW => match c.variant { + UNCOMPRESSED => Ok(ext), + _ => Err(DataFusionError::Internal( + "FileCompressionType can be specified for CSV/JSON FileType.".into(), + )), + }, + #[cfg(feature = "parquet")] + FileType::PARQUET => match c.variant { UNCOMPRESSED => Ok(ext), _ => Err(DataFusionError::Internal( "FileCompressionType can be specified for CSV/JSON FileType.".into(), @@ -276,10 +283,13 @@ mod tests { ); } + let mut ty_ext_tuple = vec![]; + ty_ext_tuple.push((FileType::AVRO, ".avro")); + #[cfg(feature = "parquet")] + ty_ext_tuple.push((FileType::PARQUET, ".parquet")); + // Cannot specify compression for these file types - for (file_type, extension) in - [(FileType::AVRO, ".avro"), (FileType::PARQUET, ".parquet")] - { + for (file_type, extension) in ty_ext_tuple { assert_eq!( file_type .get_ext_with_compression(FileCompressionType::UNCOMPRESSED) diff --git a/datafusion/core/src/datasource/file_format/mod.rs b/datafusion/core/src/datasource/file_format/mod.rs index 293f062d86a9..b541e2a1d44c 100644 --- a/datafusion/core/src/datasource/file_format/mod.rs +++ b/datafusion/core/src/datasource/file_format/mod.rs @@ -27,6 +27,7 @@ pub mod csv; pub mod file_compression_type; pub mod 
json; pub mod options; +#[cfg(feature = "parquet")] pub mod parquet; pub mod write; diff --git a/datafusion/core/src/datasource/file_format/options.rs b/datafusion/core/src/datasource/file_format/options.rs index 40d9878a0134..41a70e6d2f8f 100644 --- a/datafusion/core/src/datasource/file_format/options.rs +++ b/datafusion/core/src/datasource/file_format/options.rs @@ -25,12 +25,12 @@ use datafusion_common::{plan_err, DataFusionError}; use crate::datasource::file_format::arrow::ArrowFormat; use crate::datasource::file_format::file_compression_type::FileCompressionType; +#[cfg(feature = "parquet")] +use crate::datasource::file_format::parquet::ParquetFormat; use crate::datasource::file_format::DEFAULT_SCHEMA_INFER_MAX_RECORD; use crate::datasource::listing::{ListingTableInsertMode, ListingTableUrl}; use crate::datasource::{ - file_format::{ - avro::AvroFormat, csv::CsvFormat, json::JsonFormat, parquet::ParquetFormat, - }, + file_format::{avro::AvroFormat, csv::CsvFormat, json::JsonFormat}, listing::ListingOptions, }; use crate::error::Result; @@ -542,6 +542,7 @@ impl ReadOptions<'_> for CsvReadOptions<'_> { } } +#[cfg(feature = "parquet")] #[async_trait] impl ReadOptions<'_> for ParquetReadOptions<'_> { fn to_listing_options(&self, config: &SessionConfig) -> ListingOptions { diff --git a/datafusion/core/src/datasource/listing/table.rs b/datafusion/core/src/datasource/listing/table.rs index bd878932d80f..822a78a5522a 100644 --- a/datafusion/core/src/datasource/listing/table.rs +++ b/datafusion/core/src/datasource/listing/table.rs @@ -23,6 +23,8 @@ use std::{any::Any, sync::Arc}; use super::helpers::{expr_applicable_for_cols, pruned_partition_list, split_files}; use super::PartitionedFile; +#[cfg(feature = "parquet")] +use crate::datasource::file_format::parquet::ParquetFormat; use crate::datasource::{ file_format::{ arrow::ArrowFormat, @@ -30,7 +32,6 @@ use crate::datasource::{ csv::CsvFormat, file_compression_type::{FileCompressionType, FileTypeExt}, json::JsonFormat, - parquet::ParquetFormat, FileFormat, }, get_statistics_with_limit, @@ -150,6 +151,7 @@ impl ListingTableConfig { FileType::JSON => Arc::new( JsonFormat::default().with_file_compression_type(file_compression_type), ), + #[cfg(feature = "parquet")] FileType::PARQUET => Arc::new(ParquetFormat::default()), }; @@ -1019,15 +1021,15 @@ mod tests { use std::fs::File; use super::*; + #[cfg(feature = "parquet")] + use crate::datasource::file_format::parquet::ParquetFormat; use crate::datasource::{provider_as_source, MemTable}; use crate::execution::options::ArrowReadOptions; use crate::physical_plan::collect; use crate::prelude::*; use crate::{ assert_batches_eq, - datasource::file_format::{ - avro::AvroFormat, file_compression_type::FileTypeExt, parquet::ParquetFormat, - }, + datasource::file_format::{avro::AvroFormat, file_compression_type::FileTypeExt}, execution::options::ReadOptions, logical_expr::{col, lit}, test::{columns, object_store::register_test_store}, @@ -1090,6 +1092,7 @@ mod tests { Ok(()) } + #[cfg(feature = "parquet")] #[tokio::test] async fn load_table_stats_by_default() -> Result<()> { let testdata = crate::test_util::parquet_test_data(); @@ -1113,6 +1116,7 @@ mod tests { Ok(()) } + #[cfg(feature = "parquet")] #[tokio::test] async fn load_table_stats_when_no_stats() -> Result<()> { let testdata = crate::test_util::parquet_test_data(); @@ -1137,6 +1141,7 @@ mod tests { Ok(()) } + #[cfg(feature = "parquet")] #[tokio::test] async fn test_try_create_output_ordering() { let testdata = 
crate::test_util::parquet_test_data(); diff --git a/datafusion/core/src/datasource/listing_table_factory.rs b/datafusion/core/src/datasource/listing_table_factory.rs index e74bf6fa6499..26f40518979a 100644 --- a/datafusion/core/src/datasource/listing_table_factory.rs +++ b/datafusion/core/src/datasource/listing_table_factory.rs @@ -23,10 +23,11 @@ use std::sync::Arc; use super::listing::ListingTableInsertMode; +#[cfg(feature = "parquet")] +use crate::datasource::file_format::parquet::ParquetFormat; use crate::datasource::file_format::{ arrow::ArrowFormat, avro::AvroFormat, csv::CsvFormat, - file_compression_type::FileCompressionType, json::JsonFormat, parquet::ParquetFormat, - FileFormat, + file_compression_type::FileCompressionType, json::JsonFormat, FileFormat, }; use crate::datasource::listing::{ ListingOptions, ListingTable, ListingTableConfig, ListingTableUrl, @@ -79,6 +80,7 @@ impl TableProviderFactory for ListingTableFactory { .with_delimiter(cmd.delimiter as u8) .with_file_compression_type(file_compression_type), ), + #[cfg(feature = "parquet")] FileType::PARQUET => Arc::new(ParquetFormat::default()), FileType::AVRO => Arc::new(AvroFormat), FileType::JSON => Arc::new( @@ -157,6 +159,7 @@ impl TableProviderFactory for ListingTableFactory { Some(mode) => ListingTableInsertMode::from_str(mode.as_str()), None => match file_type { FileType::CSV => Ok(ListingTableInsertMode::AppendToFile), + #[cfg(feature = "parquet")] FileType::PARQUET => Ok(ListingTableInsertMode::AppendNewFiles), FileType::AVRO => Ok(ListingTableInsertMode::AppendNewFiles), FileType::JSON => Ok(ListingTableInsertMode::AppendToFile), @@ -196,6 +199,7 @@ impl TableProviderFactory for ListingTableFactory { json_writer_options.compression = cmd.file_compression_type; FileTypeWriterOptions::JSON(json_writer_options) } + #[cfg(feature = "parquet")] FileType::PARQUET => file_type_writer_options, FileType::ARROW => file_type_writer_options, FileType::AVRO => file_type_writer_options, diff --git a/datafusion/core/src/datasource/mod.rs b/datafusion/core/src/datasource/mod.rs index 455818056f2c..3ace2c239852 100644 --- a/datafusion/core/src/datasource/mod.rs +++ b/datafusion/core/src/datasource/mod.rs @@ -42,5 +42,6 @@ pub use self::memory::MemTable; pub use self::provider::TableProvider; pub use self::view::ViewTable; pub use crate::logical_expr::TableType; +#[cfg(feature = "parquet")] pub(crate) use statistics::get_col_stats; pub use statistics::get_statistics_with_limit; diff --git a/datafusion/core/src/datasource/physical_plan/mod.rs b/datafusion/core/src/datasource/physical_plan/mod.rs index 57844aac5181..3f84f87eb5d5 100644 --- a/datafusion/core/src/datasource/physical_plan/mod.rs +++ b/datafusion/core/src/datasource/physical_plan/mod.rs @@ -23,17 +23,20 @@ mod csv; mod file_scan_config; mod file_stream; mod json; +#[cfg(feature = "parquet")] pub mod parquet; pub(crate) use self::csv::plan_to_csv; pub use self::csv::{CsvConfig, CsvExec, CsvOpener}; -pub(crate) use self::file_scan_config::PartitionColumnProjector; pub(crate) use self::json::plan_to_json; +#[cfg(feature = "parquet")] pub(crate) use self::parquet::plan_to_parquet; +#[cfg(feature = "parquet")] pub use self::parquet::{ParquetExec, ParquetFileMetrics, ParquetFileReaderFactory}; pub use arrow_file::ArrowExec; pub use avro::AvroExec; +use file_scan_config::PartitionColumnProjector; pub use file_scan_config::{ wrap_partition_type_in_dict, wrap_partition_value_in_dict, FileScanConfig, }; @@ -798,6 +801,7 @@ mod tests { } /// Unit tests for 
`repartition_file_groups()` + #[cfg(feature = "parquet")] mod repartition_file_groups_test { use datafusion_common::Statistics; use itertools::Itertools; diff --git a/datafusion/core/src/execution/context/avro.rs b/datafusion/core/src/execution/context/avro.rs new file mode 100644 index 000000000000..d60e79862ef2 --- /dev/null +++ b/datafusion/core/src/execution/context/avro.rs @@ -0,0 +1,83 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::sync::Arc; + +use super::super::options::{AvroReadOptions, ReadOptions}; +use super::{DataFilePaths, DataFrame, Result, SessionContext}; + +impl SessionContext { + /// Creates a [`DataFrame`] for reading an Avro data source. + /// + /// For more control such as reading multiple files, you can use + /// [`read_table`](Self::read_table) with a [`super::ListingTable`]. + /// + /// For an example, see [`read_csv`](Self::read_csv) + pub async fn read_avro( + &self, + table_paths: P, + options: AvroReadOptions<'_>, + ) -> Result { + self._read_type(table_paths, options).await + } + + /// Registers an Avro file as a table that can be referenced from + /// SQL statements executed against this context. + pub async fn register_avro( + &self, + name: &str, + table_path: &str, + options: AvroReadOptions<'_>, + ) -> Result<()> { + let listing_options = options.to_listing_options(&self.copied_config()); + + self.register_listing_table( + name, + table_path, + listing_options, + options.schema.map(|s| Arc::new(s.to_owned())), + None, + ) + .await?; + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use async_trait::async_trait; + + // Test for compilation error when calling read_* functions from an #[async_trait] function. + // See https://github.com/apache/arrow-datafusion/issues/1154 + #[async_trait] + trait CallReadTrait { + async fn call_read_avro(&self) -> DataFrame; + } + + struct CallRead {} + + #[async_trait] + impl CallReadTrait for CallRead { + async fn call_read_avro(&self) -> DataFrame { + let ctx = SessionContext::new(); + ctx.read_avro("dummy", AvroReadOptions::default()) + .await + .unwrap() + } + } +} diff --git a/datafusion/core/src/execution/context/csv.rs b/datafusion/core/src/execution/context/csv.rs new file mode 100644 index 000000000000..f3675422c7d5 --- /dev/null +++ b/datafusion/core/src/execution/context/csv.rs @@ -0,0 +1,143 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::sync::Arc; + +use crate::datasource::physical_plan::plan_to_csv; + +use super::super::options::{CsvReadOptions, ReadOptions}; +use super::{DataFilePaths, DataFrame, ExecutionPlan, Result, SessionContext}; + +impl SessionContext { + /// Creates a [`DataFrame`] for reading a CSV data source. + /// + /// For more control such as reading multiple files, you can use + /// [`read_table`](Self::read_table) with a [`super::ListingTable`]. + /// + /// Example usage is given below: + /// + /// ``` + /// use datafusion::prelude::*; + /// # use datafusion::error::Result; + /// # #[tokio::main] + /// # async fn main() -> Result<()> { + /// let ctx = SessionContext::new(); + /// // You can read a single file using `read_csv` + /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; + /// // you can also read multiple files: + /// let df = ctx.read_csv(vec!["tests/data/example.csv", "tests/data/example.csv"], CsvReadOptions::new()).await?; + /// # Ok(()) + /// # } + /// ``` + pub async fn read_csv( + &self, + table_paths: P, + options: CsvReadOptions<'_>, + ) -> Result { + self._read_type(table_paths, options).await + } + + /// Registers a CSV file as a table which can referenced from SQL + /// statements executed against this context. + pub async fn register_csv( + &self, + name: &str, + table_path: &str, + options: CsvReadOptions<'_>, + ) -> Result<()> { + let listing_options = options.to_listing_options(&self.copied_config()); + + self.register_listing_table( + name, + table_path, + listing_options, + options.schema.map(|s| Arc::new(s.to_owned())), + None, + ) + .await?; + + Ok(()) + } + + /// Executes a query and writes the results to a partitioned CSV file. + pub async fn write_csv( + &self, + plan: Arc, + path: impl AsRef, + ) -> Result<()> { + plan_to_csv(self.task_ctx(), plan, path).await + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::assert_batches_eq; + use crate::test_util::{plan_and_collect, populate_csv_partitions}; + use async_trait::async_trait; + use tempfile::TempDir; + + #[tokio::test] + async fn query_csv_with_custom_partition_extension() -> Result<()> { + let tmp_dir = TempDir::new()?; + + // The main stipulation of this test: use a file extension that isn't .csv. + let file_extension = ".tst"; + + let ctx = SessionContext::new(); + let schema = populate_csv_partitions(&tmp_dir, 2, file_extension)?; + ctx.register_csv( + "test", + tmp_dir.path().to_str().unwrap(), + CsvReadOptions::new() + .schema(&schema) + .file_extension(file_extension), + ) + .await?; + let results = + plan_and_collect(&ctx, "SELECT SUM(c1), SUM(c2), COUNT(*) FROM test").await?; + + assert_eq!(results.len(), 1); + let expected = [ + "+--------------+--------------+----------+", + "| SUM(test.c1) | SUM(test.c2) | COUNT(*) |", + "+--------------+--------------+----------+", + "| 10 | 110 | 20 |", + "+--------------+--------------+----------+", + ]; + assert_batches_eq!(expected, &results); + + Ok(()) + } + + // Test for compilation error when calling read_* functions from an #[async_trait] function. 
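    // (Each of the new avro.rs, csv.rs and parquet.rs submodules now carries its own
    // copy of this regression test for the read_* method it defines.)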
+ // See https://github.com/apache/arrow-datafusion/issues/1154 + #[async_trait] + trait CallReadTrait { + async fn call_read_csv(&self) -> DataFrame; + } + + struct CallRead {} + + #[async_trait] + impl CallReadTrait for CallRead { + async fn call_read_csv(&self) -> DataFrame { + let ctx = SessionContext::new(); + ctx.read_csv("dummy", CsvReadOptions::new()).await.unwrap() + } + } +} diff --git a/datafusion/core/src/execution/context/json.rs b/datafusion/core/src/execution/context/json.rs new file mode 100644 index 000000000000..f67693aa8f31 --- /dev/null +++ b/datafusion/core/src/execution/context/json.rs @@ -0,0 +1,69 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::sync::Arc; + +use crate::datasource::physical_plan::plan_to_json; + +use super::super::options::{NdJsonReadOptions, ReadOptions}; +use super::{DataFilePaths, DataFrame, ExecutionPlan, Result, SessionContext}; + +impl SessionContext { + /// Creates a [`DataFrame`] for reading an JSON data source. + /// + /// For more control such as reading multiple files, you can use + /// [`read_table`](Self::read_table) with a [`super::ListingTable`]. + /// + /// For an example, see [`read_csv`](Self::read_csv) + pub async fn read_json( + &self, + table_paths: P, + options: NdJsonReadOptions<'_>, + ) -> Result { + self._read_type(table_paths, options).await + } + + /// Registers a JSON file as a table that it can be referenced + /// from SQL statements executed against this context. + pub async fn register_json( + &self, + name: &str, + table_path: &str, + options: NdJsonReadOptions<'_>, + ) -> Result<()> { + let listing_options = options.to_listing_options(&self.copied_config()); + + self.register_listing_table( + name, + table_path, + listing_options, + options.schema.map(|s| Arc::new(s.to_owned())), + None, + ) + .await?; + Ok(()) + } + + /// Executes a query and writes the results to a partitioned JSON file. + pub async fn write_json( + &self, + plan: Arc, + path: impl AsRef, + ) -> Result<()> { + plan_to_json(self.task_ctx(), plan, path).await + } +} diff --git a/datafusion/core/src/execution/context.rs b/datafusion/core/src/execution/context/mod.rs similarity index 87% rename from datafusion/core/src/execution/context.rs rename to datafusion/core/src/execution/context/mod.rs index 8bd4de742d69..d523c39ee01e 100644 --- a/datafusion/core/src/execution/context.rs +++ b/datafusion/core/src/execution/context/mod.rs @@ -16,6 +16,13 @@ // under the License. //! 
[`SessionContext`] contains methods for registering data sources and executing queries + +mod avro; +mod csv; +mod json; +#[cfg(feature = "parquet")] +mod parquet; + use crate::{ catalog::{CatalogList, MemoryCatalogList}, datasource::{ @@ -77,7 +84,6 @@ use datafusion_sql::{ use sqlparser::dialect::dialect_from_str; use crate::config::ConfigOptions; -use crate::datasource::physical_plan::{plan_to_csv, plan_to_json, plan_to_parquet}; use crate::execution::{runtime_env::RuntimeEnv, FunctionRegistry}; use crate::physical_plan::udaf::AggregateUDF; use crate::physical_plan::udf::ScalarUDF; @@ -92,7 +98,6 @@ use datafusion_sql::{ parser::DFParser, planner::{ContextProvider, SqlToRel}, }; -use parquet::file::properties::WriterProperties; use url::Url; use crate::catalog::information_schema::{InformationSchemaProvider, INFORMATION_SCHEMA}; @@ -110,9 +115,7 @@ use crate::execution::options::ArrowReadOptions; pub use datafusion_execution::config::SessionConfig; pub use datafusion_execution::TaskContext; -use super::options::{ - AvroReadOptions, CsvReadOptions, NdJsonReadOptions, ParquetReadOptions, ReadOptions, -}; +use super::options::ReadOptions; /// DataFilePaths adds a method to convert strings and vector of strings to vector of [`ListingTableUrl`] URLs. /// This allows methods such [`SessionContext::read_csv`] and [`SessionContext::read_avro`] @@ -856,34 +859,6 @@ impl SessionContext { self.read_table(Arc::new(provider)) } - /// Creates a [`DataFrame`] for reading an Avro data source. - /// - /// For more control such as reading multiple files, you can use - /// [`read_table`](Self::read_table) with a [`ListingTable`]. - /// - /// For an example, see [`read_csv`](Self::read_csv) - pub async fn read_avro( - &self, - table_paths: P, - options: AvroReadOptions<'_>, - ) -> Result { - self._read_type(table_paths, options).await - } - - /// Creates a [`DataFrame`] for reading an JSON data source. - /// - /// For more control such as reading multiple files, you can use - /// [`read_table`](Self::read_table) with a [`ListingTable`]. - /// - /// For an example, see [`read_csv`](Self::read_csv) - pub async fn read_json( - &self, - table_paths: P, - options: NdJsonReadOptions<'_>, - ) -> Result { - self._read_type(table_paths, options).await - } - /// Creates a [`DataFrame`] for reading an Arrow data source. /// /// For more control such as reading multiple files, you can use @@ -906,48 +881,6 @@ impl SessionContext { )) } - /// Creates a [`DataFrame`] for reading a CSV data source. - /// - /// For more control such as reading multiple files, you can use - /// [`read_table`](Self::read_table) with a [`ListingTable`]. - /// - /// Example usage is given below: - /// - /// ``` - /// use datafusion::prelude::*; - /// # use datafusion::error::Result; - /// # #[tokio::main] - /// # async fn main() -> Result<()> { - /// let ctx = SessionContext::new(); - /// // You can read a single file using `read_csv` - /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; - /// // you can also read multiple files: - /// let df = ctx.read_csv(vec!["tests/data/example.csv", "tests/data/example.csv"], CsvReadOptions::new()).await?; - /// # Ok(()) - /// # } - /// ``` - pub async fn read_csv( - &self, - table_paths: P, - options: CsvReadOptions<'_>, - ) -> Result { - self._read_type(table_paths, options).await - } - - /// Creates a [`DataFrame`] for reading a Parquet data source. 
- /// - /// For more control such as reading multiple files, you can use - /// [`read_table`](Self::read_table) with a [`ListingTable`]. - /// - /// For an example, see [`read_csv`](Self::read_csv) - pub async fn read_parquet( - &self, - table_paths: P, - options: ParquetReadOptions<'_>, - ) -> Result { - self._read_type(table_paths, options).await - } - /// Creates a [`DataFrame`] for a [`TableProvider`] such as a /// [`ListingTable`] or a custom user defined provider. pub fn read_table(&self, provider: Arc) -> Result { @@ -1008,91 +941,6 @@ impl SessionContext { Ok(()) } - /// Registers a CSV file as a table which can referenced from SQL - /// statements executed against this context. - pub async fn register_csv( - &self, - name: &str, - table_path: &str, - options: CsvReadOptions<'_>, - ) -> Result<()> { - let listing_options = options.to_listing_options(&self.copied_config()); - - self.register_listing_table( - name, - table_path, - listing_options, - options.schema.map(|s| Arc::new(s.to_owned())), - None, - ) - .await?; - - Ok(()) - } - - /// Registers a JSON file as a table that it can be referenced - /// from SQL statements executed against this context. - pub async fn register_json( - &self, - name: &str, - table_path: &str, - options: NdJsonReadOptions<'_>, - ) -> Result<()> { - let listing_options = options.to_listing_options(&self.copied_config()); - - self.register_listing_table( - name, - table_path, - listing_options, - options.schema.map(|s| Arc::new(s.to_owned())), - None, - ) - .await?; - Ok(()) - } - - /// Registers a Parquet file as a table that can be referenced from SQL - /// statements executed against this context. - pub async fn register_parquet( - &self, - name: &str, - table_path: &str, - options: ParquetReadOptions<'_>, - ) -> Result<()> { - let listing_options = options.to_listing_options(&self.state.read().config); - - self.register_listing_table( - name, - table_path, - listing_options, - options.schema.map(|s| Arc::new(s.to_owned())), - None, - ) - .await?; - Ok(()) - } - - /// Registers an Avro file as a table that can be referenced from - /// SQL statements executed against this context. - pub async fn register_avro( - &self, - name: &str, - table_path: &str, - options: AvroReadOptions<'_>, - ) -> Result<()> { - let listing_options = options.to_listing_options(&self.copied_config()); - - self.register_listing_table( - name, - table_path, - listing_options, - options.schema.map(|s| Arc::new(s.to_owned())), - None, - ) - .await?; - Ok(()) - } - /// Registers an Arrow file as a table that can be referenced from /// SQL statements executed against this context. pub async fn register_arrow( @@ -1268,34 +1116,6 @@ impl SessionContext { self.state().create_physical_plan(logical_plan).await } - /// Executes a query and writes the results to a partitioned CSV file. - pub async fn write_csv( - &self, - plan: Arc, - path: impl AsRef, - ) -> Result<()> { - plan_to_csv(self.task_ctx(), plan, path).await - } - - /// Executes a query and writes the results to a partitioned JSON file. - pub async fn write_json( - &self, - plan: Arc, - path: impl AsRef, - ) -> Result<()> { - plan_to_json(self.task_ctx(), plan, path).await - } - - /// Executes a query and writes the results to a partitioned Parquet file. 
- pub async fn write_parquet( - &self, - plan: Arc, - path: impl AsRef, - writer_properties: Option, - ) -> Result<()> { - plan_to_parquet(self.task_ctx(), plan, path, writer_properties).await - } - /// Get a new TaskContext to run in this session pub fn task_ctx(&self) -> Arc { Arc::new(TaskContext::from(self)) @@ -1447,6 +1267,7 @@ impl SessionState { // Create table_factories for all default formats let mut table_factories: HashMap> = HashMap::new(); + #[cfg(feature = "parquet")] table_factories.insert("PARQUET".into(), Arc::new(ListingTableFactory::new())); table_factories.insert("CSV".into(), Arc::new(ListingTableFactory::new())); table_factories.insert("JSON".into(), Arc::new(ListingTableFactory::new())); @@ -2238,22 +2059,21 @@ impl<'a> TreeNodeVisitor for BadPlanVisitor<'a> { #[cfg(test)] mod tests { + use super::super::options::CsvReadOptions; use super::*; use crate::assert_batches_eq; use crate::execution::context::QueryPlanner; use crate::execution::memory_pool::MemoryConsumer; use crate::execution::runtime_env::RuntimeConfig; use crate::test; - use crate::test_util::parquet_test_data; + use crate::test_util::{plan_and_collect, populate_csv_partitions}; use crate::variable::VarType; - use arrow::record_batch::RecordBatch; - use arrow_schema::{Field, Schema}; + use arrow_schema::Schema; use async_trait::async_trait; use datafusion_expr::Expr; - use std::fs::File; + use std::env; use std::path::PathBuf; use std::sync::Weak; - use std::{env, io::prelude::*}; use tempfile::TempDir; #[tokio::test] @@ -2348,39 +2168,6 @@ mod tests { Ok(()) } - #[tokio::test] - async fn query_csv_with_custom_partition_extension() -> Result<()> { - let tmp_dir = TempDir::new()?; - - // The main stipulation of this test: use a file extension that isn't .csv. - let file_extension = ".tst"; - - let ctx = SessionContext::new(); - let schema = populate_csv_partitions(&tmp_dir, 2, file_extension)?; - ctx.register_csv( - "test", - tmp_dir.path().to_str().unwrap(), - CsvReadOptions::new() - .schema(&schema) - .file_extension(file_extension), - ) - .await?; - let results = - plan_and_collect(&ctx, "SELECT SUM(c1), SUM(c2), COUNT(*) FROM test").await?; - - assert_eq!(results.len(), 1); - let expected = [ - "+--------------+--------------+----------+", - "| SUM(test.c1) | SUM(test.c2) | COUNT(*) |", - "+--------------+--------------+----------+", - "| 10 | 110 | 20 |", - "+--------------+--------------+----------+", - ]; - assert_batches_eq!(expected, &results); - - Ok(()) - } - #[tokio::test] async fn send_context_to_threads() -> Result<()> { // ensure SessionContexts can be used in a multi-threaded @@ -2645,60 +2432,6 @@ mod tests { Ok(()) } - #[tokio::test] - async fn read_with_glob_path() -> Result<()> { - let ctx = SessionContext::new(); - - let df = ctx - .read_parquet( - format!("{}/alltypes_plain*.parquet", parquet_test_data()), - ParquetReadOptions::default(), - ) - .await?; - let results = df.collect().await?; - let total_rows: usize = results.iter().map(|rb| rb.num_rows()).sum(); - // alltypes_plain.parquet = 8 rows, alltypes_plain.snappy.parquet = 2 rows, alltypes_dictionary.parquet = 2 rows - assert_eq!(total_rows, 10); - Ok(()) - } - - #[tokio::test] - async fn read_with_glob_path_issue_2465() -> Result<()> { - let ctx = SessionContext::new(); - - let df = ctx - .read_parquet( - // it was reported that when a path contains // (two consecutive separator) no files were found - // in this test, regardless of parquet_test_data() value, our path now contains a // - 
format!("{}/..//*/alltypes_plain*.parquet", parquet_test_data()), - ParquetReadOptions::default(), - ) - .await?; - let results = df.collect().await?; - let total_rows: usize = results.iter().map(|rb| rb.num_rows()).sum(); - // alltypes_plain.parquet = 8 rows, alltypes_plain.snappy.parquet = 2 rows, alltypes_dictionary.parquet = 2 rows - assert_eq!(total_rows, 10); - Ok(()) - } - - #[tokio::test] - async fn read_from_registered_table_with_glob_path() -> Result<()> { - let ctx = SessionContext::new(); - - ctx.register_parquet( - "test", - &format!("{}/alltypes_plain*.parquet", parquet_test_data()), - ParquetReadOptions::default(), - ) - .await?; - let df = ctx.sql("SELECT * FROM test").await?; - let results = df.collect().await?; - let total_rows: usize = results.iter().map(|rb| rb.num_rows()).sum(); - // alltypes_plain.parquet = 8 rows, alltypes_plain.snappy.parquet = 2 rows, alltypes_dictionary.parquet = 2 rows - assert_eq!(total_rows, 10); - Ok(()) - } - struct MyPhysicalPlanner {} #[async_trait] @@ -2738,43 +2471,6 @@ mod tests { } } - /// Execute SQL and return results - async fn plan_and_collect( - ctx: &SessionContext, - sql: &str, - ) -> Result> { - ctx.sql(sql).await?.collect().await - } - - /// Generate CSV partitions within the supplied directory - fn populate_csv_partitions( - tmp_dir: &TempDir, - partition_count: usize, - file_extension: &str, - ) -> Result { - // define schema for data source (csv file) - let schema = Arc::new(Schema::new(vec![ - Field::new("c1", DataType::UInt32, false), - Field::new("c2", DataType::UInt64, false), - Field::new("c3", DataType::Boolean, false), - ])); - - // generate a partitioned file - for partition in 0..partition_count { - let filename = format!("partition-{partition}.{file_extension}"); - let file_path = tmp_dir.path().join(filename); - let mut file = File::create(file_path)?; - - // generate some data - for i in 0..=10 { - let data = format!("{},{},{}\n", partition, i, i % 2 == 0); - file.write_all(data.as_bytes())?; - } - } - - Ok(schema) - } - /// Generate a partitioned CSV file and register it with an execution context async fn create_ctx( tmp_dir: &TempDir, @@ -2796,37 +2492,4 @@ mod tests { Ok(ctx) } - - // Test for compilation error when calling read_* functions from an #[async_trait] function. - // See https://github.com/apache/arrow-datafusion/issues/1154 - #[async_trait] - trait CallReadTrait { - async fn call_read_csv(&self) -> DataFrame; - async fn call_read_avro(&self) -> DataFrame; - async fn call_read_parquet(&self) -> DataFrame; - } - - struct CallRead {} - - #[async_trait] - impl CallReadTrait for CallRead { - async fn call_read_csv(&self) -> DataFrame { - let ctx = SessionContext::new(); - ctx.read_csv("dummy", CsvReadOptions::new()).await.unwrap() - } - - async fn call_read_avro(&self) -> DataFrame { - let ctx = SessionContext::new(); - ctx.read_avro("dummy", AvroReadOptions::default()) - .await - .unwrap() - } - - async fn call_read_parquet(&self) -> DataFrame { - let ctx = SessionContext::new(); - ctx.read_parquet("dummy", ParquetReadOptions::default()) - .await - .unwrap() - } - } } diff --git a/datafusion/core/src/execution/context/parquet.rs b/datafusion/core/src/execution/context/parquet.rs new file mode 100644 index 000000000000..b02576c6a868 --- /dev/null +++ b/datafusion/core/src/execution/context/parquet.rs @@ -0,0 +1,154 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::sync::Arc; + +use parquet::file::properties::WriterProperties; + +use crate::datasource::physical_plan::plan_to_parquet; + +use super::super::options::{ParquetReadOptions, ReadOptions}; +use super::{DataFilePaths, DataFrame, ExecutionPlan, Result, SessionContext}; + +impl SessionContext { + /// Creates a [`DataFrame`] for reading a Parquet data source. + /// + /// For more control such as reading multiple files, you can use + /// [`read_table`](Self::read_table) with a [`super::ListingTable`]. + /// + /// For an example, see [`read_csv`](Self::read_csv) + pub async fn read_parquet( + &self, + table_paths: P, + options: ParquetReadOptions<'_>, + ) -> Result { + self._read_type(table_paths, options).await + } + + /// Registers a Parquet file as a table that can be referenced from SQL + /// statements executed against this context. + pub async fn register_parquet( + &self, + name: &str, + table_path: &str, + options: ParquetReadOptions<'_>, + ) -> Result<()> { + let listing_options = options.to_listing_options(&self.state.read().config); + + self.register_listing_table( + name, + table_path, + listing_options, + options.schema.map(|s| Arc::new(s.to_owned())), + None, + ) + .await?; + Ok(()) + } + + /// Executes a query and writes the results to a partitioned Parquet file. 
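    // Unlike `DataFrame::write_parquet`, this variant takes an already-built physical
    // plan and forwards the optional `WriterProperties` straight to `plan_to_parquet`.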
+ pub async fn write_parquet( + &self, + plan: Arc, + path: impl AsRef, + writer_properties: Option, + ) -> Result<()> { + plan_to_parquet(self.task_ctx(), plan, path, writer_properties).await + } +} + +#[cfg(test)] +mod tests { + use async_trait::async_trait; + + use crate::test_util::parquet_test_data; + + use super::*; + + #[tokio::test] + async fn read_with_glob_path() -> Result<()> { + let ctx = SessionContext::new(); + + let df = ctx + .read_parquet( + format!("{}/alltypes_plain*.parquet", parquet_test_data()), + ParquetReadOptions::default(), + ) + .await?; + let results = df.collect().await?; + let total_rows: usize = results.iter().map(|rb| rb.num_rows()).sum(); + // alltypes_plain.parquet = 8 rows, alltypes_plain.snappy.parquet = 2 rows, alltypes_dictionary.parquet = 2 rows + assert_eq!(total_rows, 10); + Ok(()) + } + + #[tokio::test] + async fn read_with_glob_path_issue_2465() -> Result<()> { + let ctx = SessionContext::new(); + + let df = ctx + .read_parquet( + // it was reported that when a path contains // (two consecutive separator) no files were found + // in this test, regardless of parquet_test_data() value, our path now contains a // + format!("{}/..//*/alltypes_plain*.parquet", parquet_test_data()), + ParquetReadOptions::default(), + ) + .await?; + let results = df.collect().await?; + let total_rows: usize = results.iter().map(|rb| rb.num_rows()).sum(); + // alltypes_plain.parquet = 8 rows, alltypes_plain.snappy.parquet = 2 rows, alltypes_dictionary.parquet = 2 rows + assert_eq!(total_rows, 10); + Ok(()) + } + + #[tokio::test] + async fn read_from_registered_table_with_glob_path() -> Result<()> { + let ctx = SessionContext::new(); + + ctx.register_parquet( + "test", + &format!("{}/alltypes_plain*.parquet", parquet_test_data()), + ParquetReadOptions::default(), + ) + .await?; + let df = ctx.sql("SELECT * FROM test").await?; + let results = df.collect().await?; + let total_rows: usize = results.iter().map(|rb| rb.num_rows()).sum(); + // alltypes_plain.parquet = 8 rows, alltypes_plain.snappy.parquet = 2 rows, alltypes_dictionary.parquet = 2 rows + assert_eq!(total_rows, 10); + Ok(()) + } + + // Test for compilation error when calling read_* functions from an #[async_trait] function. + // See https://github.com/apache/arrow-datafusion/issues/1154 + #[async_trait] + trait CallReadTrait { + async fn call_read_parquet(&self) -> DataFrame; + } + + struct CallRead {} + + #[async_trait] + impl CallReadTrait for CallRead { + async fn call_read_parquet(&self) -> DataFrame { + let ctx = SessionContext::new(); + ctx.read_parquet("dummy", ParquetReadOptions::default()) + .await + .unwrap() + } + } +} diff --git a/datafusion/core/src/lib.rs b/datafusion/core/src/lib.rs index 5e9f130eade5..bf9a4abf4f2d 100644 --- a/datafusion/core/src/lib.rs +++ b/datafusion/core/src/lib.rs @@ -437,6 +437,7 @@ pub mod variable; // re-export dependencies from arrow-rs to minimize version maintenance for crate users pub use arrow; +#[cfg(feature = "parquet")] pub use parquet; // re-export DataFusion sub-crates at the top level. 
Use `pub use *` diff --git a/datafusion/core/src/physical_optimizer/enforce_distribution.rs b/datafusion/core/src/physical_optimizer/enforce_distribution.rs index 9cd7eff4722b..d3fbc46a6659 100644 --- a/datafusion/core/src/physical_optimizer/enforce_distribution.rs +++ b/datafusion/core/src/physical_optimizer/enforce_distribution.rs @@ -26,7 +26,9 @@ use std::fmt::Formatter; use std::sync::Arc; use crate::config::ConfigOptions; -use crate::datasource::physical_plan::{CsvExec, ParquetExec}; +use crate::datasource::physical_plan::CsvExec; +#[cfg(feature = "parquet")] +use crate::datasource::physical_plan::ParquetExec; use crate::error::Result; use crate::physical_optimizer::utils::{ add_sort_above, get_children_exectrees, get_plan_string, is_coalesce_partitions, @@ -1306,6 +1308,7 @@ fn ensure_distribution( // When `repartition_file_scans` is set, leverage source operators // (`ParquetExec`, `CsvExec` etc.) to increase parallelism at the source. if repartition_file_scans { + #[cfg(feature = "parquet")] if let Some(parquet_exec) = child.as_any().downcast_ref::() { @@ -1313,9 +1316,8 @@ fn ensure_distribution( target_partitions, repartition_file_min_size, )); - } else if let Some(csv_exec) = - child.as_any().downcast_ref::() - { + } + if let Some(csv_exec) = child.as_any().downcast_ref::() { if let Some(csv_exec) = csv_exec.get_repartitioned( target_partitions, repartition_file_min_size, @@ -1680,7 +1682,9 @@ mod tests { use crate::datasource::file_format::file_compression_type::FileCompressionType; use crate::datasource::listing::PartitionedFile; use crate::datasource::object_store::ObjectStoreUrl; - use crate::datasource::physical_plan::{FileScanConfig, ParquetExec}; + use crate::datasource::physical_plan::FileScanConfig; + #[cfg(feature = "parquet")] + use crate::datasource::physical_plan::ParquetExec; use crate::physical_optimizer::enforce_sorting::EnforceSorting; use crate::physical_optimizer::output_requirements::OutputRequirements; use crate::physical_plan::aggregates::{ @@ -1819,10 +1823,12 @@ mod tests { ])) } + #[cfg(feature = "parquet")] fn parquet_exec() -> Arc { parquet_exec_with_sort(vec![]) } + #[cfg(feature = "parquet")] fn parquet_exec_with_sort( output_ordering: Vec>, ) -> Arc { @@ -1843,11 +1849,13 @@ mod tests { )) } + #[cfg(feature = "parquet")] fn parquet_exec_multiple() -> Arc { parquet_exec_multiple_sorted(vec![]) } // Created a sorted parquet exec with multiple files + #[cfg(feature = "parquet")] fn parquet_exec_multiple_sorted( output_ordering: Vec>, ) -> Arc { @@ -2202,6 +2210,7 @@ mod tests { } #[test] + #[cfg(feature = "parquet")] fn multi_hash_joins() -> Result<()> { let left = parquet_exec(); let alias_pairs: Vec<(String, String)> = vec![ @@ -2364,6 +2373,7 @@ mod tests { } #[test] + #[cfg(feature = "parquet")] fn multi_joins_after_alias() -> Result<()> { let left = parquet_exec(); let right = parquet_exec(); @@ -2443,6 +2453,7 @@ mod tests { } #[test] + #[cfg(feature = "parquet")] fn multi_joins_after_multi_alias() -> Result<()> { let left = parquet_exec(); let right = parquet_exec(); @@ -2498,6 +2509,7 @@ mod tests { } #[test] + #[cfg(feature = "parquet")] fn join_after_agg_alias() -> Result<()> { // group by (a as a1) let left = aggregate_exec_with_alias( @@ -2537,6 +2549,7 @@ mod tests { } #[test] + #[cfg(feature = "parquet")] fn hash_join_key_ordering() -> Result<()> { // group by (a as a1, b as b1) let left = aggregate_exec_with_alias( @@ -2589,6 +2602,7 @@ mod tests { } #[test] + #[cfg(feature = "parquet")] fn multi_hash_join_key_ordering() -> Result<()> { 
let left = parquet_exec(); let alias_pairs: Vec<(String, String)> = vec![ @@ -2705,6 +2719,7 @@ mod tests { } #[test] + #[cfg(feature = "parquet")] fn reorder_join_keys_to_left_input() -> Result<()> { let left = parquet_exec(); let alias_pairs: Vec<(String, String)> = vec![ @@ -2835,6 +2850,7 @@ mod tests { } #[test] + #[cfg(feature = "parquet")] fn reorder_join_keys_to_right_input() -> Result<()> { let left = parquet_exec(); let alias_pairs: Vec<(String, String)> = vec![ @@ -2960,6 +2976,7 @@ mod tests { } #[test] + #[cfg(feature = "parquet")] fn multi_smj_joins() -> Result<()> { let left = parquet_exec(); let alias_pairs: Vec<(String, String)> = vec![ @@ -3233,6 +3250,7 @@ mod tests { } #[test] + #[cfg(feature = "parquet")] fn smj_join_key_ordering() -> Result<()> { // group by (a as a1, b as b1) let left = aggregate_exec_with_alias( @@ -3328,6 +3346,7 @@ mod tests { } #[test] + #[cfg(feature = "parquet")] fn merge_does_not_need_sort() -> Result<()> { // see https://github.com/apache/arrow-datafusion/issues/4331 let schema = schema(); @@ -3368,6 +3387,7 @@ mod tests { } #[test] + #[cfg(feature = "parquet")] fn union_to_interleave() -> Result<()> { // group by (a as a1) let left = aggregate_exec_with_alias( @@ -3409,6 +3429,7 @@ mod tests { } #[test] + #[cfg(feature = "parquet")] fn added_repartition_to_single_partition() -> Result<()> { let alias = vec![("a".to_string(), "a".to_string())]; let plan = aggregate_exec_with_alias(parquet_exec(), alias); @@ -3427,6 +3448,7 @@ mod tests { } #[test] + #[cfg(feature = "parquet")] fn repartition_deepest_node() -> Result<()> { let alias = vec![("a".to_string(), "a".to_string())]; let plan = aggregate_exec_with_alias(filter_exec(parquet_exec()), alias); @@ -3446,6 +3468,7 @@ mod tests { } #[test] + #[cfg(feature = "parquet")] fn repartition_unsorted_limit() -> Result<()> { let plan = limit_exec(filter_exec(parquet_exec())); @@ -3465,6 +3488,7 @@ mod tests { } #[test] + #[cfg(feature = "parquet")] fn repartition_sorted_limit() -> Result<()> { let schema = schema(); let sort_key = vec![PhysicalSortExpr { @@ -3487,6 +3511,7 @@ mod tests { } #[test] + #[cfg(feature = "parquet")] fn repartition_sorted_limit_with_filter() -> Result<()> { let schema = schema(); let sort_key = vec![PhysicalSortExpr { @@ -3512,6 +3537,7 @@ mod tests { } #[test] + #[cfg(feature = "parquet")] fn repartition_ignores_limit() -> Result<()> { let alias = vec![("a".to_string(), "a".to_string())]; let plan = aggregate_exec_with_alias( @@ -3542,6 +3568,7 @@ mod tests { } #[test] + #[cfg(feature = "parquet")] fn repartition_ignores_union() -> Result<()> { let plan = union_exec(vec![parquet_exec(); 5]); @@ -3561,6 +3588,7 @@ mod tests { } #[test] + #[cfg(feature = "parquet")] fn repartition_through_sort_preserving_merge() -> Result<()> { // sort preserving merge with non-sorted input let schema = schema(); @@ -3583,6 +3611,7 @@ mod tests { } #[test] + #[cfg(feature = "parquet")] fn repartition_ignores_sort_preserving_merge() -> Result<()> { // sort preserving merge already sorted input, let schema = schema(); @@ -3614,6 +3643,7 @@ mod tests { } #[test] + #[cfg(feature = "parquet")] fn repartition_ignores_sort_preserving_merge_with_union() -> Result<()> { // 2 sorted parquet files unioned (partitions are concatenated, sort is preserved) let schema = schema(); @@ -3646,6 +3676,7 @@ mod tests { } #[test] + #[cfg(feature = "parquet")] fn repartition_does_not_destroy_sort() -> Result<()> { // SortRequired // Parquet(sorted) @@ -3671,6 +3702,7 @@ mod tests { } #[test] + #[cfg(feature = 
"parquet")] fn repartition_does_not_destroy_sort_more_complex() -> Result<()> { // model a more complicated scenario where one child of a union can be repartitioned for performance // but the other can not be @@ -3709,6 +3741,7 @@ mod tests { } #[test] + #[cfg(feature = "parquet")] fn repartition_transitively_with_projection() -> Result<()> { let schema = schema(); let proj_exprs = vec![( @@ -3751,6 +3784,7 @@ mod tests { } #[test] + #[cfg(feature = "parquet")] fn repartition_ignores_transitively_with_projection() -> Result<()> { let schema = schema(); let sort_key = vec![PhysicalSortExpr { @@ -3781,6 +3815,7 @@ mod tests { } #[test] + #[cfg(feature = "parquet")] fn repartition_transitively_past_sort_with_projection() -> Result<()> { let schema = schema(); let sort_key = vec![PhysicalSortExpr { @@ -3810,6 +3845,7 @@ mod tests { } #[test] + #[cfg(feature = "parquet")] fn repartition_transitively_past_sort_with_filter() -> Result<()> { let schema = schema(); let sort_key = vec![PhysicalSortExpr { @@ -3842,6 +3878,7 @@ mod tests { } #[test] + #[cfg(feature = "parquet")] fn repartition_transitively_past_sort_with_projection_and_filter() -> Result<()> { let schema = schema(); let sort_key = vec![PhysicalSortExpr { @@ -3883,6 +3920,7 @@ mod tests { } #[test] + #[cfg(feature = "parquet")] fn parallelization_single_partition() -> Result<()> { let alias = vec![("a".to_string(), "a".to_string())]; let plan_parquet = aggregate_exec_with_alias(parquet_exec(), alias.clone()); @@ -3971,6 +4009,7 @@ mod tests { } #[test] + #[cfg(feature = "parquet")] fn parallelization_two_partitions() -> Result<()> { let alias = vec![("a".to_string(), "a".to_string())]; let plan_parquet = @@ -3998,6 +4037,7 @@ mod tests { } #[test] + #[cfg(feature = "parquet")] fn parallelization_two_partitions_into_four() -> Result<()> { let alias = vec![("a".to_string(), "a".to_string())]; let plan_parquet = @@ -4025,6 +4065,7 @@ mod tests { } #[test] + #[cfg(feature = "parquet")] fn parallelization_sorted_limit() -> Result<()> { let schema = schema(); let sort_key = vec![PhysicalSortExpr { @@ -4057,6 +4098,7 @@ mod tests { } #[test] + #[cfg(feature = "parquet")] fn parallelization_limit_with_filter() -> Result<()> { let schema = schema(); let sort_key = vec![PhysicalSortExpr { @@ -4102,6 +4144,7 @@ mod tests { } #[test] + #[cfg(feature = "parquet")] fn parallelization_ignores_limit() -> Result<()> { let alias = vec![("a".to_string(), "a".to_string())]; let plan_parquet = aggregate_exec_with_alias( @@ -4152,6 +4195,7 @@ mod tests { } #[test] + #[cfg(feature = "parquet")] fn parallelization_union_inputs() -> Result<()> { let plan_parquet = union_exec(vec![parquet_exec(); 5]); let plan_csv = union_exec(vec![csv_exec(); 5]); @@ -4181,6 +4225,7 @@ mod tests { } #[test] + #[cfg(feature = "parquet")] fn parallelization_prior_to_sort_preserving_merge() -> Result<()> { let schema = schema(); let sort_key = vec![PhysicalSortExpr { @@ -4211,6 +4256,7 @@ mod tests { } #[test] + #[cfg(feature = "parquet")] fn parallelization_sort_preserving_merge_with_union() -> Result<()> { let schema = schema(); let sort_key = vec![PhysicalSortExpr { @@ -4245,6 +4291,7 @@ mod tests { } #[test] + #[cfg(feature = "parquet")] fn parallelization_does_not_benefit() -> Result<()> { let schema = schema(); let sort_key = vec![PhysicalSortExpr { @@ -4273,6 +4320,7 @@ mod tests { } #[test] + #[cfg(feature = "parquet")] fn parallelization_ignores_transitively_with_projection_parquet() -> Result<()> { // sorted input let schema = schema(); @@ -4353,6 +4401,7 @@ mod tests { 
} #[test] + #[cfg(feature = "parquet")] fn remove_redundant_roundrobins() -> Result<()> { let input = parquet_exec(); let repartition = repartition_exec(repartition_exec(input)); @@ -4403,6 +4452,7 @@ mod tests { } #[test] + #[cfg(feature = "parquet")] fn do_not_preserve_ordering_through_repartition() -> Result<()> { let schema = schema(); let sort_key = vec![PhysicalSortExpr { @@ -4435,6 +4485,7 @@ mod tests { } #[test] + #[cfg(feature = "parquet")] fn do_not_preserve_ordering_through_repartition2() -> Result<()> { let schema = schema(); let sort_key = vec![PhysicalSortExpr { @@ -4473,6 +4524,7 @@ mod tests { } #[test] + #[cfg(feature = "parquet")] fn do_not_preserve_ordering_through_repartition3() -> Result<()> { let schema = schema(); let sort_key = vec![PhysicalSortExpr { @@ -4495,6 +4547,7 @@ mod tests { } #[test] + #[cfg(feature = "parquet")] fn do_not_put_sort_when_input_is_invalid() -> Result<()> { let schema = schema(); let sort_key = vec![PhysicalSortExpr { @@ -4533,6 +4586,7 @@ mod tests { } #[test] + #[cfg(feature = "parquet")] fn put_sort_when_input_is_valid() -> Result<()> { let schema = schema(); let sort_key = vec![PhysicalSortExpr { @@ -4575,6 +4629,7 @@ mod tests { } #[test] + #[cfg(feature = "parquet")] fn do_not_add_unnecessary_hash() -> Result<()> { let schema = schema(); let sort_key = vec![PhysicalSortExpr { @@ -4630,6 +4685,7 @@ mod tests { } #[test] + #[cfg(feature = "parquet")] fn optimize_away_unnecessary_repartition() -> Result<()> { let physical_plan = coalesce_partitions_exec(repartition_exec(parquet_exec())); let expected = &[ @@ -4649,6 +4705,7 @@ mod tests { } #[test] + #[cfg(feature = "parquet")] fn optimize_away_unnecessary_repartition2() -> Result<()> { let physical_plan = filter_exec(repartition_exec(coalesce_partitions_exec( filter_exec(repartition_exec(parquet_exec())), diff --git a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs index 419f62cff664..f941e88f3a36 100644 --- a/datafusion/core/src/physical_planner.rs +++ b/datafusion/core/src/physical_planner.rs @@ -25,6 +25,7 @@ use crate::datasource::file_format::arrow::ArrowFormat; use crate::datasource::file_format::avro::AvroFormat; use crate::datasource::file_format::csv::CsvFormat; use crate::datasource::file_format::json::JsonFormat; +#[cfg(feature = "parquet")] use crate::datasource::file_format::parquet::ParquetFormat; use crate::datasource::file_format::write::FileWriterMode; use crate::datasource::file_format::FileFormat; @@ -599,6 +600,7 @@ impl DefaultPhysicalPlanner { let sink_format: Arc = match file_format { FileType::CSV => Arc::new(CsvFormat::default()), + #[cfg(feature = "parquet")] FileType::PARQUET => Arc::new(ParquetFormat::default()), FileType::JSON => Arc::new(JsonFormat::default()), FileType::AVRO => Arc::new(AvroFormat {} ), diff --git a/datafusion/core/src/test_util/mod.rs b/datafusion/core/src/test_util/mod.rs index 4fe022f1769d..c6b43de0c18d 100644 --- a/datafusion/core/src/test_util/mod.rs +++ b/datafusion/core/src/test_util/mod.rs @@ -17,15 +17,21 @@ //! 
Utility functions to make testing DataFusion based crates easier +#[cfg(feature = "parquet")] pub mod parquet; use std::any::Any; use std::collections::HashMap; +use std::fs::File; +use std::io::Write; use std::path::Path; use std::pin::Pin; use std::sync::Arc; use std::task::{Context, Poll}; +use tempfile::TempDir; + +use crate::dataframe::DataFrame; use crate::datasource::provider::TableProviderFactory; use crate::datasource::{empty::EmptyTable, provider_as_source, TableProvider}; use crate::error::Result; @@ -48,9 +54,9 @@ use async_trait::async_trait; use futures::Stream; // backwards compatibility -pub use datafusion_common::test_util::{ - arrow_test_data, get_data_dir, parquet_test_data, -}; +#[cfg(feature = "parquet")] +pub use datafusion_common::test_util::parquet_test_data; +pub use datafusion_common::test_util::{arrow_test_data, get_data_dir}; pub use datafusion_common::{assert_batches_eq, assert_batches_sorted_eq}; @@ -102,6 +108,71 @@ pub fn aggr_test_schema() -> SchemaRef { Arc::new(schema) } +/// Register session context for the aggregate_test_100.csv file +pub async fn register_aggregate_csv( + ctx: &mut SessionContext, + table_name: &str, +) -> Result<()> { + let schema = aggr_test_schema(); + let testdata = arrow_test_data(); + ctx.register_csv( + table_name, + &format!("{testdata}/csv/aggregate_test_100.csv"), + CsvReadOptions::new().schema(schema.as_ref()), + ) + .await?; + Ok(()) +} + +/// Create a table from the aggregate_test_100.csv file with the specified name +pub async fn test_table_with_name(name: &str) -> Result { + let mut ctx = SessionContext::new(); + register_aggregate_csv(&mut ctx, name).await?; + ctx.table(name).await +} + +/// Create a table from the aggregate_test_100.csv file with the name "aggregate_test_100" +pub async fn test_table() -> Result { + test_table_with_name("aggregate_test_100").await +} + +/// Execute SQL and return results +pub async fn plan_and_collect( + ctx: &SessionContext, + sql: &str, +) -> Result> { + ctx.sql(sql).await?.collect().await +} + +/// Generate CSV partitions within the supplied directory +pub fn populate_csv_partitions( + tmp_dir: &TempDir, + partition_count: usize, + file_extension: &str, +) -> Result { + // define schema for data source (csv file) + let schema = Arc::new(Schema::new(vec![ + Field::new("c1", DataType::UInt32, false), + Field::new("c2", DataType::UInt64, false), + Field::new("c3", DataType::Boolean, false), + ])); + + // generate a partitioned file + for partition in 0..partition_count { + let filename = format!("partition-{partition}.{file_extension}"); + let file_path = tmp_dir.path().join(filename); + let mut file = File::create(file_path)?; + + // generate some data + for i in 0..=10 { + let data = format!("{},{},{}\n", partition, i, i % 2 == 0); + file.write_all(data.as_bytes())?; + } + } + + Ok(schema) +} + /// TableFactory for tests pub struct TestTableFactory {} diff --git a/datafusion/proto/Cargo.toml b/datafusion/proto/Cargo.toml index 94e77088a7e8..32e10e58a7d7 100644 --- a/datafusion/proto/Cargo.toml +++ b/datafusion/proto/Cargo.toml @@ -36,8 +36,9 @@ name = "datafusion_proto" path = "src/lib.rs" [features] -default = [] +default = ["parquet"] json = ["pbjson", "serde", "serde_json"] +parquet = ["datafusion/parquet", "datafusion-common/parquet"] [dependencies] arrow = { workspace = true } diff --git a/datafusion/proto/src/logical_plan/mod.rs b/datafusion/proto/src/logical_plan/mod.rs index df76fbb81396..e426c598523e 100644 --- a/datafusion/proto/src/logical_plan/mod.rs +++ 
b/datafusion/proto/src/logical_plan/mod.rs @@ -31,11 +31,11 @@ use crate::{ }; use arrow::datatypes::{DataType, Schema, SchemaRef}; +#[cfg(feature = "parquet")] +use datafusion::datasource::file_format::parquet::ParquetFormat; use datafusion::{ datasource::{ - file_format::{ - avro::AvroFormat, csv::CsvFormat, parquet::ParquetFormat, FileFormat, - }, + file_format::{avro::AvroFormat, csv::CsvFormat, FileFormat}, listing::{ListingOptions, ListingTable, ListingTableConfig, ListingTableUrl}, view::ViewTable, TableProvider, @@ -336,6 +336,7 @@ impl AsLogicalPlan for LogicalPlanNode { "logical_plan::from_proto() Unsupported file format '{self:?}'" )) })? { + #[cfg(feature = "parquet")] &FileFormatType::Parquet(protobuf::ParquetFormat {}) => { Arc::new(ParquetFormat::default()) } @@ -849,28 +850,49 @@ impl AsLogicalPlan for LogicalPlanNode { if let Some(listing_table) = source.downcast_ref::() { let any = listing_table.options().format.as_any(); - let file_format_type = if any.is::() { - FileFormatType::Parquet(protobuf::ParquetFormat {}) - } else if let Some(csv) = any.downcast_ref::() { - FileFormatType::Csv(protobuf::CsvFormat { - delimiter: byte_to_string(csv.delimiter(), "delimiter")?, - has_header: csv.has_header(), - quote: byte_to_string(csv.quote(), "quote")?, - optional_escape: if let Some(escape) = csv.escape() { - Some(protobuf::csv_format::OptionalEscape::Escape( - byte_to_string(escape, "escape")?, - )) - } else { - None - }, - }) - } else if any.is::() { - FileFormatType::Avro(protobuf::AvroFormat {}) - } else { - return Err(proto_error(format!( + let file_format_type = { + let mut maybe_some_type = None; + + #[cfg(feature = "parquet")] + if any.is::() { + maybe_some_type = + Some(FileFormatType::Parquet(protobuf::ParquetFormat {})) + }; + + if let Some(csv) = any.downcast_ref::() { + maybe_some_type = + Some(FileFormatType::Csv(protobuf::CsvFormat { + delimiter: byte_to_string( + csv.delimiter(), + "delimiter", + )?, + has_header: csv.has_header(), + quote: byte_to_string(csv.quote(), "quote")?, + optional_escape: if let Some(escape) = csv.escape() { + Some( + protobuf::csv_format::OptionalEscape::Escape( + byte_to_string(escape, "escape")?, + ), + ) + } else { + None + }, + })) + } + + if any.is::() { + maybe_some_type = + Some(FileFormatType::Avro(protobuf::AvroFormat {})) + } + + if let Some(file_format_type) = maybe_some_type { + file_format_type + } else { + return Err(proto_error(format!( "Error converting file format, {:?} is invalid as a datafusion format.", listing_table.options().format ))); + } }; let options = listing_table.options(); diff --git a/datafusion/proto/src/physical_plan/mod.rs b/datafusion/proto/src/physical_plan/mod.rs index ef870d8ac20b..431b8e42cdaf 100644 --- a/datafusion/proto/src/physical_plan/mod.rs +++ b/datafusion/proto/src/physical_plan/mod.rs @@ -22,7 +22,9 @@ use std::sync::Arc; use datafusion::arrow::compute::SortOptions; use datafusion::arrow::datatypes::SchemaRef; use datafusion::datasource::file_format::file_compression_type::FileCompressionType; -use datafusion::datasource::physical_plan::{AvroExec, CsvExec, ParquetExec}; +#[cfg(feature = "parquet")] +use datafusion::datasource::physical_plan::ParquetExec; +use datafusion::datasource::physical_plan::{AvroExec, CsvExec}; use datafusion::execution::runtime_env::RuntimeEnv; use datafusion::execution::FunctionRegistry; use datafusion::physical_plan::aggregates::{create_aggregate_expr, AggregateMode}; @@ -171,6 +173,7 @@ impl AsExecutionPlan for PhysicalPlanNode { }, 
FileCompressionType::UNCOMPRESSED, ))), + #[cfg(feature = "parquet")] PhysicalPlanType::ParquetScan(scan) => { let base_config = parse_protobuf_file_scan_config( scan.base_conf.as_ref().unwrap(), @@ -796,7 +799,7 @@ impl AsExecutionPlan for PhysicalPlanNode { let plan = plan.as_any(); if let Some(exec) = plan.downcast_ref::() { - Ok(protobuf::PhysicalPlanNode { + return Ok(protobuf::PhysicalPlanNode { physical_plan_type: Some(PhysicalPlanType::Explain( protobuf::ExplainExecNode { schema: Some(exec.schema().as_ref().try_into()?), @@ -808,8 +811,10 @@ impl AsExecutionPlan for PhysicalPlanNode { verbose: exec.verbose(), }, )), - }) - } else if let Some(exec) = plan.downcast_ref::() { + }); + } + + if let Some(exec) = plan.downcast_ref::() { let input = protobuf::PhysicalPlanNode::try_from_physical_plan( exec.input().to_owned(), extension_codec, @@ -820,7 +825,7 @@ impl AsExecutionPlan for PhysicalPlanNode { .map(|expr| expr.0.clone().try_into()) .collect::>>()?; let expr_name = exec.expr().iter().map(|expr| expr.1.clone()).collect(); - Ok(protobuf::PhysicalPlanNode { + return Ok(protobuf::PhysicalPlanNode { physical_plan_type: Some(PhysicalPlanType::Projection(Box::new( protobuf::ProjectionExecNode { input: Some(Box::new(input)), @@ -828,13 +833,15 @@ impl AsExecutionPlan for PhysicalPlanNode { expr_name, }, ))), - }) - } else if let Some(exec) = plan.downcast_ref::() { + }); + } + + if let Some(exec) = plan.downcast_ref::() { let input = protobuf::PhysicalPlanNode::try_from_physical_plan( exec.input().to_owned(), extension_codec, )?; - Ok(protobuf::PhysicalPlanNode { + return Ok(protobuf::PhysicalPlanNode { physical_plan_type: Some(PhysicalPlanType::Analyze(Box::new( protobuf::AnalyzeExecNode { verbose: exec.verbose(), @@ -843,27 +850,31 @@ impl AsExecutionPlan for PhysicalPlanNode { schema: Some(exec.schema().as_ref().try_into()?), }, ))), - }) - } else if let Some(exec) = plan.downcast_ref::() { + }); + } + + if let Some(exec) = plan.downcast_ref::() { let input = protobuf::PhysicalPlanNode::try_from_physical_plan( exec.input().to_owned(), extension_codec, )?; - Ok(protobuf::PhysicalPlanNode { + return Ok(protobuf::PhysicalPlanNode { physical_plan_type: Some(PhysicalPlanType::Filter(Box::new( protobuf::FilterExecNode { input: Some(Box::new(input)), expr: Some(exec.predicate().clone().try_into()?), }, ))), - }) - } else if let Some(limit) = plan.downcast_ref::() { + }); + } + + if let Some(limit) = plan.downcast_ref::() { let input = protobuf::PhysicalPlanNode::try_from_physical_plan( limit.input().to_owned(), extension_codec, )?; - Ok(protobuf::PhysicalPlanNode { + return Ok(protobuf::PhysicalPlanNode { physical_plan_type: Some(PhysicalPlanType::GlobalLimit(Box::new( protobuf::GlobalLimitExecNode { input: Some(Box::new(input)), @@ -874,21 +885,25 @@ impl AsExecutionPlan for PhysicalPlanNode { }, }, ))), - }) - } else if let Some(limit) = plan.downcast_ref::() { + }); + } + + if let Some(limit) = plan.downcast_ref::() { let input = protobuf::PhysicalPlanNode::try_from_physical_plan( limit.input().to_owned(), extension_codec, )?; - Ok(protobuf::PhysicalPlanNode { + return Ok(protobuf::PhysicalPlanNode { physical_plan_type: Some(PhysicalPlanType::LocalLimit(Box::new( protobuf::LocalLimitExecNode { input: Some(Box::new(input)), fetch: limit.fetch() as u32, }, ))), - }) - } else if let Some(exec) = plan.downcast_ref::() { + }); + } + + if let Some(exec) = plan.downcast_ref::() { let left = protobuf::PhysicalPlanNode::try_from_physical_plan( exec.left().to_owned(), extension_codec, @@ -943,7 
+958,7 @@ impl AsExecutionPlan for PhysicalPlanNode { PartitionMode::Auto => protobuf::PartitionMode::Auto, }; - Ok(protobuf::PhysicalPlanNode { + return Ok(protobuf::PhysicalPlanNode { physical_plan_type: Some(PhysicalPlanType::HashJoin(Box::new( protobuf::HashJoinExecNode { left: Some(Box::new(left)), @@ -955,8 +970,10 @@ impl AsExecutionPlan for PhysicalPlanNode { filter, }, ))), - }) - } else if let Some(exec) = plan.downcast_ref::() { + }); + } + + if let Some(exec) = plan.downcast_ref::() { let left = protobuf::PhysicalPlanNode::try_from_physical_plan( exec.left().to_owned(), extension_codec, @@ -965,15 +982,16 @@ impl AsExecutionPlan for PhysicalPlanNode { exec.right().to_owned(), extension_codec, )?; - Ok(protobuf::PhysicalPlanNode { + return Ok(protobuf::PhysicalPlanNode { physical_plan_type: Some(PhysicalPlanType::CrossJoin(Box::new( protobuf::CrossJoinExecNode { left: Some(Box::new(left)), right: Some(Box::new(right)), }, ))), - }) - } else if let Some(exec) = plan.downcast_ref::() { + }); + } + if let Some(exec) = plan.downcast_ref::() { let groups: Vec = exec .group_expr() .groups() @@ -1046,7 +1064,7 @@ impl AsExecutionPlan for PhysicalPlanNode { .map(|expr| expr.0.to_owned().try_into()) .collect::>>()?; - Ok(protobuf::PhysicalPlanNode { + return Ok(protobuf::PhysicalPlanNode { physical_plan_type: Some(PhysicalPlanType::Aggregate(Box::new( protobuf::AggregateExecNode { group_expr, @@ -1062,33 +1080,38 @@ impl AsExecutionPlan for PhysicalPlanNode { groups, }, ))), - }) - } else if let Some(empty) = plan.downcast_ref::() { + }); + } + + if let Some(empty) = plan.downcast_ref::() { let schema = empty.schema().as_ref().try_into()?; - Ok(protobuf::PhysicalPlanNode { + return Ok(protobuf::PhysicalPlanNode { physical_plan_type: Some(PhysicalPlanType::Empty( protobuf::EmptyExecNode { produce_one_row: empty.produce_one_row(), schema: Some(schema), }, )), - }) - } else if let Some(coalesce_batches) = plan.downcast_ref::() - { + }); + } + + if let Some(coalesce_batches) = plan.downcast_ref::() { let input = protobuf::PhysicalPlanNode::try_from_physical_plan( coalesce_batches.input().to_owned(), extension_codec, )?; - Ok(protobuf::PhysicalPlanNode { + return Ok(protobuf::PhysicalPlanNode { physical_plan_type: Some(PhysicalPlanType::CoalesceBatches(Box::new( protobuf::CoalesceBatchesExecNode { input: Some(Box::new(input)), target_batch_size: coalesce_batches.target_batch_size() as u32, }, ))), - }) - } else if let Some(exec) = plan.downcast_ref::() { - Ok(protobuf::PhysicalPlanNode { + }); + } + + if let Some(exec) = plan.downcast_ref::() { + return Ok(protobuf::PhysicalPlanNode { physical_plan_type: Some(PhysicalPlanType::CsvScan( protobuf::CsvScanExecNode { base_conf: Some(exec.base_config().try_into()?), @@ -1104,41 +1127,50 @@ impl AsExecutionPlan for PhysicalPlanNode { }, }, )), - }) - } else if let Some(exec) = plan.downcast_ref::() { + }); + } + + #[cfg(feature = "parquet")] + if let Some(exec) = plan.downcast_ref::() { let predicate = exec .predicate() .map(|pred| pred.clone().try_into()) .transpose()?; - Ok(protobuf::PhysicalPlanNode { + return Ok(protobuf::PhysicalPlanNode { physical_plan_type: Some(PhysicalPlanType::ParquetScan( protobuf::ParquetScanExecNode { base_conf: Some(exec.base_config().try_into()?), predicate, }, )), - }) - } else if let Some(exec) = plan.downcast_ref::() { - Ok(protobuf::PhysicalPlanNode { + }); + } + + if let Some(exec) = plan.downcast_ref::() { + return Ok(protobuf::PhysicalPlanNode { physical_plan_type: Some(PhysicalPlanType::AvroScan( 
protobuf::AvroScanExecNode { base_conf: Some(exec.base_config().try_into()?), }, )), - }) - } else if let Some(exec) = plan.downcast_ref::() { + }); + } + + if let Some(exec) = plan.downcast_ref::() { let input = protobuf::PhysicalPlanNode::try_from_physical_plan( exec.input().to_owned(), extension_codec, )?; - Ok(protobuf::PhysicalPlanNode { + return Ok(protobuf::PhysicalPlanNode { physical_plan_type: Some(PhysicalPlanType::Merge(Box::new( protobuf::CoalescePartitionsExecNode { input: Some(Box::new(input)), }, ))), - }) - } else if let Some(exec) = plan.downcast_ref::() { + }); + } + + if let Some(exec) = plan.downcast_ref::() { let input = protobuf::PhysicalPlanNode::try_from_physical_plan( exec.input().to_owned(), extension_codec, @@ -1162,15 +1194,17 @@ impl AsExecutionPlan for PhysicalPlanNode { } }; - Ok(protobuf::PhysicalPlanNode { + return Ok(protobuf::PhysicalPlanNode { physical_plan_type: Some(PhysicalPlanType::Repartition(Box::new( protobuf::RepartitionExecNode { input: Some(Box::new(input)), partition_method: Some(pb_partition_method), }, ))), - }) - } else if let Some(exec) = plan.downcast_ref::() { + }); + } + + if let Some(exec) = plan.downcast_ref::() { let input = protobuf::PhysicalPlanNode::try_from_physical_plan( exec.input().to_owned(), extension_codec, @@ -1191,7 +1225,7 @@ impl AsExecutionPlan for PhysicalPlanNode { }) }) .collect::>>()?; - Ok(protobuf::PhysicalPlanNode { + return Ok(protobuf::PhysicalPlanNode { physical_plan_type: Some(PhysicalPlanType::Sort(Box::new( protobuf::SortExecNode { input: Some(Box::new(input)), @@ -1203,8 +1237,10 @@ impl AsExecutionPlan for PhysicalPlanNode { preserve_partitioning: exec.preserve_partitioning(), }, ))), - }) - } else if let Some(union) = plan.downcast_ref::() { + }); + } + + if let Some(union) = plan.downcast_ref::() { let mut inputs: Vec = vec![]; for input in union.inputs() { inputs.push(protobuf::PhysicalPlanNode::try_from_physical_plan( @@ -1212,12 +1248,14 @@ impl AsExecutionPlan for PhysicalPlanNode { extension_codec, )?); } - Ok(protobuf::PhysicalPlanNode { + return Ok(protobuf::PhysicalPlanNode { physical_plan_type: Some(PhysicalPlanType::Union( protobuf::UnionExecNode { inputs }, )), - }) - } else if let Some(exec) = plan.downcast_ref::() { + }); + } + + if let Some(exec) = plan.downcast_ref::() { let input = protobuf::PhysicalPlanNode::try_from_physical_plan( exec.input().to_owned(), extension_codec, @@ -1238,7 +1276,7 @@ impl AsExecutionPlan for PhysicalPlanNode { }) }) .collect::>>()?; - Ok(protobuf::PhysicalPlanNode { + return Ok(protobuf::PhysicalPlanNode { physical_plan_type: Some(PhysicalPlanType::SortPreservingMerge( Box::new(protobuf::SortPreservingMergeExecNode { input: Some(Box::new(input)), @@ -1246,8 +1284,10 @@ impl AsExecutionPlan for PhysicalPlanNode { fetch: exec.fetch().map(|f| f as i64).unwrap_or(-1), }), )), - }) - } else if let Some(exec) = plan.downcast_ref::() { + }); + } + + if let Some(exec) = plan.downcast_ref::() { let left = protobuf::PhysicalPlanNode::try_from_physical_plan( exec.left().to_owned(), extension_codec, @@ -1283,7 +1323,7 @@ impl AsExecutionPlan for PhysicalPlanNode { }) .map_or(Ok(None), |v: Result| v.map(Some))?; - Ok(protobuf::PhysicalPlanNode { + return Ok(protobuf::PhysicalPlanNode { physical_plan_type: Some(PhysicalPlanType::NestedLoopJoin(Box::new( protobuf::NestedLoopJoinExecNode { left: Some(Box::new(left)), @@ -1292,8 +1332,10 @@ impl AsExecutionPlan for PhysicalPlanNode { filter, }, ))), - }) - } else if let Some(exec) = plan.downcast_ref::() { + }); + } + + if 
let Some(exec) = plan.downcast_ref::() { let input = protobuf::PhysicalPlanNode::try_from_physical_plan( exec.input().to_owned(), extension_codec, @@ -1311,7 +1353,7 @@ impl AsExecutionPlan for PhysicalPlanNode { .map(|e| e.clone().try_into()) .collect::>>()?; - Ok(protobuf::PhysicalPlanNode { + return Ok(protobuf::PhysicalPlanNode { physical_plan_type: Some(PhysicalPlanType::Window(Box::new( protobuf::WindowAggExecNode { input: Some(Box::new(input)), @@ -1320,8 +1362,10 @@ impl AsExecutionPlan for PhysicalPlanNode { partition_search_mode: None, }, ))), - }) - } else if let Some(exec) = plan.downcast_ref::() { + }); + } + + if let Some(exec) = plan.downcast_ref::() { let input = protobuf::PhysicalPlanNode::try_from_physical_plan( exec.input().to_owned(), extension_codec, @@ -1359,7 +1403,7 @@ impl AsExecutionPlan for PhysicalPlanNode { } }; - Ok(protobuf::PhysicalPlanNode { + return Ok(protobuf::PhysicalPlanNode { physical_plan_type: Some(PhysicalPlanType::Window(Box::new( protobuf::WindowAggExecNode { input: Some(Box::new(input)), @@ -1368,32 +1412,32 @@ impl AsExecutionPlan for PhysicalPlanNode { partition_search_mode: Some(partition_search_mode), }, ))), - }) - } else { - let mut buf: Vec = vec![]; - match extension_codec.try_encode(plan_clone.clone(), &mut buf) { - Ok(_) => { - let inputs: Vec = plan_clone - .children() - .into_iter() - .map(|i| { - protobuf::PhysicalPlanNode::try_from_physical_plan( - i, - extension_codec, - ) - }) - .collect::>()?; + }); + } - Ok(protobuf::PhysicalPlanNode { - physical_plan_type: Some(PhysicalPlanType::Extension( - protobuf::PhysicalExtensionNode { node: buf, inputs }, - )), + let mut buf: Vec = vec![]; + match extension_codec.try_encode(plan_clone.clone(), &mut buf) { + Ok(_) => { + let inputs: Vec = plan_clone + .children() + .into_iter() + .map(|i| { + protobuf::PhysicalPlanNode::try_from_physical_plan( + i, + extension_codec, + ) }) - } - Err(e) => internal_err!( - "Unsupported plan and extension codec failed with [{e}]. Plan: {plan_clone:?}" - ), + .collect::>()?; + + Ok(protobuf::PhysicalPlanNode { + physical_plan_type: Some(PhysicalPlanType::Extension( + protobuf::PhysicalExtensionNode { node: buf, inputs }, + )), + }) } + Err(e) => internal_err!( + "Unsupported plan and extension codec failed with [{e}]. 
Plan: {plan_clone:?}" + ), } } } From 128d7c6700bd6b8300cae86932b1e6d9bf74414d Mon Sep 17 00:00:00 2001 From: Mustafa Akur <106137913+mustafasrepo@users.noreply.github.com> Date: Thu, 26 Oct 2023 01:04:12 +0300 Subject: [PATCH 14/32] [MINOR]: Simplify enforce_distribution, minor changes (#7924) * Initial commit * Simplifications * Cleanup imports * Review --------- Co-authored-by: Mehmet Ozan Kabak --- .../enforce_distribution.rs | 196 +++++++----------- .../src/physical_optimizer/enforce_sorting.rs | 22 +- .../core/src/physical_optimizer/test_utils.rs | 3 +- .../core/tests/fuzz_cases/window_fuzz.rs | 17 +- .../physical-expr/src/aggregate/first_last.rs | 7 +- datafusion/physical-expr/src/physical_expr.rs | 71 +++++-- .../physical-expr/src/scalar_function.rs | 21 +- .../physical-plan/src/aggregates/mod.rs | 2 +- .../src/windows/bounded_window_agg_exec.rs | 12 +- datafusion/physical-plan/src/windows/mod.rs | 12 +- 10 files changed, 184 insertions(+), 179 deletions(-) diff --git a/datafusion/core/src/physical_optimizer/enforce_distribution.rs b/datafusion/core/src/physical_optimizer/enforce_distribution.rs index d3fbc46a6659..072c3cb6d7a6 100644 --- a/datafusion/core/src/physical_optimizer/enforce_distribution.rs +++ b/datafusion/core/src/physical_optimizer/enforce_distribution.rs @@ -486,7 +486,7 @@ fn reorder_aggregate_keys( parent_required: &[Arc], agg_exec: &AggregateExec, ) -> Result { - let out_put_columns = agg_exec + let output_columns = agg_exec .group_by() .expr() .iter() @@ -494,44 +494,32 @@ fn reorder_aggregate_keys( .map(|(index, (_col, name))| Column::new(name, index)) .collect::>(); - let out_put_exprs = out_put_columns + let output_exprs = output_columns .iter() - .map(|c| Arc::new(c.clone()) as Arc) + .map(|c| Arc::new(c.clone()) as _) .collect::>(); - if parent_required.len() != out_put_exprs.len() + if parent_required.len() != output_exprs.len() || !agg_exec.group_by().null_expr().is_empty() - || expr_list_eq_strict_order(&out_put_exprs, parent_required) + || expr_list_eq_strict_order(&output_exprs, parent_required) { Ok(PlanWithKeyRequirements::new(agg_plan)) } else { - let new_positions = expected_expr_positions(&out_put_exprs, parent_required); + let new_positions = expected_expr_positions(&output_exprs, parent_required); match new_positions { None => Ok(PlanWithKeyRequirements::new(agg_plan)), Some(positions) => { let new_partial_agg = if let Some(agg_exec) = agg_exec.input().as_any().downcast_ref::() - /*AggregateExec { - mode, - group_by, - aggr_expr, - filter_expr, - order_by_expr, - input, - input_schema, - .. 
- }) = - */ { if matches!(agg_exec.mode(), &AggregateMode::Partial) { - let mut new_group_exprs = vec![]; - for idx in positions.iter() { - new_group_exprs - .push(agg_exec.group_by().expr()[*idx].clone()); - } + let group_exprs = agg_exec.group_by().expr(); + let new_group_exprs = positions + .into_iter() + .map(|idx| group_exprs[idx].clone()) + .collect(); let new_partial_group_by = PhysicalGroupBy::new_single(new_group_exprs); - // new Partial AggregateExec Some(Arc::new(AggregateExec::try_new( AggregateMode::Partial, new_partial_group_by, @@ -549,18 +537,13 @@ fn reorder_aggregate_keys( }; if let Some(partial_agg) = new_partial_agg { // Build new group expressions that correspond to the output of partial_agg - let new_final_group: Vec> = - partial_agg.output_group_expr(); + let group_exprs = partial_agg.group_expr().expr(); + let new_final_group = partial_agg.output_group_expr(); let new_group_by = PhysicalGroupBy::new_single( new_final_group .iter() .enumerate() - .map(|(i, expr)| { - ( - expr.clone(), - partial_agg.group_expr().expr()[i].1.clone(), - ) - }) + .map(|(idx, expr)| (expr.clone(), group_exprs[idx].1.clone())) .collect(), ); @@ -575,29 +558,29 @@ fn reorder_aggregate_keys( )?); // Need to create a new projection to change the expr ordering back - let mut proj_exprs = out_put_columns + let agg_schema = new_final_agg.schema(); + let mut proj_exprs = output_columns .iter() .map(|col| { + let name = col.name(); ( Arc::new(Column::new( - col.name(), - new_final_agg.schema().index_of(col.name()).unwrap(), + name, + agg_schema.index_of(name).unwrap(), )) as Arc, - col.name().to_owned(), + name.to_owned(), ) }) .collect::>(); let agg_schema = new_final_agg.schema(); let agg_fields = agg_schema.fields(); for (idx, field) in - agg_fields.iter().enumerate().skip(out_put_columns.len()) + agg_fields.iter().enumerate().skip(output_columns.len()) { - proj_exprs.push(( - Arc::new(Column::new(field.name().as_str(), idx)) - as Arc, - field.name().clone(), - )) + let name = field.name(); + proj_exprs + .push((Arc::new(Column::new(name, idx)) as _, name.clone())) } // TODO merge adjacent Projections if there are Ok(PlanWithKeyRequirements::new(Arc::new( @@ -615,15 +598,14 @@ fn shift_right_required( parent_required: &[Arc], left_columns_len: usize, ) -> Option>> { - let new_right_required: Vec> = parent_required + let new_right_required = parent_required .iter() .filter_map(|r| { if let Some(col) = r.as_any().downcast_ref::() { - if col.index() >= left_columns_len { - Some( - Arc::new(Column::new(col.name(), col.index() - left_columns_len)) - as Arc, - ) + let idx = col.index(); + if idx >= left_columns_len { + let result = Column::new(col.name(), idx - left_columns_len); + Some(Arc::new(result) as _) } else { None } @@ -634,11 +616,7 @@ fn shift_right_required( .collect::>(); // if the parent required are all comming from the right side, the requirements can be pushdown - if new_right_required.len() != parent_required.len() { - None - } else { - Some(new_right_required) - } + (new_right_required.len() == parent_required.len()).then_some(new_right_required) } /// When the physical planner creates the Joins, the ordering of join keys is from the original query. @@ -662,8 +640,8 @@ fn shift_right_required( /// In that case, the datasources/tables might be pre-partitioned and we can't adjust the key ordering of the datasources /// and then can't apply the Top-Down reordering process. 
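/// For example (an illustrative sketch with hypothetical column names, not a case taken
/// from the tests below): suppose the join predicate is written as
///
/// ```text
/// t1.a = t2.x AND t1.b = t2.y
/// ```
///
/// while the left child already has output partitioning `Hash([b, a])`. Reordering the
/// join keys to `t1.b = t2.y AND t1.a = t2.x` lets the join reuse that existing
/// partitioning instead of forcing an additional repartition of its inputs.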
pub(crate) fn reorder_join_keys_to_inputs( - plan: Arc, -) -> Result> { + plan: Arc, +) -> Result> { let plan_any = plan.as_any(); if let Some(HashJoinExec { left, @@ -676,41 +654,34 @@ pub(crate) fn reorder_join_keys_to_inputs( .. }) = plan_any.downcast_ref::() { - match mode { - PartitionMode::Partitioned => { - let join_key_pairs = extract_join_keys(on); - if let Some(( - JoinKeyPairs { - left_keys, - right_keys, - }, - new_positions, - )) = reorder_current_join_keys( - join_key_pairs, - Some(left.output_partitioning()), - Some(right.output_partitioning()), - &left.equivalence_properties(), - &right.equivalence_properties(), - ) { - if !new_positions.is_empty() { - let new_join_on = new_join_conditions(&left_keys, &right_keys); - Ok(Arc::new(HashJoinExec::try_new( - left.clone(), - right.clone(), - new_join_on, - filter.clone(), - join_type, - PartitionMode::Partitioned, - *null_equals_null, - )?)) - } else { - Ok(plan) - } - } else { - Ok(plan) + if matches!(mode, PartitionMode::Partitioned) { + let join_key_pairs = extract_join_keys(on); + if let Some(( + JoinKeyPairs { + left_keys, + right_keys, + }, + new_positions, + )) = reorder_current_join_keys( + join_key_pairs, + Some(left.output_partitioning()), + Some(right.output_partitioning()), + &left.equivalence_properties(), + &right.equivalence_properties(), + ) { + if !new_positions.is_empty() { + let new_join_on = new_join_conditions(&left_keys, &right_keys); + return Ok(Arc::new(HashJoinExec::try_new( + left.clone(), + right.clone(), + new_join_on, + filter.clone(), + join_type, + PartitionMode::Partitioned, + *null_equals_null, + )?)); } } - _ => Ok(plan), } } else if let Some(SortMergeJoinExec { left, @@ -742,23 +713,18 @@ pub(crate) fn reorder_join_keys_to_inputs( for idx in 0..sort_options.len() { new_sort_options.push(sort_options[new_positions[idx]]) } - Ok(Arc::new(SortMergeJoinExec::try_new( + return Ok(Arc::new(SortMergeJoinExec::try_new( left.clone(), right.clone(), new_join_on, *join_type, new_sort_options, *null_equals_null, - )?)) - } else { - Ok(plan) + )?)); } - } else { - Ok(plan) } - } else { - Ok(plan) } + Ok(plan) } /// Reorder the current join keys ordering based on either left partition or right partition @@ -886,12 +852,7 @@ fn expected_expr_positions( fn extract_join_keys(on: &[(Column, Column)]) -> JoinKeyPairs { let (left_keys, right_keys) = on .iter() - .map(|(l, r)| { - ( - Arc::new(l.clone()) as Arc, - Arc::new(r.clone()) as Arc, - ) - }) + .map(|(l, r)| (Arc::new(l.clone()) as _, Arc::new(r.clone()) as _)) .unzip(); JoinKeyPairs { left_keys, @@ -903,7 +864,7 @@ fn new_join_conditions( new_left_keys: &[Arc], new_right_keys: &[Arc], ) -> Vec<(Column, Column)> { - let new_join_on = new_left_keys + new_left_keys .iter() .zip(new_right_keys.iter()) .map(|(l_key, r_key)| { @@ -912,8 +873,7 @@ fn new_join_conditions( r_key.as_any().downcast_ref::().unwrap().clone(), ) }) - .collect::>(); - new_join_on + .collect::>() } /// Updates `dist_onward` such that, to keep track of @@ -977,10 +937,10 @@ fn add_roundrobin_on_top( // (determined by flag `config.optimizer.bounded_order_preserving_variants`) let should_preserve_ordering = input.output_ordering().is_some(); - let new_plan = Arc::new( - RepartitionExec::try_new(input, Partitioning::RoundRobinBatch(n_target))? - .with_preserve_order(should_preserve_ordering), - ) as Arc; + let partitioning = Partitioning::RoundRobinBatch(n_target); + let repartition = RepartitionExec::try_new(input, partitioning)? 
+ .with_preserve_order(should_preserve_ordering); + let new_plan = Arc::new(repartition) as Arc; // update distribution onward with new operator update_distribution_onward(new_plan.clone(), dist_onward, input_idx); @@ -1009,7 +969,7 @@ fn add_roundrobin_on_top( /// /// # Returns /// -/// A [Result] object that contains new execution plan, where desired distribution is +/// A [`Result`] object that contains new execution plan, where desired distribution is /// satisfied by adding Hash Repartition. fn add_hash_on_top( input: Arc, @@ -1053,10 +1013,10 @@ fn add_hash_on_top( } else { input }; - new_plan = Arc::new( - RepartitionExec::try_new(new_plan, Partitioning::Hash(hash_exprs, n_target))? - .with_preserve_order(should_preserve_ordering), - ) as _; + let partitioning = Partitioning::Hash(hash_exprs, n_target); + let repartition = RepartitionExec::try_new(new_plan, partitioning)? + .with_preserve_order(should_preserve_ordering); + new_plan = Arc::new(repartition) as _; // update distribution onward with new operator update_distribution_onward(new_plan.clone(), dist_onward, input_idx); @@ -1146,7 +1106,7 @@ fn remove_dist_changing_operators( { // All of above operators have a single child. When we remove the top // operator, we take the first child. - plan = plan.children()[0].clone(); + plan = plan.children().swap_remove(0); distribution_onwards = get_children_exectrees(plan.children().len(), &distribution_onwards[0]); } @@ -1199,14 +1159,14 @@ fn replace_order_preserving_variants_helper( } if is_sort_preserving_merge(&exec_tree.plan) { return Ok(Arc::new(CoalescePartitionsExec::new( - updated_children[0].clone(), + updated_children.swap_remove(0), ))); } if let Some(repartition) = exec_tree.plan.as_any().downcast_ref::() { if repartition.preserve_order() { return Ok(Arc::new( RepartitionExec::try_new( - updated_children[0].clone(), + updated_children.swap_remove(0), repartition.partitioning().clone(), )? .with_preserve_order(false), @@ -1427,7 +1387,7 @@ fn ensure_distribution( // Data Arc::new(InterleaveExec::try_new(new_children)?) } else { - plan.clone().with_new_children(new_children)? + plan.with_new_children(new_children)? }, distribution_onwards, }; @@ -1624,7 +1584,7 @@ impl PlanWithKeyRequirements { let length = child.children().len(); PlanWithKeyRequirements { plan: child, - required_key_ordering: from_parent.clone(), + required_key_ordering: from_parent, request_key_ordering: vec![None; length], } }) diff --git a/datafusion/core/src/physical_optimizer/enforce_sorting.rs b/datafusion/core/src/physical_optimizer/enforce_sorting.rs index 913dae07faa1..822a224d236a 100644 --- a/datafusion/core/src/physical_optimizer/enforce_sorting.rs +++ b/datafusion/core/src/physical_optimizer/enforce_sorting.rs @@ -17,8 +17,8 @@ //! EnforceSorting optimizer rule inspects the physical plan with respect //! to local sorting requirements and does the following: -//! - Adds a [SortExec] when a requirement is not met, -//! - Removes an already-existing [SortExec] if it is possible to prove +//! - Adds a [`SortExec`] when a requirement is not met, +//! - Removes an already-existing [`SortExec`] if it is possible to prove //! that this sort is unnecessary //! The rule can work on valid *and* invalid physical plans with respect to //! sorting requirements, but always produces a valid physical plan in this sense. @@ -496,9 +496,10 @@ fn ensure_sorting( { // This SortPreservingMergeExec is unnecessary, input already has a // single partition. 
+ sort_onwards.truncate(1); return Ok(Transformed::Yes(PlanWithCorrespondingSort { - plan: children[0].clone(), - sort_onwards: vec![sort_onwards[0].clone()], + plan: children.swap_remove(0), + sort_onwards, })); } Ok(Transformed::Yes(PlanWithCorrespondingSort { @@ -649,7 +650,7 @@ fn remove_corresponding_coalesce_in_sub_plan( && is_repartition(&new_plan) && is_repartition(parent) { - new_plan = new_plan.children()[0].clone() + new_plan = new_plan.children().swap_remove(0) } new_plan } else { @@ -689,7 +690,7 @@ fn remove_corresponding_sort_from_sub_plan( ) -> Result> { // A `SortExec` is always at the bottom of the tree. let mut updated_plan = if is_sort(&sort_onwards.plan) { - sort_onwards.plan.children()[0].clone() + sort_onwards.plan.children().swap_remove(0) } else { let plan = &sort_onwards.plan; let mut children = plan.children(); @@ -703,12 +704,12 @@ fn remove_corresponding_sort_from_sub_plan( } // Replace with variants that do not preserve order. if is_sort_preserving_merge(plan) { - children[0].clone() + children.swap_remove(0) } else if let Some(repartition) = plan.as_any().downcast_ref::() { Arc::new( RepartitionExec::try_new( - children[0].clone(), + children.swap_remove(0), repartition.partitioning().clone(), )? .with_preserve_order(false), @@ -730,7 +731,7 @@ fn remove_corresponding_sort_from_sub_plan( updated_plan, )); } else { - updated_plan = Arc::new(CoalescePartitionsExec::new(updated_plan.clone())); + updated_plan = Arc::new(CoalescePartitionsExec::new(updated_plan)); } } Ok(updated_plan) @@ -777,8 +778,7 @@ mod tests { use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; use datafusion_common::Result; use datafusion_expr::JoinType; - use datafusion_physical_expr::expressions::Column; - use datafusion_physical_expr::expressions::{col, NotExpr}; + use datafusion_physical_expr::expressions::{col, Column, NotExpr}; fn create_test_schema() -> Result { let nullable_column = Field::new("nullable_col", DataType::Int32, true); diff --git a/datafusion/core/src/physical_optimizer/test_utils.rs b/datafusion/core/src/physical_optimizer/test_utils.rs index 53401751b67e..159ee5089075 100644 --- a/datafusion/core/src/physical_optimizer/test_utils.rs +++ b/datafusion/core/src/physical_optimizer/test_utils.rs @@ -44,6 +44,7 @@ use datafusion_execution::object_store::ObjectStoreUrl; use datafusion_expr::{AggregateFunction, WindowFrame, WindowFunction}; use datafusion_physical_expr::expressions::col; use datafusion_physical_expr::{PhysicalExpr, PhysicalSortExpr}; +use datafusion_physical_plan::windows::PartitionSearchMode; use async_trait::async_trait; @@ -239,7 +240,7 @@ pub fn bounded_window_exec( .unwrap()], input.clone(), vec![], - crate::physical_plan::windows::PartitionSearchMode::Sorted, + PartitionSearchMode::Sorted, ) .unwrap(), ) diff --git a/datafusion/core/tests/fuzz_cases/window_fuzz.rs b/datafusion/core/tests/fuzz_cases/window_fuzz.rs index 83c8e1f57896..db940a9794a1 100644 --- a/datafusion/core/tests/fuzz_cases/window_fuzz.rs +++ b/datafusion/core/tests/fuzz_cases/window_fuzz.rs @@ -22,9 +22,6 @@ use arrow::compute::{concat_batches, SortOptions}; use arrow::datatypes::SchemaRef; use arrow::record_batch::RecordBatch; use arrow::util::pretty::pretty_format_batches; -use hashbrown::HashMap; -use rand::rngs::StdRng; -use rand::{Rng, SeedableRng}; use datafusion::physical_plan::memory::MemoryExec; use datafusion::physical_plan::sorts::sort::SortExec; @@ -32,22 +29,26 @@ use datafusion::physical_plan::windows::{ create_window_expr, BoundedWindowAggExec, 
PartitionSearchMode, WindowAggExec, }; use datafusion::physical_plan::{collect, ExecutionPlan}; +use datafusion::prelude::{SessionConfig, SessionContext}; +use datafusion_common::{Result, ScalarValue}; +use datafusion_expr::type_coercion::aggregates::coerce_types; use datafusion_expr::{ AggregateFunction, BuiltInWindowFunction, WindowFrame, WindowFrameBound, WindowFrameUnits, WindowFunction, }; - -use datafusion::prelude::{SessionConfig, SessionContext}; -use datafusion_common::{Result, ScalarValue}; -use datafusion_expr::type_coercion::aggregates::coerce_types; use datafusion_physical_expr::expressions::{cast, col, lit}; use datafusion_physical_expr::{PhysicalExpr, PhysicalSortExpr}; use test_utils::add_empty_batches; +use hashbrown::HashMap; +use rand::rngs::StdRng; +use rand::{Rng, SeedableRng}; + #[cfg(test)] mod tests { use super::*; - use datafusion::physical_plan::windows::PartitionSearchMode::{ + + use datafusion_physical_plan::windows::PartitionSearchMode::{ Linear, PartiallySorted, Sorted, }; diff --git a/datafusion/physical-expr/src/aggregate/first_last.rs b/datafusion/physical-expr/src/aggregate/first_last.rs index ce7a1daeec64..a4e0a6dc49a9 100644 --- a/datafusion/physical-expr/src/aggregate/first_last.rs +++ b/datafusion/physical-expr/src/aggregate/first_last.rs @@ -26,12 +26,9 @@ use crate::{ reverse_order_bys, AggregateExpr, LexOrdering, PhysicalExpr, PhysicalSortExpr, }; -use arrow::array::ArrayRef; -use arrow::compute; -use arrow::compute::{lexsort_to_indices, SortColumn}; +use arrow::array::{Array, ArrayRef, AsArray, BooleanArray}; +use arrow::compute::{self, lexsort_to_indices, SortColumn}; use arrow::datatypes::{DataType, Field}; -use arrow_array::cast::AsArray; -use arrow_array::{Array, BooleanArray}; use arrow_schema::SortOptions; use datafusion_common::utils::{compare_rows, get_arrayref_at_indices, get_row_at_idx}; use datafusion_common::{DataFusionError, Result, ScalarValue}; diff --git a/datafusion/physical-expr/src/physical_expr.rs b/datafusion/physical-expr/src/physical_expr.rs index 0eff45b6b9f7..11fa6c899621 100644 --- a/datafusion/physical-expr/src/physical_expr.rs +++ b/datafusion/physical-expr/src/physical_expr.rs @@ -15,6 +15,11 @@ // specific language governing permissions and limitations // under the License. +use std::any::Any; +use std::fmt::{Debug, Display}; +use std::hash::{Hash, Hasher}; +use std::sync::Arc; + use crate::intervals::Interval; use crate::sort_properties::SortProperties; use crate::utils::scatter; @@ -27,11 +32,6 @@ use datafusion_common::utils::DataPtr; use datafusion_common::{internal_err, not_impl_err, DataFusionError, Result}; use datafusion_expr::ColumnarValue; -use std::any::Any; -use std::fmt::{Debug, Display}; -use std::hash::{Hash, Hasher}; -use std::sync::Arc; - /// Expression that can be evaluated against a RecordBatch /// A Physical expression knows its type, nullability and how to evaluate itself. pub trait PhysicalExpr: Send + Sync + Display + Debug + PartialEq { @@ -54,13 +54,12 @@ pub trait PhysicalExpr: Send + Sync + Display + Debug + PartialEq { let tmp_batch = filter_record_batch(batch, selection)?; let tmp_result = self.evaluate(&tmp_batch)?; - // All values from the `selection` filter are true. + if batch.num_rows() == tmp_batch.num_rows() { - return Ok(tmp_result); - } - if let ColumnarValue::Array(a) = tmp_result { - let result = scatter(selection, a.as_ref())?; - Ok(ColumnarValue::Array(result)) + // All values from the `selection` filter are true. 
+ Ok(tmp_result) + } else if let ColumnarValue::Array(a) = tmp_result { + scatter(selection, a.as_ref()).map(ColumnarValue::Array) } else { Ok(tmp_result) } @@ -216,8 +215,8 @@ pub fn down_cast_any_ref(any: &dyn Any) -> &dyn Any { } } -/// It is similar to contains method of vector. -/// Finds whether `expr` is among `physical_exprs`. +/// This function is similar to the `contains` method of `Vec`. It finds +/// whether `expr` is among `physical_exprs`. pub fn physical_exprs_contains( physical_exprs: &[Arc], expr: &Arc, @@ -226,3 +225,49 @@ pub fn physical_exprs_contains( .iter() .any(|physical_expr| physical_expr.eq(expr)) } + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use crate::expressions::{Column, Literal}; + use crate::physical_expr::{physical_exprs_contains, PhysicalExpr}; + + use datafusion_common::{Result, ScalarValue}; + + #[test] + fn test_physical_exprs_contains() -> Result<()> { + let lit_true = Arc::new(Literal::new(ScalarValue::Boolean(Some(true)))) + as Arc; + let lit_false = Arc::new(Literal::new(ScalarValue::Boolean(Some(false)))) + as Arc; + let lit4 = + Arc::new(Literal::new(ScalarValue::Int32(Some(4)))) as Arc; + let lit2 = + Arc::new(Literal::new(ScalarValue::Int32(Some(2)))) as Arc; + let lit1 = + Arc::new(Literal::new(ScalarValue::Int32(Some(1)))) as Arc; + let col_a_expr = Arc::new(Column::new("a", 0)) as Arc; + let col_b_expr = Arc::new(Column::new("b", 1)) as Arc; + let col_c_expr = Arc::new(Column::new("c", 2)) as Arc; + + // lit(true), lit(false), lit(4), lit(2), Col(a), Col(b) + let physical_exprs: Vec> = vec![ + lit_true.clone(), + lit_false.clone(), + lit4.clone(), + lit2.clone(), + col_a_expr.clone(), + col_b_expr.clone(), + ]; + // below expressions are inside physical_exprs + assert!(physical_exprs_contains(&physical_exprs, &lit_true)); + assert!(physical_exprs_contains(&physical_exprs, &lit2)); + assert!(physical_exprs_contains(&physical_exprs, &col_b_expr)); + + // below expressions are not inside physical_exprs + assert!(!physical_exprs_contains(&physical_exprs, &col_c_expr)); + assert!(!physical_exprs_contains(&physical_exprs, &lit1)); + Ok(()) + } +} diff --git a/datafusion/physical-expr/src/scalar_function.rs b/datafusion/physical-expr/src/scalar_function.rs index dc48baa23ab3..43598ce56489 100644 --- a/datafusion/physical-expr/src/scalar_function.rs +++ b/datafusion/physical-expr/src/scalar_function.rs @@ -29,24 +29,25 @@ //! This module also has a set of coercion rules to improve user experience: if an argument i32 is passed //! to a function that supports f64, it is coerced to f64. 
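//!
//! For instance (an illustrative sketch of that rule, not an exhaustive description of
//! the coercion logic): a call such as `sqrt(2)` passes an integer literal to a function
//! that operates on `f64` values, so the argument is coerced and the call is evaluated
//! as the equivalent of `sqrt(CAST(2 AS DOUBLE))` before this physical expression is built.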
+use std::any::Any; +use std::fmt::Debug; +use std::fmt::{self, Formatter}; +use std::hash::{Hash, Hasher}; +use std::sync::Arc; + use crate::functions::out_ordering; use crate::physical_expr::down_cast_any_ref; use crate::sort_properties::SortProperties; use crate::utils::expr_list_eq_strict_order; use crate::PhysicalExpr; + use arrow::datatypes::{DataType, Schema}; use arrow::record_batch::RecordBatch; use datafusion_common::Result; -use datafusion_expr::expr_vec_fmt; -use datafusion_expr::BuiltinScalarFunction; -use datafusion_expr::ColumnarValue; -use datafusion_expr::FuncMonotonicity; -use datafusion_expr::ScalarFunctionImplementation; -use std::any::Any; -use std::fmt::Debug; -use std::fmt::{self, Formatter}; -use std::hash::{Hash, Hasher}; -use std::sync::Arc; +use datafusion_expr::{ + expr_vec_fmt, BuiltinScalarFunction, ColumnarValue, FuncMonotonicity, + ScalarFunctionImplementation, +}; /// Physical expression of a scalar function pub struct ScalarFunctionExpr { diff --git a/datafusion/physical-plan/src/aggregates/mod.rs b/datafusion/physical-plan/src/aggregates/mod.rs index 1fa129680cea..4c612223178c 100644 --- a/datafusion/physical-plan/src/aggregates/mod.rs +++ b/datafusion/physical-plan/src/aggregates/mod.rs @@ -228,7 +228,7 @@ impl PhysicalGroupBy { } /// Return grouping expressions as they occur in the output schema. - fn output_exprs(&self) -> Vec> { + pub fn output_exprs(&self) -> Vec> { self.expr .iter() .enumerate() diff --git a/datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs b/datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs index eab47886c764..f6ffe2e26795 100644 --- a/datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs +++ b/datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs @@ -31,6 +31,7 @@ use crate::expressions::PhysicalSortExpr; use crate::metrics::{BaselineMetrics, ExecutionPlanMetricsSet, MetricsSet}; use crate::windows::{ calc_requirements, get_ordered_partition_by_indices, window_ordering_equivalence, + PartitionSearchMode, }; use crate::{ ColumnStatistics, DisplayAs, DisplayFormatType, Distribution, ExecutionPlan, @@ -68,17 +69,6 @@ use hashbrown::raw::RawTable; use indexmap::IndexMap; use log::debug; -#[derive(Debug, Clone, PartialEq)] -/// Specifies partition column properties in terms of input ordering -pub enum PartitionSearchMode { - /// None of the columns among the partition columns is ordered. - Linear, - /// Some columns of the partition columns are ordered but not all - PartiallySorted(Vec), - /// All Partition columns are ordered (Also empty case) - Sorted, -} - /// Window execution plan #[derive(Debug)] pub struct BoundedWindowAggExec { diff --git a/datafusion/physical-plan/src/windows/mod.rs b/datafusion/physical-plan/src/windows/mod.rs index cc915e54af60..aff936499a5e 100644 --- a/datafusion/physical-plan/src/windows/mod.rs +++ b/datafusion/physical-plan/src/windows/mod.rs @@ -54,13 +54,23 @@ mod bounded_window_agg_exec; mod window_agg_exec; pub use bounded_window_agg_exec::BoundedWindowAggExec; -pub use bounded_window_agg_exec::PartitionSearchMode; pub use window_agg_exec::WindowAggExec; pub use datafusion_physical_expr::window::{ BuiltInWindowExpr, PlainAggregateWindowExpr, WindowExpr, }; +#[derive(Debug, Clone, PartialEq)] +/// Specifies partition column properties in terms of input ordering +pub enum PartitionSearchMode { + /// None of the columns among the partition columns is ordered. 
+ Linear, + /// Some columns of the partition columns are ordered but not all + PartiallySorted(Vec), + /// All Partition columns are ordered (Also empty case) + Sorted, +} + /// Create a physical expression for window function pub fn create_window_expr( fun: &WindowFunction, From 4881b5db36a978ddd4523d5a03daf7c0938a23bd Mon Sep 17 00:00:00 2001 From: Jeffrey <22608443+Jefffrey@users.noreply.github.com> Date: Thu, 26 Oct 2023 09:18:52 +1100 Subject: [PATCH 15/32] Add simple window query to sqllogictest (#7928) --- datafusion/sqllogictest/test_files/window.slt | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/datafusion/sqllogictest/test_files/window.slt b/datafusion/sqllogictest/test_files/window.slt index 7226182cf3d0..213f6daaef3e 100644 --- a/datafusion/sqllogictest/test_files/window.slt +++ b/datafusion/sqllogictest/test_files/window.slt @@ -3378,3 +3378,10 @@ SELECT window1 AS (ORDER BY C3) ORDER BY C3 LIMIT 5 + + +# simple window query +query II +select sum(1) over() x, sum(1) over () y +---- +1 1 From 0911f1523ec7088bae88684ecb9bca94aa553693 Mon Sep 17 00:00:00 2001 From: Marco Neumann Date: Thu, 26 Oct 2023 13:06:37 +0200 Subject: [PATCH 16/32] ci: upgrade node to version 20 (#7918) --- .github/workflows/dev.yml | 2 +- .github/workflows/rust.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/dev.yml b/.github/workflows/dev.yml index 1f5088a1e6ce..cc23e99e8cba 100644 --- a/.github/workflows/dev.yml +++ b/.github/workflows/dev.yml @@ -43,7 +43,7 @@ jobs: - uses: actions/checkout@v4 - uses: actions/setup-node@v4 with: - node-version: "14" + node-version: "20" - name: Prettier check run: | # if you encounter error, rerun the command below and commit the changes diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 55f6cecf54aa..6c3d60bd4399 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -527,7 +527,7 @@ jobs: rust-version: stable - uses: actions/setup-node@v4 with: - node-version: "14" + node-version: "20" - name: Check if configs.md has been modified run: | # If you encounter an error, run './dev/update_config_docs.sh' and commit From 12b473b08bfc7f64f67b977ffb2252ac09a502ab Mon Sep 17 00:00:00 2001 From: comphead Date: Thu, 26 Oct 2023 07:29:32 -0700 Subject: [PATCH 17/32] Change input for `to_timestamp` function to be seconds rather than nanoseconds, add `to_timestamp_nanos` (#7844) * Change input for `to_timestamp` function * docs * fix examples * output `to_timestamp` signature as ns --- datafusion/core/tests/sql/expr.rs | 18 ++++---- datafusion/core/tests/sql/timestamp.rs | 2 +- datafusion/expr/src/built_in_function.rs | 17 ++++++++ datafusion/expr/src/expr_fn.rs | 6 +++ .../simplify_expressions/expr_simplifier.rs | 2 +- .../physical-expr/src/datetime_expressions.rs | 11 ++++- datafusion/physical-expr/src/functions.rs | 42 +++++++++++++++---- datafusion/proto/proto/datafusion.proto | 1 + datafusion/proto/src/generated/pbjson.rs | 3 ++ datafusion/proto/src/generated/prost.rs | 3 ++ .../proto/src/logical_plan/from_proto.rs | 7 +++- datafusion/proto/src/logical_plan/to_proto.rs | 1 + .../sqllogictest/test_files/timestamps.slt | 29 ++++++++++--- .../source/user-guide/sql/scalar_functions.md | 20 +++++++-- 14 files changed, 130 insertions(+), 32 deletions(-) diff --git a/datafusion/core/tests/sql/expr.rs b/datafusion/core/tests/sql/expr.rs index 1995a040158d..7d41ad4a881c 100644 --- a/datafusion/core/tests/sql/expr.rs +++ b/datafusion/core/tests/sql/expr.rs @@ -639,7 +639,7 @@ async fn 
test_uuid_expression() -> Result<()> { async fn test_extract_date_part() -> Result<()> { test_expression!("date_part('YEAR', CAST('2000-01-01' AS DATE))", "2000.0"); test_expression!( - "EXTRACT(year FROM to_timestamp('2020-09-08T12:00:00+00:00'))", + "EXTRACT(year FROM timestamp '2020-09-08T12:00:00+00:00')", "2020.0" ); test_expression!("date_part('QUARTER', CAST('2000-01-01' AS DATE))", "1.0"); @@ -686,35 +686,35 @@ async fn test_extract_date_part() -> Result<()> { "12.0" ); test_expression!( - "EXTRACT(second FROM to_timestamp('2020-09-08T12:00:12.12345678+00:00'))", + "EXTRACT(second FROM timestamp '2020-09-08T12:00:12.12345678+00:00')", "12.12345678" ); test_expression!( - "EXTRACT(millisecond FROM to_timestamp('2020-09-08T12:00:12.12345678+00:00'))", + "EXTRACT(millisecond FROM timestamp '2020-09-08T12:00:12.12345678+00:00')", "12123.45678" ); test_expression!( - "EXTRACT(microsecond FROM to_timestamp('2020-09-08T12:00:12.12345678+00:00'))", + "EXTRACT(microsecond FROM timestamp '2020-09-08T12:00:12.12345678+00:00')", "12123456.78" ); test_expression!( - "EXTRACT(nanosecond FROM to_timestamp('2020-09-08T12:00:12.12345678+00:00'))", + "EXTRACT(nanosecond FROM timestamp '2020-09-08T12:00:12.12345678+00:00')", "1.212345678e10" ); test_expression!( - "date_part('second', to_timestamp('2020-09-08T12:00:12.12345678+00:00'))", + "date_part('second', timestamp '2020-09-08T12:00:12.12345678+00:00')", "12.12345678" ); test_expression!( - "date_part('millisecond', to_timestamp('2020-09-08T12:00:12.12345678+00:00'))", + "date_part('millisecond', timestamp '2020-09-08T12:00:12.12345678+00:00')", "12123.45678" ); test_expression!( - "date_part('microsecond', to_timestamp('2020-09-08T12:00:12.12345678+00:00'))", + "date_part('microsecond', timestamp '2020-09-08T12:00:12.12345678+00:00')", "12123456.78" ); test_expression!( - "date_part('nanosecond', to_timestamp('2020-09-08T12:00:12.12345678+00:00'))", + "date_part('nanosecond', timestamp '2020-09-08T12:00:12.12345678+00:00')", "1.212345678e10" ); diff --git a/datafusion/core/tests/sql/timestamp.rs b/datafusion/core/tests/sql/timestamp.rs index ada66503a181..a18e6831b615 100644 --- a/datafusion/core/tests/sql/timestamp.rs +++ b/datafusion/core/tests/sql/timestamp.rs @@ -742,7 +742,7 @@ async fn test_arrow_typeof() -> Result<()> { "+-----------------------------------------------------------------------+", "| arrow_typeof(date_trunc(Utf8(\"microsecond\"),to_timestamp(Int64(61)))) |", "+-----------------------------------------------------------------------+", - "| Timestamp(Nanosecond, None) |", + "| Timestamp(Second, None) |", "+-----------------------------------------------------------------------+", ]; assert_batches_eq!(expected, &actual); diff --git a/datafusion/expr/src/built_in_function.rs b/datafusion/expr/src/built_in_function.rs index 16554133d828..9edee7649f67 100644 --- a/datafusion/expr/src/built_in_function.rs +++ b/datafusion/expr/src/built_in_function.rs @@ -266,6 +266,8 @@ pub enum BuiltinScalarFunction { ToTimestampMillis, /// to_timestamp_micros ToTimestampMicros, + /// to_timestamp_nanos + ToTimestampNanos, /// to_timestamp_seconds ToTimestampSeconds, /// from_unixtime @@ -444,6 +446,7 @@ impl BuiltinScalarFunction { BuiltinScalarFunction::ToTimestamp => Volatility::Immutable, BuiltinScalarFunction::ToTimestampMillis => Volatility::Immutable, BuiltinScalarFunction::ToTimestampMicros => Volatility::Immutable, + BuiltinScalarFunction::ToTimestampNanos => Volatility::Immutable, BuiltinScalarFunction::ToTimestampSeconds => 
Volatility::Immutable, BuiltinScalarFunction::Translate => Volatility::Immutable, BuiltinScalarFunction::Trim => Volatility::Immutable, @@ -755,6 +758,7 @@ impl BuiltinScalarFunction { BuiltinScalarFunction::ToTimestamp => Ok(Timestamp(Nanosecond, None)), BuiltinScalarFunction::ToTimestampMillis => Ok(Timestamp(Millisecond, None)), BuiltinScalarFunction::ToTimestampMicros => Ok(Timestamp(Microsecond, None)), + BuiltinScalarFunction::ToTimestampNanos => Ok(Timestamp(Nanosecond, None)), BuiltinScalarFunction::ToTimestampSeconds => Ok(Timestamp(Second, None)), BuiltinScalarFunction::FromUnixtime => Ok(Timestamp(Second, None)), BuiltinScalarFunction::Now => { @@ -995,6 +999,18 @@ impl BuiltinScalarFunction { ], self.volatility(), ), + BuiltinScalarFunction::ToTimestampNanos => Signature::uniform( + 1, + vec![ + Int64, + Timestamp(Nanosecond, None), + Timestamp(Microsecond, None), + Timestamp(Millisecond, None), + Timestamp(Second, None), + Utf8, + ], + self.volatility(), + ), BuiltinScalarFunction::ToTimestampSeconds => Signature::uniform( 1, vec![ @@ -1431,6 +1447,7 @@ fn aliases(func: &BuiltinScalarFunction) -> &'static [&'static str] { BuiltinScalarFunction::ToTimestampMillis => &["to_timestamp_millis"], BuiltinScalarFunction::ToTimestampMicros => &["to_timestamp_micros"], BuiltinScalarFunction::ToTimestampSeconds => &["to_timestamp_seconds"], + BuiltinScalarFunction::ToTimestampNanos => &["to_timestamp_nanos"], BuiltinScalarFunction::FromUnixtime => &["from_unixtime"], // hashing functions diff --git a/datafusion/expr/src/expr_fn.rs b/datafusion/expr/src/expr_fn.rs index 5368a2d8a22c..5a60c2470c95 100644 --- a/datafusion/expr/src/expr_fn.rs +++ b/datafusion/expr/src/expr_fn.rs @@ -834,6 +834,12 @@ scalar_expr!( date, "converts a string to a `Timestamp(Microseconds, None)`" ); +scalar_expr!( + ToTimestampNanos, + to_timestamp_nanos, + date, + "converts a string to a `Timestamp(Nanoseconds, None)`" +); scalar_expr!( ToTimestampSeconds, to_timestamp_seconds, diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index cb3f13a51ec4..04fdcca0a994 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -1501,7 +1501,7 @@ mod tests { test_evaluate(expr, lit("foobarbaz")); // Check non string arguments - // to_timestamp("2020-09-08T12:00:00+00:00") --> timestamp(1599566400000000000i64) + // to_timestamp("2020-09-08T12:00:00+00:00") --> timestamp(1599566400i64) let expr = call_fn("to_timestamp", vec![lit("2020-09-08T12:00:00+00:00")]).unwrap(); test_evaluate(expr, lit_timestamp_nano(1599566400000000000i64)); diff --git a/datafusion/physical-expr/src/datetime_expressions.rs b/datafusion/physical-expr/src/datetime_expressions.rs index 5cf1c21df5c2..bb8720cb8d00 100644 --- a/datafusion/physical-expr/src/datetime_expressions.rs +++ b/datafusion/physical-expr/src/datetime_expressions.rs @@ -154,6 +154,15 @@ pub fn to_timestamp_micros(args: &[ColumnarValue]) -> Result { ) } +/// to_timestamp_nanos SQL function +pub fn to_timestamp_nanos(args: &[ColumnarValue]) -> Result { + handle::( + args, + string_to_timestamp_nanos_shim, + "to_timestamp_nanos", + ) +} + /// to_timestamp_seconds SQL function pub fn to_timestamp_seconds(args: &[ColumnarValue]) -> Result { handle::( @@ -962,7 +971,7 @@ mod tests { let mut string_builder = StringBuilder::with_capacity(2, 1024); let mut ts_builder = TimestampNanosecondArray::builder(2); - 
string_builder.append_value("2020-09-08T13:42:29.190855Z"); + string_builder.append_value("2020-09-08T13:42:29.190855"); ts_builder.append_value(1599572549190855000); string_builder.append_null(); diff --git a/datafusion/physical-expr/src/functions.rs b/datafusion/physical-expr/src/functions.rs index f23b45e26a03..8422862043ae 100644 --- a/datafusion/physical-expr/src/functions.rs +++ b/datafusion/physical-expr/src/functions.rs @@ -74,15 +74,20 @@ pub fn create_physical_expr( // so we don't have to pay a per-array/batch cost. BuiltinScalarFunction::ToTimestamp => { Arc::new(match input_phy_exprs[0].data_type(input_schema) { - Ok(DataType::Int64) | Ok(DataType::Timestamp(_, None)) => { - |col_values: &[ColumnarValue]| { - cast_column( - &col_values[0], - &DataType::Timestamp(TimeUnit::Nanosecond, None), - None, - ) - } - } + Ok(DataType::Int64) => |col_values: &[ColumnarValue]| { + cast_column( + &col_values[0], + &DataType::Timestamp(TimeUnit::Second, None), + None, + ) + }, + Ok(DataType::Timestamp(_, None)) => |col_values: &[ColumnarValue]| { + cast_column( + &col_values[0], + &DataType::Timestamp(TimeUnit::Nanosecond, None), + None, + ) + }, Ok(DataType::Utf8) => datetime_expressions::to_timestamp, other => { return internal_err!( @@ -129,6 +134,25 @@ pub fn create_physical_expr( } }) } + BuiltinScalarFunction::ToTimestampNanos => { + Arc::new(match input_phy_exprs[0].data_type(input_schema) { + Ok(DataType::Int64) | Ok(DataType::Timestamp(_, None)) => { + |col_values: &[ColumnarValue]| { + cast_column( + &col_values[0], + &DataType::Timestamp(TimeUnit::Nanosecond, None), + None, + ) + } + } + Ok(DataType::Utf8) => datetime_expressions::to_timestamp_nanos, + other => { + return internal_err!( + "Unsupported data type {other:?} for function to_timestamp_nanos" + ); + } + }) + } BuiltinScalarFunction::ToTimestampSeconds => Arc::new({ match input_phy_exprs[0].data_type(input_schema) { Ok(DataType::Int64) | Ok(DataType::Timestamp(_, None)) => { diff --git a/datafusion/proto/proto/datafusion.proto b/datafusion/proto/proto/datafusion.proto index f4ab582593e0..9b6a0448f810 100644 --- a/datafusion/proto/proto/datafusion.proto +++ b/datafusion/proto/proto/datafusion.proto @@ -620,6 +620,7 @@ enum ScalarFunction { ArrayEmpty = 115; ArrayPopBack = 116; StringToArray = 117; + ToTimestampNanos = 118; } message ScalarFunctionNode { diff --git a/datafusion/proto/src/generated/pbjson.rs b/datafusion/proto/src/generated/pbjson.rs index e9e2fd0c0461..3eeb060f8d01 100644 --- a/datafusion/proto/src/generated/pbjson.rs +++ b/datafusion/proto/src/generated/pbjson.rs @@ -19772,6 +19772,7 @@ impl serde::Serialize for ScalarFunction { Self::ArrayEmpty => "ArrayEmpty", Self::ArrayPopBack => "ArrayPopBack", Self::StringToArray => "StringToArray", + Self::ToTimestampNanos => "ToTimestampNanos", }; serializer.serialize_str(variant) } @@ -19901,6 +19902,7 @@ impl<'de> serde::Deserialize<'de> for ScalarFunction { "ArrayEmpty", "ArrayPopBack", "StringToArray", + "ToTimestampNanos", ]; struct GeneratedVisitor; @@ -20059,6 +20061,7 @@ impl<'de> serde::Deserialize<'de> for ScalarFunction { "ArrayEmpty" => Ok(ScalarFunction::ArrayEmpty), "ArrayPopBack" => Ok(ScalarFunction::ArrayPopBack), "StringToArray" => Ok(ScalarFunction::StringToArray), + "ToTimestampNanos" => Ok(ScalarFunction::ToTimestampNanos), _ => Err(serde::de::Error::unknown_variant(value, FIELDS)), } } diff --git a/datafusion/proto/src/generated/prost.rs b/datafusion/proto/src/generated/prost.rs index 1c821708a971..d18bacfb3bcc 100644 --- 
a/datafusion/proto/src/generated/prost.rs +++ b/datafusion/proto/src/generated/prost.rs @@ -2465,6 +2465,7 @@ pub enum ScalarFunction { ArrayEmpty = 115, ArrayPopBack = 116, StringToArray = 117, + ToTimestampNanos = 118, } impl ScalarFunction { /// String value of the enum field names used in the ProtoBuf definition. @@ -2591,6 +2592,7 @@ impl ScalarFunction { ScalarFunction::ArrayEmpty => "ArrayEmpty", ScalarFunction::ArrayPopBack => "ArrayPopBack", ScalarFunction::StringToArray => "StringToArray", + ScalarFunction::ToTimestampNanos => "ToTimestampNanos", } } /// Creates an enum from field names used in the ProtoBuf definition. @@ -2714,6 +2716,7 @@ impl ScalarFunction { "ArrayEmpty" => Some(Self::ArrayEmpty), "ArrayPopBack" => Some(Self::ArrayPopBack), "StringToArray" => Some(Self::StringToArray), + "ToTimestampNanos" => Some(Self::ToTimestampNanos), _ => None, } } diff --git a/datafusion/proto/src/logical_plan/from_proto.rs b/datafusion/proto/src/logical_plan/from_proto.rs index c87882ca72fc..26bd0163d0a3 100644 --- a/datafusion/proto/src/logical_plan/from_proto.rs +++ b/datafusion/proto/src/logical_plan/from_proto.rs @@ -54,7 +54,8 @@ use datafusion_expr::{ random, regexp_match, regexp_replace, repeat, replace, reverse, right, round, rpad, rtrim, sha224, sha256, sha384, sha512, signum, sin, sinh, split_part, sqrt, starts_with, strpos, substr, substring, tan, tanh, to_hex, to_timestamp_micros, - to_timestamp_millis, to_timestamp_seconds, translate, trim, trunc, upper, uuid, + to_timestamp_millis, to_timestamp_nanos, to_timestamp_seconds, translate, trim, + trunc, upper, uuid, window_frame::regularize, AggregateFunction, Between, BinaryExpr, BuiltInWindowFunction, BuiltinScalarFunction, Case, Cast, Expr, GetFieldAccess, GetIndexedField, GroupingSet, @@ -521,6 +522,7 @@ impl From<&protobuf::ScalarFunction> for BuiltinScalarFunction { ScalarFunction::Substr => Self::Substr, ScalarFunction::ToHex => Self::ToHex, ScalarFunction::ToTimestampMicros => Self::ToTimestampMicros, + ScalarFunction::ToTimestampNanos => Self::ToTimestampNanos, ScalarFunction::ToTimestampSeconds => Self::ToTimestampSeconds, ScalarFunction::Now => Self::Now, ScalarFunction::CurrentDate => Self::CurrentDate, @@ -1592,6 +1594,9 @@ pub fn parse_expr( ScalarFunction::ToTimestampMicros => { Ok(to_timestamp_micros(parse_expr(&args[0], registry)?)) } + ScalarFunction::ToTimestampNanos => { + Ok(to_timestamp_nanos(parse_expr(&args[0], registry)?)) + } ScalarFunction::ToTimestampSeconds => { Ok(to_timestamp_seconds(parse_expr(&args[0], registry)?)) } diff --git a/datafusion/proto/src/logical_plan/to_proto.rs b/datafusion/proto/src/logical_plan/to_proto.rs index 125ced032e20..687b73cfc886 100644 --- a/datafusion/proto/src/logical_plan/to_proto.rs +++ b/datafusion/proto/src/logical_plan/to_proto.rs @@ -1522,6 +1522,7 @@ impl TryFrom<&BuiltinScalarFunction> for protobuf::ScalarFunction { BuiltinScalarFunction::Substr => Self::Substr, BuiltinScalarFunction::ToHex => Self::ToHex, BuiltinScalarFunction::ToTimestampMicros => Self::ToTimestampMicros, + BuiltinScalarFunction::ToTimestampNanos => Self::ToTimestampNanos, BuiltinScalarFunction::ToTimestampSeconds => Self::ToTimestampSeconds, BuiltinScalarFunction::Now => Self::Now, BuiltinScalarFunction::CurrentDate => Self::CurrentDate, diff --git a/datafusion/sqllogictest/test_files/timestamps.slt b/datafusion/sqllogictest/test_files/timestamps.slt index fea61b076ebc..e186aa12f7a9 100644 --- a/datafusion/sqllogictest/test_files/timestamps.slt +++ 
b/datafusion/sqllogictest/test_files/timestamps.slt @@ -217,7 +217,7 @@ SELECT to_timestamp_micros(ts) FROM ts_data_secs LIMIT 3 # to nanos query P -SELECT to_timestamp(ts) FROM ts_data_secs LIMIT 3 +SELECT to_timestamp_nanos(ts) FROM ts_data_secs LIMIT 3 ---- 2020-09-08T13:42:29 2020-09-08T12:42:29 @@ -244,7 +244,7 @@ SELECT to_timestamp_seconds(ts) FROM ts_data_micros LIMIT 3 2020-09-08T11:42:29 -# Original column is micros, convert to nanos and check timestamp +# Original column is micros, convert to seconds and check timestamp query P SELECT to_timestamp(ts) FROM ts_data_micros LIMIT 3 @@ -266,7 +266,7 @@ SELECT from_unixtime(ts / 1000000000) FROM ts_data LIMIT 3; # to_timestamp query I -SELECT COUNT(*) FROM ts_data_nanos where ts > to_timestamp('2020-09-08T12:00:00+00:00') +SELECT COUNT(*) FROM ts_data_nanos where ts > timestamp '2020-09-08T12:00:00+00:00' ---- 2 @@ -375,7 +375,7 @@ set datafusion.optimizer.skip_failed_rules = true query P select to_timestamp(a) from (select to_timestamp(1) as a) A; ---- -1970-01-01T00:00:00.000000001 +1970-01-01T00:00:01 # cast_to_timestamp_seconds_twice query P @@ -383,7 +383,6 @@ select to_timestamp_seconds(a) from (select to_timestamp_seconds(1) as a)A ---- 1970-01-01T00:00:01 - # cast_to_timestamp_millis_twice query P select to_timestamp_millis(a) from (select to_timestamp_millis(1) as a)A; @@ -396,11 +395,17 @@ select to_timestamp_micros(a) from (select to_timestamp_micros(1) as a)A; ---- 1970-01-01T00:00:00.000001 +# cast_to_timestamp_nanos_twice +query P +select to_timestamp_nanos(a) from (select to_timestamp_nanos(1) as a)A; +---- +1970-01-01T00:00:00.000000001 + # to_timestamp_i32 query P select to_timestamp(cast (1 as int)); ---- -1970-01-01T00:00:00.000000001 +1970-01-01T00:00:01 # to_timestamp_micros_i32 query P @@ -408,6 +413,12 @@ select to_timestamp_micros(cast (1 as int)); ---- 1970-01-01T00:00:00.000001 +# to_timestamp_nanos_i32 +query P +select to_timestamp_nanos(cast (1 as int)); +---- +1970-01-01T00:00:00.000000001 + # to_timestamp_millis_i32 query P select to_timestamp_millis(cast (1 as int)); @@ -1776,3 +1787,9 @@ query B SELECT TIMESTAMPTZ '2020-01-01 00:00:00Z' = TIMESTAMP '2020-01-01' ---- true + +# verify to_timestamp edge cases to be in sync with postgresql +query PPPPP +SELECT to_timestamp(null), to_timestamp(-62125747200), to_timestamp(0), to_timestamp(1926632005177), to_timestamp(1926632005) +---- +NULL 0001-04-25T00:00:00 1970-01-01T00:00:00 +63022-07-16T12:59:37 2031-01-19T23:33:25 diff --git a/docs/source/user-guide/sql/scalar_functions.md b/docs/source/user-guide/sql/scalar_functions.md index d5717b9c2130..b7426baea3da 100644 --- a/docs/source/user-guide/sql/scalar_functions.md +++ b/docs/source/user-guide/sql/scalar_functions.md @@ -1218,6 +1218,7 @@ regexp_replace(str, regexp, replacement, flags) - [to_timestamp_millis](#to_timestamp_millis) - [to_timestamp_micros](#to_timestamp_micros) - [to_timestamp_seconds](#to_timestamp_seconds) +- [to_timestamp_nanos](#to_timestamp_nanos) - [from_unixtime](#from_unixtime) ### `now` @@ -1390,10 +1391,10 @@ extract(field FROM source) ### `to_timestamp` -Converts a value to RFC3339 nanosecond timestamp format (`YYYY-MM-DDT00:00:00.000000000Z`). +Converts a value to RFC3339 nanosecond timestamp format (`YYYY-MM-DDT00:00:00Z`). Supports timestamp, integer, and unsigned integer types as input. -Integers and unsigned integers are parsed as Unix nanosecond timestamps and -return the corresponding RFC3339 nanosecond timestamp. 
+Integers and unsigned integers are parsed as Unix second timestamps and +return the corresponding RFC3339 timestamp. ``` to_timestamp(expression) @@ -1428,7 +1429,18 @@ Integers and unsigned integers are parsed as Unix nanosecond timestamps and return the corresponding RFC3339 timestamp. ``` -to_timestamp_micros(expression) +to_timestamp_nanos(expression) +``` + +### `to_timestamp_nanos` + +Converts a value to RFC3339 nanosecond timestamp format (`YYYY-MM-DDT00:00:00.000000000Z`). +Supports timestamp, integer, and unsigned integer types as input. +Integers and unsigned integers are parsed as Unix nanosecond timestamps and +return the corresponding RFC3339 timestamp. + +``` +to_timestamp_nanos(expression) ``` #### Arguments From 6e87f5958fe0f1f562bac49bacee8456e747de25 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 26 Oct 2023 10:31:01 -0400 Subject: [PATCH 18/32] Minor: Document `parquet` crate feature (#7927) --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 2f10812f9a59..1997a6f73dd5 100644 --- a/README.md +++ b/README.md @@ -47,6 +47,7 @@ Default features: - `compression`: reading files compressed with `xz2`, `bzip2`, `flate2`, and `zstd` - `crypto_expressions`: cryptographic functions such as `md5` and `sha256` - `encoding_expressions`: `encode` and `decode` functions +- `parquet`: support for reading the [Apache Parquet] format - `regex_expressions`: regular expression functions, such as `regexp_match` - `unicode_expressions`: Include unicode aware functions such as `character_length` @@ -59,6 +60,7 @@ Optional features: - `simd`: enable arrow-rs's manual `SIMD` kernels (requires Rust `nightly`) [apache avro]: https://avro.apache.org/ +[apache parquet]: https://parquet.apache.org/ ## Rust Version Compatibility From a892300a5a56c97b5b4ddc9aa4a421aaf412d0fe Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 26 Oct 2023 10:32:13 -0400 Subject: [PATCH 19/32] Minor: reduce some #cfg(feature = "parquet") (#7929) --- datafusion/core/src/datasource/file_format/parquet.rs | 4 +--- datafusion/core/src/datasource/mod.rs | 2 -- datafusion/core/src/datasource/physical_plan/mod.rs | 2 -- datafusion/core/src/execution/context/parquet.rs | 3 +-- 4 files changed, 2 insertions(+), 9 deletions(-) diff --git a/datafusion/core/src/datasource/file_format/parquet.rs b/datafusion/core/src/datasource/file_format/parquet.rs index 62867c0e2b38..2cba474e559e 100644 --- a/datafusion/core/src/datasource/file_format/parquet.rs +++ b/datafusion/core/src/datasource/file_format/parquet.rs @@ -36,7 +36,7 @@ use tokio::sync::mpsc::{self, Receiver, Sender}; use tokio::task::{JoinHandle, JoinSet}; use crate::datasource::file_format::file_compression_type::FileCompressionType; -use crate::datasource::statistics::create_max_min_accs; +use crate::datasource::statistics::{create_max_min_accs, get_col_stats}; use arrow::datatypes::SchemaRef; use arrow::datatypes::{Fields, Schema}; use bytes::{BufMut, BytesMut}; @@ -63,11 +63,9 @@ use crate::arrow::array::{ use crate::arrow::datatypes::DataType; use crate::config::ConfigOptions; -use crate::datasource::get_col_stats; use crate::datasource::physical_plan::{ FileGroupDisplay, FileMeta, FileSinkConfig, ParquetExec, SchemaAdapter, }; - use crate::error::Result; use crate::execution::context::SessionState; use crate::physical_plan::expressions::{MaxAccumulator, MinAccumulator}; diff --git a/datafusion/core/src/datasource/mod.rs b/datafusion/core/src/datasource/mod.rs index 3ace2c239852..48e9d6992124 100644 --- 
a/datafusion/core/src/datasource/mod.rs +++ b/datafusion/core/src/datasource/mod.rs @@ -42,6 +42,4 @@ pub use self::memory::MemTable; pub use self::provider::TableProvider; pub use self::view::ViewTable; pub use crate::logical_expr::TableType; -#[cfg(feature = "parquet")] -pub(crate) use statistics::get_col_stats; pub use statistics::get_statistics_with_limit; diff --git a/datafusion/core/src/datasource/physical_plan/mod.rs b/datafusion/core/src/datasource/physical_plan/mod.rs index 3f84f87eb5d5..6643e4127dbd 100644 --- a/datafusion/core/src/datasource/physical_plan/mod.rs +++ b/datafusion/core/src/datasource/physical_plan/mod.rs @@ -30,8 +30,6 @@ pub(crate) use self::csv::plan_to_csv; pub use self::csv::{CsvConfig, CsvExec, CsvOpener}; pub(crate) use self::json::plan_to_json; #[cfg(feature = "parquet")] -pub(crate) use self::parquet::plan_to_parquet; -#[cfg(feature = "parquet")] pub use self::parquet::{ParquetExec, ParquetFileMetrics, ParquetFileReaderFactory}; pub use arrow_file::ArrowExec; diff --git a/datafusion/core/src/execution/context/parquet.rs b/datafusion/core/src/execution/context/parquet.rs index b02576c6a868..dc202b9903f5 100644 --- a/datafusion/core/src/execution/context/parquet.rs +++ b/datafusion/core/src/execution/context/parquet.rs @@ -17,10 +17,9 @@ use std::sync::Arc; +use crate::datasource::physical_plan::parquet::plan_to_parquet; use parquet::file::properties::WriterProperties; -use crate::datasource::physical_plan::plan_to_parquet; - use super::super::options::{ParquetReadOptions, ReadOptions}; use super::{DataFilePaths, DataFrame, ExecutionPlan, Result, SessionContext}; From ae9a446c41dfa515eb454561c160ad9aa26a7117 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 26 Oct 2023 10:33:09 -0400 Subject: [PATCH 20/32] Minor: reduce use of cfg(parquet) in tests (#7930) --- .../enforce_distribution.rs | 56 +------------------ 1 file changed, 3 insertions(+), 53 deletions(-) diff --git a/datafusion/core/src/physical_optimizer/enforce_distribution.rs b/datafusion/core/src/physical_optimizer/enforce_distribution.rs index 072c3cb6d7a6..7b91dce32aa9 100644 --- a/datafusion/core/src/physical_optimizer/enforce_distribution.rs +++ b/datafusion/core/src/physical_optimizer/enforce_distribution.rs @@ -1634,6 +1634,8 @@ impl TreeNode for PlanWithKeyRequirements { } } +/// Since almost all of these tests explicitly use `ParquetExec` they only run with the parquet feature flag on +#[cfg(feature = "parquet")] #[cfg(test)] mod tests { use std::ops::Deref; @@ -1643,7 +1645,6 @@ mod tests { use crate::datasource::listing::PartitionedFile; use crate::datasource::object_store::ObjectStoreUrl; use crate::datasource::physical_plan::FileScanConfig; - #[cfg(feature = "parquet")] use crate::datasource::physical_plan::ParquetExec; use crate::physical_optimizer::enforce_sorting::EnforceSorting; use crate::physical_optimizer::output_requirements::OutputRequirements; @@ -1783,12 +1784,10 @@ mod tests { ])) } - #[cfg(feature = "parquet")] fn parquet_exec() -> Arc { parquet_exec_with_sort(vec![]) } - #[cfg(feature = "parquet")] fn parquet_exec_with_sort( output_ordering: Vec>, ) -> Arc { @@ -1809,13 +1808,11 @@ mod tests { )) } - #[cfg(feature = "parquet")] fn parquet_exec_multiple() -> Arc { parquet_exec_multiple_sorted(vec![]) } // Created a sorted parquet exec with multiple files - #[cfg(feature = "parquet")] fn parquet_exec_multiple_sorted( output_ordering: Vec>, ) -> Arc { @@ -2170,7 +2167,6 @@ mod tests { } #[test] - #[cfg(feature = "parquet")] fn multi_hash_joins() -> Result<()> { let left = 
parquet_exec(); let alias_pairs: Vec<(String, String)> = vec![ @@ -2333,7 +2329,6 @@ mod tests { } #[test] - #[cfg(feature = "parquet")] fn multi_joins_after_alias() -> Result<()> { let left = parquet_exec(); let right = parquet_exec(); @@ -2413,7 +2408,6 @@ mod tests { } #[test] - #[cfg(feature = "parquet")] fn multi_joins_after_multi_alias() -> Result<()> { let left = parquet_exec(); let right = parquet_exec(); @@ -2469,7 +2463,6 @@ mod tests { } #[test] - #[cfg(feature = "parquet")] fn join_after_agg_alias() -> Result<()> { // group by (a as a1) let left = aggregate_exec_with_alias( @@ -2509,7 +2502,6 @@ mod tests { } #[test] - #[cfg(feature = "parquet")] fn hash_join_key_ordering() -> Result<()> { // group by (a as a1, b as b1) let left = aggregate_exec_with_alias( @@ -2562,7 +2554,6 @@ mod tests { } #[test] - #[cfg(feature = "parquet")] fn multi_hash_join_key_ordering() -> Result<()> { let left = parquet_exec(); let alias_pairs: Vec<(String, String)> = vec![ @@ -2679,7 +2670,6 @@ mod tests { } #[test] - #[cfg(feature = "parquet")] fn reorder_join_keys_to_left_input() -> Result<()> { let left = parquet_exec(); let alias_pairs: Vec<(String, String)> = vec![ @@ -2810,7 +2800,6 @@ mod tests { } #[test] - #[cfg(feature = "parquet")] fn reorder_join_keys_to_right_input() -> Result<()> { let left = parquet_exec(); let alias_pairs: Vec<(String, String)> = vec![ @@ -2936,7 +2925,6 @@ mod tests { } #[test] - #[cfg(feature = "parquet")] fn multi_smj_joins() -> Result<()> { let left = parquet_exec(); let alias_pairs: Vec<(String, String)> = vec![ @@ -3210,7 +3198,6 @@ mod tests { } #[test] - #[cfg(feature = "parquet")] fn smj_join_key_ordering() -> Result<()> { // group by (a as a1, b as b1) let left = aggregate_exec_with_alias( @@ -3306,7 +3293,6 @@ mod tests { } #[test] - #[cfg(feature = "parquet")] fn merge_does_not_need_sort() -> Result<()> { // see https://github.com/apache/arrow-datafusion/issues/4331 let schema = schema(); @@ -3347,7 +3333,6 @@ mod tests { } #[test] - #[cfg(feature = "parquet")] fn union_to_interleave() -> Result<()> { // group by (a as a1) let left = aggregate_exec_with_alias( @@ -3389,7 +3374,6 @@ mod tests { } #[test] - #[cfg(feature = "parquet")] fn added_repartition_to_single_partition() -> Result<()> { let alias = vec![("a".to_string(), "a".to_string())]; let plan = aggregate_exec_with_alias(parquet_exec(), alias); @@ -3408,7 +3392,6 @@ mod tests { } #[test] - #[cfg(feature = "parquet")] fn repartition_deepest_node() -> Result<()> { let alias = vec![("a".to_string(), "a".to_string())]; let plan = aggregate_exec_with_alias(filter_exec(parquet_exec()), alias); @@ -3428,7 +3411,7 @@ mod tests { } #[test] - #[cfg(feature = "parquet")] + fn repartition_unsorted_limit() -> Result<()> { let plan = limit_exec(filter_exec(parquet_exec())); @@ -3448,7 +3431,6 @@ mod tests { } #[test] - #[cfg(feature = "parquet")] fn repartition_sorted_limit() -> Result<()> { let schema = schema(); let sort_key = vec![PhysicalSortExpr { @@ -3471,7 +3453,6 @@ mod tests { } #[test] - #[cfg(feature = "parquet")] fn repartition_sorted_limit_with_filter() -> Result<()> { let schema = schema(); let sort_key = vec![PhysicalSortExpr { @@ -3497,7 +3478,6 @@ mod tests { } #[test] - #[cfg(feature = "parquet")] fn repartition_ignores_limit() -> Result<()> { let alias = vec![("a".to_string(), "a".to_string())]; let plan = aggregate_exec_with_alias( @@ -3528,7 +3508,6 @@ mod tests { } #[test] - #[cfg(feature = "parquet")] fn repartition_ignores_union() -> Result<()> { let plan = 
union_exec(vec![parquet_exec(); 5]); @@ -3548,7 +3527,6 @@ mod tests { } #[test] - #[cfg(feature = "parquet")] fn repartition_through_sort_preserving_merge() -> Result<()> { // sort preserving merge with non-sorted input let schema = schema(); @@ -3571,7 +3549,6 @@ mod tests { } #[test] - #[cfg(feature = "parquet")] fn repartition_ignores_sort_preserving_merge() -> Result<()> { // sort preserving merge already sorted input, let schema = schema(); @@ -3603,7 +3580,6 @@ mod tests { } #[test] - #[cfg(feature = "parquet")] fn repartition_ignores_sort_preserving_merge_with_union() -> Result<()> { // 2 sorted parquet files unioned (partitions are concatenated, sort is preserved) let schema = schema(); @@ -3636,7 +3612,6 @@ mod tests { } #[test] - #[cfg(feature = "parquet")] fn repartition_does_not_destroy_sort() -> Result<()> { // SortRequired // Parquet(sorted) @@ -3662,7 +3637,6 @@ mod tests { } #[test] - #[cfg(feature = "parquet")] fn repartition_does_not_destroy_sort_more_complex() -> Result<()> { // model a more complicated scenario where one child of a union can be repartitioned for performance // but the other can not be @@ -3701,7 +3675,6 @@ mod tests { } #[test] - #[cfg(feature = "parquet")] fn repartition_transitively_with_projection() -> Result<()> { let schema = schema(); let proj_exprs = vec![( @@ -3744,7 +3717,6 @@ mod tests { } #[test] - #[cfg(feature = "parquet")] fn repartition_ignores_transitively_with_projection() -> Result<()> { let schema = schema(); let sort_key = vec![PhysicalSortExpr { @@ -3775,7 +3747,6 @@ mod tests { } #[test] - #[cfg(feature = "parquet")] fn repartition_transitively_past_sort_with_projection() -> Result<()> { let schema = schema(); let sort_key = vec![PhysicalSortExpr { @@ -3805,7 +3776,6 @@ mod tests { } #[test] - #[cfg(feature = "parquet")] fn repartition_transitively_past_sort_with_filter() -> Result<()> { let schema = schema(); let sort_key = vec![PhysicalSortExpr { @@ -3880,7 +3850,6 @@ mod tests { } #[test] - #[cfg(feature = "parquet")] fn parallelization_single_partition() -> Result<()> { let alias = vec![("a".to_string(), "a".to_string())]; let plan_parquet = aggregate_exec_with_alias(parquet_exec(), alias.clone()); @@ -3969,7 +3938,6 @@ mod tests { } #[test] - #[cfg(feature = "parquet")] fn parallelization_two_partitions() -> Result<()> { let alias = vec![("a".to_string(), "a".to_string())]; let plan_parquet = @@ -3997,7 +3965,6 @@ mod tests { } #[test] - #[cfg(feature = "parquet")] fn parallelization_two_partitions_into_four() -> Result<()> { let alias = vec![("a".to_string(), "a".to_string())]; let plan_parquet = @@ -4025,7 +3992,6 @@ mod tests { } #[test] - #[cfg(feature = "parquet")] fn parallelization_sorted_limit() -> Result<()> { let schema = schema(); let sort_key = vec![PhysicalSortExpr { @@ -4058,7 +4024,6 @@ mod tests { } #[test] - #[cfg(feature = "parquet")] fn parallelization_limit_with_filter() -> Result<()> { let schema = schema(); let sort_key = vec![PhysicalSortExpr { @@ -4104,7 +4069,6 @@ mod tests { } #[test] - #[cfg(feature = "parquet")] fn parallelization_ignores_limit() -> Result<()> { let alias = vec![("a".to_string(), "a".to_string())]; let plan_parquet = aggregate_exec_with_alias( @@ -4155,7 +4119,6 @@ mod tests { } #[test] - #[cfg(feature = "parquet")] fn parallelization_union_inputs() -> Result<()> { let plan_parquet = union_exec(vec![parquet_exec(); 5]); let plan_csv = union_exec(vec![csv_exec(); 5]); @@ -4185,7 +4148,6 @@ mod tests { } #[test] - #[cfg(feature = "parquet")] fn 
parallelization_prior_to_sort_preserving_merge() -> Result<()> { let schema = schema(); let sort_key = vec![PhysicalSortExpr { @@ -4216,7 +4178,6 @@ mod tests { } #[test] - #[cfg(feature = "parquet")] fn parallelization_sort_preserving_merge_with_union() -> Result<()> { let schema = schema(); let sort_key = vec![PhysicalSortExpr { @@ -4251,7 +4212,6 @@ mod tests { } #[test] - #[cfg(feature = "parquet")] fn parallelization_does_not_benefit() -> Result<()> { let schema = schema(); let sort_key = vec![PhysicalSortExpr { @@ -4280,7 +4240,6 @@ mod tests { } #[test] - #[cfg(feature = "parquet")] fn parallelization_ignores_transitively_with_projection_parquet() -> Result<()> { // sorted input let schema = schema(); @@ -4361,7 +4320,6 @@ mod tests { } #[test] - #[cfg(feature = "parquet")] fn remove_redundant_roundrobins() -> Result<()> { let input = parquet_exec(); let repartition = repartition_exec(repartition_exec(input)); @@ -4412,7 +4370,6 @@ mod tests { } #[test] - #[cfg(feature = "parquet")] fn do_not_preserve_ordering_through_repartition() -> Result<()> { let schema = schema(); let sort_key = vec![PhysicalSortExpr { @@ -4445,7 +4402,6 @@ mod tests { } #[test] - #[cfg(feature = "parquet")] fn do_not_preserve_ordering_through_repartition2() -> Result<()> { let schema = schema(); let sort_key = vec![PhysicalSortExpr { @@ -4484,7 +4440,6 @@ mod tests { } #[test] - #[cfg(feature = "parquet")] fn do_not_preserve_ordering_through_repartition3() -> Result<()> { let schema = schema(); let sort_key = vec![PhysicalSortExpr { @@ -4507,7 +4462,6 @@ mod tests { } #[test] - #[cfg(feature = "parquet")] fn do_not_put_sort_when_input_is_invalid() -> Result<()> { let schema = schema(); let sort_key = vec![PhysicalSortExpr { @@ -4546,7 +4500,6 @@ mod tests { } #[test] - #[cfg(feature = "parquet")] fn put_sort_when_input_is_valid() -> Result<()> { let schema = schema(); let sort_key = vec![PhysicalSortExpr { @@ -4589,7 +4542,6 @@ mod tests { } #[test] - #[cfg(feature = "parquet")] fn do_not_add_unnecessary_hash() -> Result<()> { let schema = schema(); let sort_key = vec![PhysicalSortExpr { @@ -4645,7 +4597,6 @@ mod tests { } #[test] - #[cfg(feature = "parquet")] fn optimize_away_unnecessary_repartition() -> Result<()> { let physical_plan = coalesce_partitions_exec(repartition_exec(parquet_exec())); let expected = &[ @@ -4665,7 +4616,6 @@ mod tests { } #[test] - #[cfg(feature = "parquet")] fn optimize_away_unnecessary_repartition2() -> Result<()> { let physical_plan = filter_exec(repartition_exec(coalesce_partitions_exec( filter_exec(repartition_exec(parquet_exec())), From 30e5f4286dffaed3533a5dc6348dd91e98cf461a Mon Sep 17 00:00:00 2001 From: comphead Date: Thu, 26 Oct 2023 12:11:25 -0700 Subject: [PATCH 21/32] Fix CI failures on `to_timestamp()` calls (#7941) * Change input for `to_timestamp` function * docs * fix examples * output `to_timestamp` signature as ns * Fix CI `to_timestamp()` failed * Update datafusion/expr/src/built_in_function.rs Co-authored-by: Andrew Lamb * fix typo * fix --------- Co-authored-by: Andrew Lamb --- datafusion/expr/src/built_in_function.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/datafusion/expr/src/built_in_function.rs b/datafusion/expr/src/built_in_function.rs index 9edee7649f67..4db565abfcf7 100644 --- a/datafusion/expr/src/built_in_function.rs +++ b/datafusion/expr/src/built_in_function.rs @@ -755,7 +755,10 @@ impl BuiltinScalarFunction { return plan_err!("The to_hex function can only accept integers."); } }), - BuiltinScalarFunction::ToTimestamp 
=> Ok(Timestamp(Nanosecond, None)), + BuiltinScalarFunction::ToTimestamp => Ok(match &input_expr_types[0] { + Int64 => Timestamp(Second, None), + _ => Timestamp(Nanosecond, None), + }), BuiltinScalarFunction::ToTimestampMillis => Ok(Timestamp(Millisecond, None)), BuiltinScalarFunction::ToTimestampMicros => Ok(Timestamp(Microsecond, None)), BuiltinScalarFunction::ToTimestampNanos => Ok(Timestamp(Nanosecond, None)), From a9d66e2b492843c2fb335a7dfe27fed073629b09 Mon Sep 17 00:00:00 2001 From: Jonah Gao Date: Fri, 27 Oct 2023 03:14:19 +0800 Subject: [PATCH 22/32] minor: add a datatype casting for the updated value (#7922) * minor: cast the updated value to the data type of target column * Update datafusion/sqllogictest/test_files/update.slt Co-authored-by: Alex Huang * Update datafusion/sqllogictest/test_files/update.slt Co-authored-by: Alex Huang * Update datafusion/sqllogictest/test_files/update.slt Co-authored-by: Alex Huang * fix tests --------- Co-authored-by: Alex Huang --- datafusion/sql/src/statement.rs | 41 ++++++++---------- datafusion/sqllogictest/test_files/update.slt | 43 +++++++++++++++++++ 2 files changed, 61 insertions(+), 23 deletions(-) create mode 100644 datafusion/sqllogictest/test_files/update.slt diff --git a/datafusion/sql/src/statement.rs b/datafusion/sql/src/statement.rs index f8504a487a66..b5196c086638 100644 --- a/datafusion/sql/src/statement.rs +++ b/datafusion/sql/src/statement.rs @@ -31,9 +31,9 @@ use arrow_schema::DataType; use datafusion_common::file_options::StatementOptions; use datafusion_common::parsers::CompressionTypeVariant; use datafusion_common::{ - not_impl_err, plan_datafusion_err, plan_err, unqualified_field_not_found, Column, - Constraints, DFField, DFSchema, DFSchemaRef, DataFusionError, ExprSchema, - OwnedTableReference, Result, SchemaReference, TableReference, ToDFSchema, + not_impl_err, plan_datafusion_err, plan_err, unqualified_field_not_found, + Constraints, DFField, DFSchema, DFSchemaRef, DataFusionError, OwnedTableReference, + Result, SchemaReference, TableReference, ToDFSchema, }; use datafusion_expr::dml::{CopyOptions, CopyTo}; use datafusion_expr::expr::Placeholder; @@ -969,12 +969,6 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { table_name.clone(), &arrow_schema, )?); - let values = table_schema.fields().iter().map(|f| { - ( - f.name().clone(), - ast::Expr::Identifier(ast::Ident::from(f.name().as_str())), - ) - }); // Overwrite with assignment expressions let mut planner_context = PlannerContext::new(); @@ -992,11 +986,15 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { }) .collect::>>()?; - let values = values - .into_iter() - .map(|(k, v)| { - let val = assign_map.remove(&k).unwrap_or(v); - (k, val) + let values_and_types = table_schema + .fields() + .iter() + .map(|f| { + let col_name = f.name(); + let val = assign_map.remove(col_name).unwrap_or_else(|| { + ast::Expr::Identifier(ast::Ident::from(col_name.as_str())) + }); + (col_name, val, f.data_type()) }) .collect::>(); @@ -1026,25 +1024,22 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { // Projection let mut exprs = vec![]; - for (col_name, expr) in values.into_iter() { + for (col_name, expr, dt) in values_and_types.into_iter() { let expr = self.sql_to_expr(expr, &table_schema, &mut planner_context)?; let expr = match expr { datafusion_expr::Expr::Placeholder(Placeholder { ref id, ref data_type, }) => match data_type { - None => { - let dt = table_schema.data_type(&Column::from_name(&col_name))?; - datafusion_expr::Expr::Placeholder(Placeholder::new( - id.clone(), - 
Some(dt.clone()), - )) - } + None => datafusion_expr::Expr::Placeholder(Placeholder::new( + id.clone(), + Some(dt.clone()), + )), Some(_) => expr, }, _ => expr, }; - let expr = expr.alias(col_name); + let expr = expr.cast_to(dt, source.schema())?.alias(col_name); exprs.push(expr); } let source = project(source, exprs)?; diff --git a/datafusion/sqllogictest/test_files/update.slt b/datafusion/sqllogictest/test_files/update.slt new file mode 100644 index 000000000000..4542a262390c --- /dev/null +++ b/datafusion/sqllogictest/test_files/update.slt @@ -0,0 +1,43 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +########## +## Update Tests +########## + +statement ok +create table t1(a int, b varchar, c double, d int); + +# Turn off the optimizer to make the logical plan closer to the initial one +statement ok +set datafusion.optimizer.max_passes = 0; + +query TT +explain update t1 set a=1, b=2, c=3.0, d=NULL; +---- +logical_plan +Dml: op=[Update] table=[t1] +--Projection: CAST(Int64(1) AS Int32) AS a, CAST(Int64(2) AS Utf8) AS b, Float64(3) AS c, CAST(NULL AS Int32) AS d +----TableScan: t1 + +query TT +explain update t1 set a=c+1, b=a, c=c+1.0, d=b; +---- +logical_plan +Dml: op=[Update] table=[t1] +--Projection: CAST(t1.c + CAST(Int64(1) AS Float64) AS Int32) AS a, CAST(t1.a AS Utf8) AS b, t1.c + Float64(1) AS c, CAST(t1.b AS Int32) AS d +----TableScan: t1 From 74fc6f8c8e113d4d23b11a8969c92c956629c5a6 Mon Sep 17 00:00:00 2001 From: Huaijin Date: Sat, 28 Oct 2023 05:06:38 +0800 Subject: [PATCH 23/32] fix (#7946) --- datafusion-examples/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion-examples/Cargo.toml b/datafusion-examples/Cargo.toml index e5146c7fd94e..8d504f834bc5 100644 --- a/datafusion-examples/Cargo.toml +++ b/datafusion-examples/Cargo.toml @@ -36,7 +36,7 @@ arrow-schema = { workspace = true } async-trait = "0.1.41" bytes = "1.4" dashmap = "5.4" -datafusion = { path = "../datafusion/core" } +datafusion = { path = "../datafusion/core", features = ["avro"] } datafusion-common = { path = "../datafusion/common" } datafusion-expr = { path = "../datafusion/expr" } datafusion-optimizer = { path = "../datafusion/optimizer" } From 46ae9a4756aaa56440748c0c529fa312aca23e96 Mon Sep 17 00:00:00 2001 From: Jeffrey <22608443+Jefffrey@users.noreply.github.com> Date: Sat, 28 Oct 2023 20:40:13 +1100 Subject: [PATCH 24/32] Add simple exclude all columns test to sqllogictest (#7945) * Add simple exclude all columns test to sqllogictest * Add more exclude test cases --- datafusion/sqllogictest/test_files/select.slt | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/datafusion/sqllogictest/test_files/select.slt b/datafusion/sqllogictest/test_files/select.slt index 1d427479763a..98ea061c731b 100644 --- 
a/datafusion/sqllogictest/test_files/select.slt +++ b/datafusion/sqllogictest/test_files/select.slt @@ -848,6 +848,26 @@ statement error DataFusion error: Error during planning: EXCLUDE or EXCEPT conta SELECT * EXCLUDE(a, a) FROM table1 +# if EXCEPT all the columns, query should still succeed but return empty +statement ok +SELECT * EXCEPT(a, b, c, d) +FROM table1 + +# EXCLUDE order shouldn't matter +query II +SELECT * EXCLUDE(b, a) +FROM table1 +ORDER BY c +LIMIT 5 +---- +100 1000 +200 2000 + +# EXCLUDE with out of order but duplicate columns should error +statement error DataFusion error: Error during planning: EXCLUDE or EXCEPT contains duplicate column names +SELECT * EXCLUDE(d, b, c, a, a, b, c, d) +FROM table1 + # run below query in multi partitions statement ok set datafusion.execution.target_partitions = 2; From 250e71694b4f9789444e52b8cc12476dcbf35ac6 Mon Sep 17 00:00:00 2001 From: Devin D'Angelo Date: Sat, 28 Oct 2023 06:04:47 -0400 Subject: [PATCH 25/32] Support Partitioning Data by Dictionary Encoded String Array Types (#7896) * support dictionary encoded string columns for partition cols * remove debug prints * cargo fmt * generic dictionary cast and dict encoded test * updates from review * force retry checks * try checks again --- datafusion/common/src/dfschema.rs | 91 +++++++++++++++++++ .../src/datasource/file_format/write/demux.rs | 18 +++- .../core/src/datasource/listing/table.rs | 5 +- datafusion/core/src/datasource/memory.rs | 5 +- .../test_files/insert_to_external.slt | 38 +++++++- 5 files changed, 153 insertions(+), 4 deletions(-) diff --git a/datafusion/common/src/dfschema.rs b/datafusion/common/src/dfschema.rs index e16acbfedc81..d8cd103a4777 100644 --- a/datafusion/common/src/dfschema.rs +++ b/datafusion/common/src/dfschema.rs @@ -391,11 +391,33 @@ impl DFSchema { }) } + /// Returns true if the two schemas have the same qualified named + /// fields with logically equivalent data types. Returns false otherwise. + /// + /// Use [DFSchema]::equivalent_names_and_types for stricter semantic type + /// equivalence checking. + pub fn logically_equivalent_names_and_types(&self, other: &Self) -> bool { + if self.fields().len() != other.fields().len() { + return false; + } + let self_fields = self.fields().iter(); + let other_fields = other.fields().iter(); + self_fields.zip(other_fields).all(|(f1, f2)| { + f1.qualifier() == f2.qualifier() + && f1.name() == f2.name() + && Self::datatype_is_logically_equal(f1.data_type(), f2.data_type()) + }) + } + /// Returns true if the two schemas have the same qualified named /// fields with the same data types. Returns false otherwise. /// /// This is a specialized version of Eq that ignores differences /// in nullability and metadata. + /// + /// Use [DFSchema]::logically_equivalent_names_and_types for a weaker + /// logical type checking, which for example would consider a dictionary + /// encoded UTF8 array to be equivalent to a plain UTF8 array. pub fn equivalent_names_and_types(&self, other: &Self) -> bool { if self.fields().len() != other.fields().len() { return false; @@ -409,6 +431,46 @@ impl DFSchema { }) } + /// Checks if two [`DataType`]s are logically equal. This is a notably weaker constraint + /// than datatype_is_semantically_equal in that a Dictionary type is logically + /// equal to a plain V type, but not semantically equal. Dictionary is also + /// logically equal to Dictionary. 
+ fn datatype_is_logically_equal(dt1: &DataType, dt2: &DataType) -> bool { + // check nested fields + match (dt1, dt2) { + (DataType::Dictionary(_, v1), DataType::Dictionary(_, v2)) => { + v1.as_ref() == v2.as_ref() + } + (DataType::Dictionary(_, v1), othertype) => v1.as_ref() == othertype, + (othertype, DataType::Dictionary(_, v1)) => v1.as_ref() == othertype, + (DataType::List(f1), DataType::List(f2)) + | (DataType::LargeList(f1), DataType::LargeList(f2)) + | (DataType::FixedSizeList(f1, _), DataType::FixedSizeList(f2, _)) + | (DataType::Map(f1, _), DataType::Map(f2, _)) => { + Self::field_is_logically_equal(f1, f2) + } + (DataType::Struct(fields1), DataType::Struct(fields2)) => { + let iter1 = fields1.iter(); + let iter2 = fields2.iter(); + fields1.len() == fields2.len() && + // all fields have to be the same + iter1 + .zip(iter2) + .all(|(f1, f2)| Self::field_is_logically_equal(f1, f2)) + } + (DataType::Union(fields1, _), DataType::Union(fields2, _)) => { + let iter1 = fields1.iter(); + let iter2 = fields2.iter(); + fields1.len() == fields2.len() && + // all fields have to be the same + iter1 + .zip(iter2) + .all(|((t1, f1), (t2, f2))| t1 == t2 && Self::field_is_logically_equal(f1, f2)) + } + _ => dt1 == dt2, + } + } + /// Returns true of two [`DataType`]s are semantically equal (same /// name and type), ignoring both metadata and nullability. /// @@ -456,6 +518,11 @@ impl DFSchema { } } + fn field_is_logically_equal(f1: &Field, f2: &Field) -> bool { + f1.name() == f2.name() + && Self::datatype_is_logically_equal(f1.data_type(), f2.data_type()) + } + fn field_is_semantically_equal(f1: &Field, f2: &Field) -> bool { f1.name() == f2.name() && Self::datatype_is_semantically_equal(f1.data_type(), f2.data_type()) @@ -786,6 +853,13 @@ pub trait SchemaExt { /// /// It works the same as [`DFSchema::equivalent_names_and_types`]. fn equivalent_names_and_types(&self, other: &Self) -> bool; + + /// Returns true if the two schemas have the same qualified named + /// fields with logically equivalent data types. Returns false otherwise. + /// + /// Use [DFSchema]::equivalent_names_and_types for stricter semantic type + /// equivalence checking. 
+ fn logically_equivalent_names_and_types(&self, other: &Self) -> bool; } impl SchemaExt for Schema { @@ -805,6 +879,23 @@ impl SchemaExt for Schema { ) }) } + + fn logically_equivalent_names_and_types(&self, other: &Self) -> bool { + if self.fields().len() != other.fields().len() { + return false; + } + + self.fields() + .iter() + .zip(other.fields().iter()) + .all(|(f1, f2)| { + f1.name() == f2.name() + && DFSchema::datatype_is_logically_equal( + f1.data_type(), + f2.data_type(), + ) + }) + } } #[cfg(test)] diff --git a/datafusion/core/src/datasource/file_format/write/demux.rs b/datafusion/core/src/datasource/file_format/write/demux.rs index 67dd1f940676..27c65dd459ec 100644 --- a/datafusion/core/src/datasource/file_format/write/demux.rs +++ b/datafusion/core/src/datasource/file_format/write/demux.rs @@ -29,7 +29,7 @@ use crate::physical_plan::SendableRecordBatchStream; use arrow_array::builder::UInt64Builder; use arrow_array::cast::AsArray; -use arrow_array::{RecordBatch, StructArray}; +use arrow_array::{downcast_dictionary_array, RecordBatch, StringArray, StructArray}; use arrow_schema::{DataType, Schema}; use datafusion_common::cast::as_string_array; use datafusion_common::DataFusionError; @@ -338,6 +338,22 @@ fn compute_partition_keys_by_row<'a>( partition_values.push(array.value(i)); } } + DataType::Dictionary(_, _) => { + downcast_dictionary_array!( + col_array => { + let array = col_array.downcast_dict::() + .ok_or(DataFusionError::Execution(format!("it is not yet supported to write to hive partitions with datatype {}", + dtype)))?; + + for val in array.values() { + partition_values.push( + val.ok_or(DataFusionError::Execution(format!("Cannot partition by null value for column {}", col)))? + ); + } + }, + _ => unreachable!(), + ) + } _ => { return Err(DataFusionError::NotImplemented(format!( "it is not yet supported to write to hive partitions with datatype {}", diff --git a/datafusion/core/src/datasource/listing/table.rs b/datafusion/core/src/datasource/listing/table.rs index 822a78a5522a..d26d417bd8b2 100644 --- a/datafusion/core/src/datasource/listing/table.rs +++ b/datafusion/core/src/datasource/listing/table.rs @@ -824,7 +824,10 @@ impl TableProvider for ListingTable { overwrite: bool, ) -> Result> { // Check that the schema of the plan matches the schema of this table. - if !self.schema().equivalent_names_and_types(&input.schema()) { + if !self + .schema() + .logically_equivalent_names_and_types(&input.schema()) + { return plan_err!( // Return an error if schema of the input query does not match with the table schema. "Inserting query must have the same schema with the table." diff --git a/datafusion/core/src/datasource/memory.rs b/datafusion/core/src/datasource/memory.rs index a2f8e225e121..6bcaa97a408f 100644 --- a/datafusion/core/src/datasource/memory.rs +++ b/datafusion/core/src/datasource/memory.rs @@ -209,7 +209,10 @@ impl TableProvider for MemTable { ) -> Result> { // Create a physical plan from the logical plan. // Check that the schema of the plan matches the schema of this table. - if !self.schema().equivalent_names_and_types(&input.schema()) { + if !self + .schema() + .logically_equivalent_names_and_types(&input.schema()) + { return plan_err!( "Inserting query must have the same schema with the table." 
); diff --git a/datafusion/sqllogictest/test_files/insert_to_external.slt b/datafusion/sqllogictest/test_files/insert_to_external.slt index b2206e987864..8b01a14568e7 100644 --- a/datafusion/sqllogictest/test_files/insert_to_external.slt +++ b/datafusion/sqllogictest/test_files/insert_to_external.slt @@ -40,8 +40,44 @@ STORED AS CSV WITH HEADER ROW LOCATION '../../testing/data/csv/aggregate_test_100.csv' -# test_insert_into +statement ok +create table dictionary_encoded_values as values +('a', arrow_cast('foo', 'Dictionary(Int32, Utf8)')), ('b', arrow_cast('bar', 'Dictionary(Int32, Utf8)')); + +query TTT +describe dictionary_encoded_values; +---- +column1 Utf8 YES +column2 Dictionary(Int32, Utf8) YES + +statement ok +CREATE EXTERNAL TABLE dictionary_encoded_parquet_partitioned( + a varchar, + b varchar, +) +STORED AS parquet +LOCATION 'test_files/scratch/insert_to_external/parquet_types_partitioned' +PARTITIONED BY (b) +OPTIONS( +create_local_path 'true', +insert_mode 'append_new_files', +); + +query TT +insert into dictionary_encoded_parquet_partitioned +select * from dictionary_encoded_values +---- +2 + +query TT +select * from dictionary_encoded_parquet_partitioned order by (a); +---- +a foo +b bar + + +# test_insert_into statement ok set datafusion.execution.target_partitions = 8; From d28c79d1809fd545bd4f3e15951267b7b1ec222c Mon Sep 17 00:00:00 2001 From: Jay Zhan Date: Sat, 28 Oct 2023 18:06:29 +0800 Subject: [PATCH 26/32] Minor: Remove array() in array_expression (#7961) * remove array Signed-off-by: jayzhan211 * cleanup others Signed-off-by: jayzhan211 * clippy Signed-off-by: jayzhan211 * cleanup cast Signed-off-by: jayzhan211 * fmt Signed-off-by: jayzhan211 * cleanup cast Signed-off-by: jayzhan211 --------- Signed-off-by: jayzhan211 --- .../physical-expr/src/array_expressions.rs | 391 ++++++++---------- 1 file changed, 172 insertions(+), 219 deletions(-) diff --git a/datafusion/physical-expr/src/array_expressions.rs b/datafusion/physical-expr/src/array_expressions.rs index af4612272676..7077f8b59860 100644 --- a/datafusion/physical-expr/src/array_expressions.rs +++ b/datafusion/physical-expr/src/array_expressions.rs @@ -25,12 +25,14 @@ use arrow::buffer::OffsetBuffer; use arrow::compute; use arrow::datatypes::{DataType, Field, UInt64Type}; use arrow_buffer::NullBuffer; -use datafusion_common::cast::{as_generic_string_array, as_int64_array, as_list_array}; + +use datafusion_common::cast::{ + as_generic_string_array, as_int64_array, as_list_array, as_string_array, +}; use datafusion_common::utils::wrap_into_list_array; use datafusion_common::{ exec_err, internal_err, not_impl_err, plan_err, DataFusionError, Result, }; -use datafusion_expr::ColumnarValue; use itertools::Itertools; @@ -395,21 +397,10 @@ fn array_array(args: &[ArrayRef], data_type: DataType) -> Result { Ok(res) } -/// Convert one or more [`ColumnarValue`] of the same type into a -/// `ListArray` -/// -/// See [`array_array`] for more details. 
-fn array(values: &[ColumnarValue]) -> Result { - let arrays: Vec = values - .iter() - .map(|x| match x { - ColumnarValue::Array(array) => array.clone(), - ColumnarValue::Scalar(scalar) => scalar.to_array(), - }) - .collect(); - +/// `make_array` SQL function +pub fn make_array(arrays: &[ArrayRef]) -> Result { let mut data_type = DataType::Null; - for arg in &arrays { + for arg in arrays { let arg_data_type = arg.data_type(); if !arg_data_type.equals_datatype(&DataType::Null) { data_type = arg_data_type.clone(); @@ -423,19 +414,10 @@ fn array(values: &[ColumnarValue]) -> Result { let array = new_null_array(&DataType::Null, arrays.len()); Ok(Arc::new(wrap_into_list_array(array))) } - data_type => array_array(arrays.as_slice(), data_type), + data_type => array_array(arrays, data_type), } } -/// `make_array` SQL function -pub fn make_array(arrays: &[ArrayRef]) -> Result { - let values: Vec = arrays - .iter() - .map(|x| ColumnarValue::Array(x.clone())) - .collect(); - array(values.as_slice()) -} - fn return_empty(return_null: bool, data_type: DataType) -> Arc { if return_null { new_null_array(&data_type, 1) @@ -654,7 +636,7 @@ pub fn array_append(args: &[ArrayRef]) -> Result { check_datatypes("array_append", &[arr.values(), element])?; let res = match arr.value_type() { DataType::List(_) => concat_internal(args)?, - DataType::Null => return array(&[ColumnarValue::Array(args[1].clone())]), + DataType::Null => return make_array(&[element.to_owned()]), data_type => { macro_rules! array_function { ($ARRAY_TYPE:ident) => { @@ -728,7 +710,7 @@ pub fn array_prepend(args: &[ArrayRef]) -> Result { check_datatypes("array_prepend", &[element, arr.values()])?; let res = match arr.value_type() { DataType::List(_) => concat_internal(args)?, - DataType::Null => return array(&[ColumnarValue::Array(args[0].clone())]), + DataType::Null => return make_array(&[element.to_owned()]), data_type => { macro_rules! array_function { ($ARRAY_TYPE:ident) => { @@ -1479,15 +1461,13 @@ macro_rules! to_string { pub fn array_to_string(args: &[ArrayRef]) -> Result { let arr = &args[0]; - let delimiters = as_generic_string_array::(&args[1])?; + let delimiters = as_string_array(&args[1])?; let delimiters: Vec> = delimiters.iter().collect(); let mut null_string = String::from(""); let mut with_null_string = false; if args.len() == 3 { - null_string = as_generic_string_array::(&args[2])? 
- .value(0) - .to_string(); + null_string = as_string_array(&args[2])?.value(0).to_string(); with_null_string = true; } @@ -1941,29 +1921,23 @@ pub fn string_to_array(args: &[ArrayRef]) -> Result() - .unwrap() + as_int64_array(&result.value(0)) + .expect("failed to cast to primitive array") .values() ) } @@ -1972,29 +1946,23 @@ mod tests { fn test_nested_array() { // make_array([1, 3, 5], [2, 4, 6]) = [[1, 3, 5], [2, 4, 6]] let args = [ - ColumnarValue::Array(Arc::new(Int64Array::from(vec![1, 2]))), - ColumnarValue::Array(Arc::new(Int64Array::from(vec![3, 4]))), - ColumnarValue::Array(Arc::new(Int64Array::from(vec![5, 6]))), + Arc::new(Int64Array::from(vec![1, 2])) as ArrayRef, + Arc::new(Int64Array::from(vec![3, 4])), + Arc::new(Int64Array::from(vec![5, 6])), ]; - let array = array(&args).expect("failed to initialize function array"); + let array = make_array(&args).expect("failed to initialize function array"); let result = as_list_array(&array).expect("failed to initialize function array"); assert_eq!(result.len(), 2); assert_eq!( &[1, 3, 5], - result - .value(0) - .as_any() - .downcast_ref::() - .unwrap() + as_int64_array(&result.value(0)) + .expect("failed to cast to primitive array") .values() ); assert_eq!( &[2, 4, 6], - result - .value(1) - .as_any() - .downcast_ref::() - .unwrap() + as_int64_array(&result.value(1)) + .expect("failed to cast to primitive array") .values() ); } @@ -2002,7 +1970,7 @@ mod tests { #[test] fn test_array_element() { // array_element([1, 2, 3, 4], 1) = 1 - let list_array = return_array().into_array(1); + let list_array = return_array(); let arr = array_element(&[list_array, Arc::new(Int64Array::from_value(1, 1))]) .expect("failed to initialize function array_element"); let result = @@ -2011,7 +1979,7 @@ mod tests { assert_eq!(result, &Int64Array::from_value(1, 1)); // array_element([1, 2, 3, 4], 3) = 3 - let list_array = return_array().into_array(1); + let list_array = return_array(); let arr = array_element(&[list_array, Arc::new(Int64Array::from_value(3, 1))]) .expect("failed to initialize function array_element"); let result = @@ -2020,7 +1988,7 @@ mod tests { assert_eq!(result, &Int64Array::from_value(3, 1)); // array_element([1, 2, 3, 4], 0) = NULL - let list_array = return_array().into_array(1); + let list_array = return_array(); let arr = array_element(&[list_array, Arc::new(Int64Array::from_value(0, 1))]) .expect("failed to initialize function array_element"); let result = @@ -2029,7 +1997,7 @@ mod tests { assert_eq!(result, &Int64Array::from(vec![None])); // array_element([1, 2, 3, 4], NULL) = NULL - let list_array = return_array().into_array(1); + let list_array = return_array(); let arr = array_element(&[list_array, Arc::new(Int64Array::from(vec![None]))]) .expect("failed to initialize function array_element"); let result = @@ -2038,7 +2006,7 @@ mod tests { assert_eq!(result, &Int64Array::from(vec![None])); // array_element([1, 2, 3, 4], -1) = 4 - let list_array = return_array().into_array(1); + let list_array = return_array(); let arr = array_element(&[list_array, Arc::new(Int64Array::from_value(-1, 1))]) .expect("failed to initialize function array_element"); let result = @@ -2047,7 +2015,7 @@ mod tests { assert_eq!(result, &Int64Array::from_value(4, 1)); // array_element([1, 2, 3, 4], -3) = 2 - let list_array = return_array().into_array(1); + let list_array = return_array(); let arr = array_element(&[list_array, Arc::new(Int64Array::from_value(-3, 1))]) .expect("failed to initialize function array_element"); let result = @@ -2056,7 +2024,7 
@@ mod tests { assert_eq!(result, &Int64Array::from_value(2, 1)); // array_element([1, 2, 3, 4], 10) = NULL - let list_array = return_array().into_array(1); + let list_array = return_array(); let arr = array_element(&[list_array, Arc::new(Int64Array::from_value(10, 1))]) .expect("failed to initialize function array_element"); let result = @@ -2068,7 +2036,7 @@ mod tests { #[test] fn test_nested_array_element() { // array_element([[1, 2, 3, 4], [5, 6, 7, 8]], 2) = [5, 6, 7, 8] - let list_array = return_nested_array().into_array(1); + let list_array = return_nested_array(); let arr = array_element(&[list_array, Arc::new(Int64Array::from_value(2, 1))]) .expect("failed to initialize function array_element"); let result = @@ -2088,7 +2056,7 @@ mod tests { #[test] fn test_array_pop_back() { // array_pop_back([1, 2, 3, 4]) = [1, 2, 3] - let list_array = return_array().into_array(1); + let list_array = return_array(); let arr = array_pop_back(&[list_array]) .expect("failed to initialize function array_pop_back"); let result = @@ -2167,7 +2135,7 @@ mod tests { ); // array_pop_back([1, NULL, 3, NULL]) = [1, NULL, 3] - let list_array = return_array_with_nulls().into_array(1); + let list_array = return_array_with_nulls(); let arr = array_pop_back(&[list_array]) .expect("failed to initialize function array_pop_back"); let result = @@ -2185,7 +2153,7 @@ mod tests { #[test] fn test_nested_array_pop_back() { // array_pop_back([[1, 2, 3, 4], [5, 6, 7, 8]]) = [[1, 2, 3, 4]] - let list_array = return_nested_array().into_array(1); + let list_array = return_nested_array(); let arr = array_pop_back(&[list_array]) .expect("failed to initialize function array_slice"); let result = @@ -2233,7 +2201,7 @@ mod tests { #[test] fn test_array_slice() { // array_slice([1, 2, 3, 4], 1, 3) = [1, 2, 3] - let list_array = return_array().into_array(1); + let list_array = return_array(); let arr = array_slice(&[ list_array, Arc::new(Int64Array::from_value(1, 1)), @@ -2254,7 +2222,7 @@ mod tests { ); // array_slice([1, 2, 3, 4], 2, 2) = [2] - let list_array = return_array().into_array(1); + let list_array = return_array(); let arr = array_slice(&[ list_array, Arc::new(Int64Array::from_value(2, 1)), @@ -2275,7 +2243,7 @@ mod tests { ); // array_slice([1, 2, 3, 4], 0, 0) = [] - let list_array = return_array().into_array(1); + let list_array = return_array(); let arr = array_slice(&[ list_array, Arc::new(Int64Array::from_value(0, 1)), @@ -2293,7 +2261,7 @@ mod tests { .is_empty()); // array_slice([1, 2, 3, 4], 0, 6) = [1, 2, 3, 4] - let list_array = return_array().into_array(1); + let list_array = return_array(); let arr = array_slice(&[ list_array, Arc::new(Int64Array::from_value(0, 1)), @@ -2314,7 +2282,7 @@ mod tests { ); // array_slice([1, 2, 3, 4], -2, -2) = [] - let list_array = return_array().into_array(1); + let list_array = return_array(); let arr = array_slice(&[ list_array, Arc::new(Int64Array::from_value(-2, 1)), @@ -2332,7 +2300,7 @@ mod tests { .is_empty()); // array_slice([1, 2, 3, 4], -3, -1) = [2, 3] - let list_array = return_array().into_array(1); + let list_array = return_array(); let arr = array_slice(&[ list_array, Arc::new(Int64Array::from_value(-3, 1)), @@ -2353,7 +2321,7 @@ mod tests { ); // array_slice([1, 2, 3, 4], -3, 2) = [2] - let list_array = return_array().into_array(1); + let list_array = return_array(); let arr = array_slice(&[ list_array, Arc::new(Int64Array::from_value(-3, 1)), @@ -2374,7 +2342,7 @@ mod tests { ); // array_slice([1, 2, 3, 4], 2, 11) = [2, 3, 4] - let list_array = 
return_array().into_array(1); + let list_array = return_array(); let arr = array_slice(&[ list_array, Arc::new(Int64Array::from_value(2, 1)), @@ -2395,7 +2363,7 @@ mod tests { ); // array_slice([1, 2, 3, 4], 3, 1) = [] - let list_array = return_array().into_array(1); + let list_array = return_array(); let arr = array_slice(&[ list_array, Arc::new(Int64Array::from_value(3, 1)), @@ -2413,7 +2381,7 @@ mod tests { .is_empty()); // array_slice([1, 2, 3, 4], -7, -2) = NULL - let list_array = return_array().into_array(1); + let list_array = return_array(); let arr = array_slice(&[ list_array, Arc::new(Int64Array::from_value(-7, 1)), @@ -2434,7 +2402,7 @@ mod tests { #[test] fn test_nested_array_slice() { // array_slice([[1, 2, 3, 4], [5, 6, 7, 8]], 1, 1) = [[1, 2, 3, 4]] - let list_array = return_nested_array().into_array(1); + let list_array = return_nested_array(); let arr = array_slice(&[ list_array, Arc::new(Int64Array::from_value(1, 1)), @@ -2459,7 +2427,7 @@ mod tests { ); // array_slice([[1, 2, 3, 4], [5, 6, 7, 8]], -1, -1) = [] - let list_array = return_nested_array().into_array(1); + let list_array = return_nested_array(); let arr = array_slice(&[ list_array, Arc::new(Int64Array::from_value(-1, 1)), @@ -2477,7 +2445,7 @@ mod tests { .is_empty()); // array_slice([[1, 2, 3, 4], [5, 6, 7, 8]], -1, 2) = [[5, 6, 7, 8]] - let list_array = return_nested_array().into_array(1); + let list_array = return_nested_array(); let arr = array_slice(&[ list_array, Arc::new(Int64Array::from_value(-1, 1)), @@ -2588,7 +2556,7 @@ mod tests { #[test] fn test_nested_array_concat() { // array_concat([1, 2, 3, 4], [1, 2, 3, 4]) = [1, 2, 3, 4, 1, 2, 3, 4] - let list_array = return_array().into_array(1); + let list_array = return_array(); let arr = array_concat(&[list_array.clone(), list_array.clone()]) .expect("failed to initialize function array_concat"); let result = @@ -2605,8 +2573,8 @@ mod tests { ); // array_concat([[1, 2, 3, 4], [5, 6, 7, 8]], [1, 2, 3, 4]) = [[1, 2, 3, 4], [5, 6, 7, 8], [1, 2, 3, 4]] - let list_nested_array = return_nested_array().into_array(1); - let list_array = return_array().into_array(1); + let list_nested_array = return_nested_array(); + let list_array = return_array(); let arr = array_concat(&[list_nested_array, list_array]) .expect("failed to initialize function array_concat"); let result = @@ -2630,7 +2598,7 @@ mod tests { #[test] fn test_array_position() { // array_position([1, 2, 3, 4], 3) = 3 - let list_array = return_array().into_array(1); + let list_array = return_array(); let array = array_position(&[list_array, Arc::new(Int64Array::from_value(3, 1))]) .expect("failed to initialize function array_position"); let result = as_uint64_array(&array) @@ -2642,7 +2610,7 @@ mod tests { #[test] fn test_array_positions() { // array_positions([1, 2, 3, 4], 3) = [3] - let list_array = return_array().into_array(1); + let list_array = return_array(); let array = array_positions(&[list_array, Arc::new(Int64Array::from_value(3, 1))]) .expect("failed to initialize function array_position"); @@ -2664,7 +2632,7 @@ mod tests { #[test] fn test_array_remove() { // array_remove([3, 1, 2, 3, 2, 3], 3) = [1, 2, 3, 2, 3] - let list_array = return_array_with_repeating_elements().into_array(1); + let list_array = return_array_with_repeating_elements(); let array = array_remove(&[list_array, Arc::new(Int64Array::from_value(3, 1))]) .expect("failed to initialize function array_remove"); let result = @@ -2688,8 +2656,8 @@ mod tests { // [[1, 2, 3, 4], [5, 6, 7, 8], [1, 2, 3, 4], [9, 10, 11, 12], [5, 6, 7, 
8]], // [1, 2, 3, 4], // ) = [[5, 6, 7, 8], [1, 2, 3, 4], [9, 10, 11, 12], [5, 6, 7, 8]] - let list_array = return_nested_array_with_repeating_elements().into_array(1); - let element_array = return_array().into_array(1); + let list_array = return_nested_array_with_repeating_elements(); + let element_array = return_array(); let array = array_remove(&[list_array, element_array]) .expect("failed to initialize function array_remove"); let result = @@ -2717,7 +2685,7 @@ mod tests { #[test] fn test_array_remove_n() { // array_remove_n([3, 1, 2, 3, 2, 3], 3, 2) = [1, 2, 2, 3] - let list_array = return_array_with_repeating_elements().into_array(1); + let list_array = return_array_with_repeating_elements(); let array = array_remove_n(&[ list_array, Arc::new(Int64Array::from_value(3, 1)), @@ -2746,8 +2714,8 @@ mod tests { // [1, 2, 3, 4], // 3, // ) = [[5, 6, 7, 8], [9, 10, 11, 12], [5, 6, 7, 8]] - let list_array = return_nested_array_with_repeating_elements().into_array(1); - let element_array = return_array().into_array(1); + let list_array = return_nested_array_with_repeating_elements(); + let element_array = return_array(); let array = array_remove_n(&[ list_array, element_array, @@ -2778,7 +2746,7 @@ mod tests { #[test] fn test_array_remove_all() { // array_remove_all([3, 1, 2, 3, 2, 3], 3) = [1, 2, 2] - let list_array = return_array_with_repeating_elements().into_array(1); + let list_array = return_array_with_repeating_elements(); let array = array_remove_all(&[list_array, Arc::new(Int64Array::from_value(3, 1))]) .expect("failed to initialize function array_remove_all"); @@ -2803,8 +2771,8 @@ mod tests { // [[1, 2, 3, 4], [5, 6, 7, 8], [1, 2, 3, 4], [9, 10, 11, 12], [5, 6, 7, 8]], // [1, 2, 3, 4], // ) = [[5, 6, 7, 8], [9, 10, 11, 12], [5, 6, 7, 8]] - let list_array = return_nested_array_with_repeating_elements().into_array(1); - let element_array = return_array().into_array(1); + let list_array = return_nested_array_with_repeating_elements(); + let element_array = return_array(); let array = array_remove_all(&[list_array, element_array]) .expect("failed to initialize function array_remove_all"); let result = as_list_array(&array) @@ -2831,7 +2799,7 @@ mod tests { #[test] fn test_array_replace() { // array_replace([3, 1, 2, 3, 2, 3], 3, 4) = [4, 1, 2, 3, 2, 3] - let list_array = return_array_with_repeating_elements().into_array(1); + let list_array = return_array_with_repeating_elements(); let array = array_replace(&[ list_array, Arc::new(Int64Array::from_value(3, 1)), @@ -2860,9 +2828,9 @@ mod tests { // [1, 2, 3, 4], // [11, 12, 13, 14], // ) = [[11, 12, 13, 14], [5, 6, 7, 8], [1, 2, 3, 4], [9, 10, 11, 12], [5, 6, 7, 8]] - let list_array = return_nested_array_with_repeating_elements().into_array(1); - let from_array = return_array().into_array(1); - let to_array = return_extra_array().into_array(1); + let list_array = return_nested_array_with_repeating_elements(); + let from_array = return_array(); + let to_array = return_extra_array(); let array = array_replace(&[list_array, from_array, to_array]) .expect("failed to initialize function array_replace"); let result = @@ -2891,7 +2859,7 @@ mod tests { #[test] fn test_array_replace_n() { // array_replace_n([3, 1, 2, 3, 2, 3], 3, 4, 2) = [4, 1, 2, 4, 2, 3] - let list_array = return_array_with_repeating_elements().into_array(1); + let list_array = return_array_with_repeating_elements(); let array = array_replace_n(&[ list_array, Arc::new(Int64Array::from_value(3, 1)), @@ -2922,9 +2890,9 @@ mod tests { // [11, 12, 13, 14], // 2, // ) = [[11, 12, 
13, 14], [5, 6, 7, 8], [11, 12, 13, 14], [9, 10, 11, 12], [5, 6, 7, 8]] - let list_array = return_nested_array_with_repeating_elements().into_array(1); - let from_array = return_array().into_array(1); - let to_array = return_extra_array().into_array(1); + let list_array = return_nested_array_with_repeating_elements(); + let from_array = return_array(); + let to_array = return_extra_array(); let array = array_replace_n(&[ list_array, from_array, @@ -2958,7 +2926,7 @@ mod tests { #[test] fn test_array_replace_all() { // array_replace_all([3, 1, 2, 3, 2, 3], 3, 4) = [4, 1, 2, 4, 2, 4] - let list_array = return_array_with_repeating_elements().into_array(1); + let list_array = return_array_with_repeating_elements(); let array = array_replace_all(&[ list_array, Arc::new(Int64Array::from_value(3, 1)), @@ -2987,9 +2955,9 @@ mod tests { // [1, 2, 3, 4], // [11, 12, 13, 14], // ) = [[11, 12, 13, 14], [5, 6, 7, 8], [11, 12, 13, 14], [9, 10, 11, 12], [5, 6, 7, 8]] - let list_array = return_nested_array_with_repeating_elements().into_array(1); - let from_array = return_array().into_array(1); - let to_array = return_extra_array().into_array(1); + let list_array = return_nested_array_with_repeating_elements(); + let from_array = return_array(); + let to_array = return_extra_array(); let array = array_replace_all(&[list_array, from_array, to_array]) .expect("failed to initialize function array_replace_all"); let result = as_list_array(&array) @@ -3041,7 +3009,7 @@ mod tests { #[test] fn test_nested_array_repeat() { // array_repeat([1, 2, 3, 4], 3) = [[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]] - let element = return_array().into_array(1); + let element = return_array(); let array = array_repeat(&[element, Arc::new(Int64Array::from_value(3, 1))]) .expect("failed to initialize function array_repeat"); let result = @@ -3067,25 +3035,25 @@ mod tests { #[test] fn test_array_to_string() { // array_to_string([1, 2, 3, 4], ',') = 1,2,3,4 - let list_array = return_array().into_array(1); + let list_array = return_array(); let array = array_to_string(&[list_array, Arc::new(StringArray::from(vec![Some(",")]))]) .expect("failed to initialize function array_to_string"); - let result = as_generic_string_array::(&array) + let result = as_string_array(&array) .expect("failed to initialize function array_to_string"); assert_eq!(result.len(), 1); assert_eq!("1,2,3,4", result.value(0)); // array_to_string([1, NULL, 3, NULL], ',', '*') = 1,*,3,* - let list_array = return_array_with_nulls().into_array(1); + let list_array = return_array_with_nulls(); let array = array_to_string(&[ list_array, Arc::new(StringArray::from(vec![Some(",")])), Arc::new(StringArray::from(vec![Some("*")])), ]) .expect("failed to initialize function array_to_string"); - let result = as_generic_string_array::(&array) + let result = as_string_array(&array) .expect("failed to initialize function array_to_string"); assert_eq!(result.len(), 1); @@ -3095,25 +3063,25 @@ mod tests { #[test] fn test_nested_array_to_string() { // array_to_string([[1, 2, 3, 4], [5, 6, 7, 8]], '-') = 1-2-3-4-5-6-7-8 - let list_array = return_nested_array().into_array(1); + let list_array = return_nested_array(); let array = array_to_string(&[list_array, Arc::new(StringArray::from(vec![Some("-")]))]) .expect("failed to initialize function array_to_string"); - let result = as_generic_string_array::(&array) + let result = as_string_array(&array) .expect("failed to initialize function array_to_string"); assert_eq!(result.len(), 1); assert_eq!("1-2-3-4-5-6-7-8", result.value(0)); // 
array_to_string([[1, NULL, 3, NULL], [NULL, 6, 7, NULL]], '-', '*') = 1-*-3-*-*-6-7-* - let list_array = return_nested_array_with_nulls().into_array(1); + let list_array = return_nested_array_with_nulls(); let array = array_to_string(&[ list_array, Arc::new(StringArray::from(vec![Some("-")])), Arc::new(StringArray::from(vec![Some("*")])), ]) .expect("failed to initialize function array_to_string"); - let result = as_generic_string_array::(&array) + let result = as_string_array(&array) .expect("failed to initialize function array_to_string"); assert_eq!(result.len(), 1); @@ -3123,7 +3091,7 @@ mod tests { #[test] fn test_cardinality() { // cardinality([1, 2, 3, 4]) = 4 - let list_array = return_array().into_array(1); + let list_array = return_array(); let arr = cardinality(&[list_array]) .expect("failed to initialize function cardinality"); let result = @@ -3135,7 +3103,7 @@ mod tests { #[test] fn test_nested_cardinality() { // cardinality([[1, 2, 3, 4], [5, 6, 7, 8]]) = 8 - let list_array = return_nested_array().into_array(1); + let list_array = return_nested_array(); let arr = cardinality(&[list_array]) .expect("failed to initialize function cardinality"); let result = @@ -3147,7 +3115,7 @@ mod tests { #[test] fn test_array_length() { // array_length([1, 2, 3, 4]) = 4 - let list_array = return_array().into_array(1); + let list_array = return_array(); let arr = array_length(&[list_array.clone()]) .expect("failed to initialize function array_ndims"); let result = @@ -3166,7 +3134,7 @@ mod tests { #[test] fn test_nested_array_length() { - let list_array = return_nested_array().into_array(1); + let list_array = return_nested_array(); // array_length([[1, 2, 3, 4], [5, 6, 7, 8]]) = 2 let arr = array_length(&[list_array.clone()]) @@ -3206,7 +3174,7 @@ mod tests { #[test] fn test_array_dims() { // array_dims([1, 2, 3, 4]) = [4] - let list_array = return_array().into_array(1); + let list_array = return_array(); let array = array_dims(&[list_array]).expect("failed to initialize function array_dims"); @@ -3227,7 +3195,7 @@ mod tests { #[test] fn test_nested_array_dims() { // array_dims([[1, 2, 3, 4], [5, 6, 7, 8]]) = [2, 4] - let list_array = return_nested_array().into_array(1); + let list_array = return_nested_array(); let array = array_dims(&[list_array]).expect("failed to initialize function array_dims"); @@ -3248,7 +3216,7 @@ mod tests { #[test] fn test_array_ndims() { // array_ndims([1, 2, 3, 4]) = 1 - let list_array = return_array().into_array(1); + let list_array = return_array(); let array = array_ndims(&[list_array]) .expect("failed to initialize function array_ndims"); @@ -3261,7 +3229,7 @@ mod tests { #[test] fn test_nested_array_ndims() { // array_ndims([[1, 2, 3, 4], [5, 6, 7, 8]]) = 2 - let list_array = return_nested_array().into_array(1); + let list_array = return_nested_array(); let array = array_ndims(&[list_array]) .expect("failed to initialize function array_ndims"); @@ -3285,152 +3253,137 @@ mod tests { assert_eq!(array.unwrap_err().strip_backtrace(), "Error during planning: array_append received incompatible types: '[Int64, Utf8]'."); } - fn return_array() -> ColumnarValue { + fn return_array() -> ArrayRef { // Returns: [1, 2, 3, 4] let args = [ - ColumnarValue::Scalar(ScalarValue::Int64(Some(1))), - ColumnarValue::Scalar(ScalarValue::Int64(Some(2))), - ColumnarValue::Scalar(ScalarValue::Int64(Some(3))), - ColumnarValue::Scalar(ScalarValue::Int64(Some(4))), + Arc::new(Int64Array::from(vec![Some(1)])) as ArrayRef, + Arc::new(Int64Array::from(vec![Some(2)])) as ArrayRef, + 
Arc::new(Int64Array::from(vec![Some(3)])) as ArrayRef, + Arc::new(Int64Array::from(vec![Some(4)])) as ArrayRef, ]; - let result = array(&args).expect("failed to initialize function array"); - ColumnarValue::Array(result.clone()) + make_array(&args).expect("failed to initialize function array") } - fn return_extra_array() -> ColumnarValue { + fn return_extra_array() -> ArrayRef { // Returns: [11, 12, 13, 14] let args = [ - ColumnarValue::Scalar(ScalarValue::Int64(Some(11))), - ColumnarValue::Scalar(ScalarValue::Int64(Some(12))), - ColumnarValue::Scalar(ScalarValue::Int64(Some(13))), - ColumnarValue::Scalar(ScalarValue::Int64(Some(14))), + Arc::new(Int64Array::from(vec![Some(11)])) as ArrayRef, + Arc::new(Int64Array::from(vec![Some(12)])) as ArrayRef, + Arc::new(Int64Array::from(vec![Some(13)])) as ArrayRef, + Arc::new(Int64Array::from(vec![Some(14)])) as ArrayRef, ]; - let result = array(&args).expect("failed to initialize function array"); - ColumnarValue::Array(result.clone()) + make_array(&args).expect("failed to initialize function array") } - fn return_nested_array() -> ColumnarValue { + fn return_nested_array() -> ArrayRef { // Returns: [[1, 2, 3, 4], [5, 6, 7, 8]] let args = [ - ColumnarValue::Scalar(ScalarValue::Int64(Some(1))), - ColumnarValue::Scalar(ScalarValue::Int64(Some(2))), - ColumnarValue::Scalar(ScalarValue::Int64(Some(3))), - ColumnarValue::Scalar(ScalarValue::Int64(Some(4))), + Arc::new(Int64Array::from(vec![Some(1)])) as ArrayRef, + Arc::new(Int64Array::from(vec![Some(2)])) as ArrayRef, + Arc::new(Int64Array::from(vec![Some(3)])) as ArrayRef, + Arc::new(Int64Array::from(vec![Some(4)])) as ArrayRef, ]; - let arr1 = array(&args).expect("failed to initialize function array"); + let arr1 = make_array(&args).expect("failed to initialize function array"); let args = [ - ColumnarValue::Scalar(ScalarValue::Int64(Some(5))), - ColumnarValue::Scalar(ScalarValue::Int64(Some(6))), - ColumnarValue::Scalar(ScalarValue::Int64(Some(7))), - ColumnarValue::Scalar(ScalarValue::Int64(Some(8))), + Arc::new(Int64Array::from(vec![Some(5)])) as ArrayRef, + Arc::new(Int64Array::from(vec![Some(6)])) as ArrayRef, + Arc::new(Int64Array::from(vec![Some(7)])) as ArrayRef, + Arc::new(Int64Array::from(vec![Some(8)])) as ArrayRef, ]; - let arr2 = array(&args).expect("failed to initialize function array"); + let arr2 = make_array(&args).expect("failed to initialize function array"); - let args = [ColumnarValue::Array(arr1), ColumnarValue::Array(arr2)]; - let result = array(&args).expect("failed to initialize function array"); - ColumnarValue::Array(result.clone()) + make_array(&[arr1, arr2]).expect("failed to initialize function array") } - fn return_array_with_nulls() -> ColumnarValue { + fn return_array_with_nulls() -> ArrayRef { // Returns: [1, NULL, 3, NULL] let args = [ - ColumnarValue::Scalar(ScalarValue::Int64(Some(1))), - ColumnarValue::Scalar(ScalarValue::Null), - ColumnarValue::Scalar(ScalarValue::Int64(Some(3))), - ColumnarValue::Scalar(ScalarValue::Null), + Arc::new(Int64Array::from(vec![Some(1)])) as ArrayRef, + Arc::new(Int64Array::from(vec![None])) as ArrayRef, + Arc::new(Int64Array::from(vec![Some(3)])) as ArrayRef, + Arc::new(Int64Array::from(vec![None])) as ArrayRef, ]; - let result = array(&args).expect("failed to initialize function array"); - ColumnarValue::Array(result.clone()) + make_array(&args).expect("failed to initialize function array") } - fn return_nested_array_with_nulls() -> ColumnarValue { + fn return_nested_array_with_nulls() -> ArrayRef { // Returns: [[1, NULL, 3, 
NULL], [NULL, 6, 7, NULL]] let args = [ - ColumnarValue::Scalar(ScalarValue::Int64(Some(1))), - ColumnarValue::Scalar(ScalarValue::Null), - ColumnarValue::Scalar(ScalarValue::Int64(Some(3))), - ColumnarValue::Scalar(ScalarValue::Null), + Arc::new(Int64Array::from(vec![Some(1)])) as ArrayRef, + Arc::new(Int64Array::from(vec![None])) as ArrayRef, + Arc::new(Int64Array::from(vec![Some(3)])) as ArrayRef, + Arc::new(Int64Array::from(vec![None])) as ArrayRef, ]; - let arr1 = array(&args).expect("failed to initialize function array"); + let arr1 = make_array(&args).expect("failed to initialize function array"); let args = [ - ColumnarValue::Scalar(ScalarValue::Null), - ColumnarValue::Scalar(ScalarValue::Int64(Some(6))), - ColumnarValue::Scalar(ScalarValue::Int64(Some(7))), - ColumnarValue::Scalar(ScalarValue::Null), + Arc::new(Int64Array::from(vec![None])) as ArrayRef, + Arc::new(Int64Array::from(vec![Some(6)])) as ArrayRef, + Arc::new(Int64Array::from(vec![Some(7)])) as ArrayRef, + Arc::new(Int64Array::from(vec![None])) as ArrayRef, ]; - let arr2 = array(&args).expect("failed to initialize function array"); + let arr2 = make_array(&args).expect("failed to initialize function array"); - let args = [ColumnarValue::Array(arr1), ColumnarValue::Array(arr2)]; - let result = array(&args).expect("failed to initialize function array"); - ColumnarValue::Array(result.clone()) + make_array(&[arr1, arr2]).expect("failed to initialize function array") } - fn return_array_with_repeating_elements() -> ColumnarValue { + fn return_array_with_repeating_elements() -> ArrayRef { // Returns: [3, 1, 2, 3, 2, 3] let args = [ - ColumnarValue::Scalar(ScalarValue::Int64(Some(3))), - ColumnarValue::Scalar(ScalarValue::Int64(Some(1))), - ColumnarValue::Scalar(ScalarValue::Int64(Some(2))), - ColumnarValue::Scalar(ScalarValue::Int64(Some(3))), - ColumnarValue::Scalar(ScalarValue::Int64(Some(2))), - ColumnarValue::Scalar(ScalarValue::Int64(Some(3))), + Arc::new(Int64Array::from(vec![Some(3)])) as ArrayRef, + Arc::new(Int64Array::from(vec![Some(1)])) as ArrayRef, + Arc::new(Int64Array::from(vec![Some(2)])) as ArrayRef, + Arc::new(Int64Array::from(vec![Some(3)])) as ArrayRef, + Arc::new(Int64Array::from(vec![Some(2)])) as ArrayRef, + Arc::new(Int64Array::from(vec![Some(3)])) as ArrayRef, ]; - let result = array(&args).expect("failed to initialize function array"); - ColumnarValue::Array(result.clone()) + make_array(&args).expect("failed to initialize function array") } - fn return_nested_array_with_repeating_elements() -> ColumnarValue { + fn return_nested_array_with_repeating_elements() -> ArrayRef { // Returns: [[1, 2, 3, 4], [5, 6, 7, 8], [1, 2, 3, 4], [9, 10, 11, 12], [5, 6, 7, 8]] let args = [ - ColumnarValue::Scalar(ScalarValue::Int64(Some(1))), - ColumnarValue::Scalar(ScalarValue::Int64(Some(2))), - ColumnarValue::Scalar(ScalarValue::Int64(Some(3))), - ColumnarValue::Scalar(ScalarValue::Int64(Some(4))), + Arc::new(Int64Array::from(vec![Some(1)])) as ArrayRef, + Arc::new(Int64Array::from(vec![Some(2)])) as ArrayRef, + Arc::new(Int64Array::from(vec![Some(3)])) as ArrayRef, + Arc::new(Int64Array::from(vec![Some(4)])) as ArrayRef, ]; - let arr1 = array(&args).expect("failed to initialize function array"); + let arr1 = make_array(&args).expect("failed to initialize function array"); let args = [ - ColumnarValue::Scalar(ScalarValue::Int64(Some(5))), - ColumnarValue::Scalar(ScalarValue::Int64(Some(6))), - ColumnarValue::Scalar(ScalarValue::Int64(Some(7))), - ColumnarValue::Scalar(ScalarValue::Int64(Some(8))), + 
Arc::new(Int64Array::from(vec![Some(5)])) as ArrayRef, + Arc::new(Int64Array::from(vec![Some(6)])) as ArrayRef, + Arc::new(Int64Array::from(vec![Some(7)])) as ArrayRef, + Arc::new(Int64Array::from(vec![Some(8)])) as ArrayRef, ]; - let arr2 = array(&args).expect("failed to initialize function array"); + let arr2 = make_array(&args).expect("failed to initialize function array"); let args = [ - ColumnarValue::Scalar(ScalarValue::Int64(Some(1))), - ColumnarValue::Scalar(ScalarValue::Int64(Some(2))), - ColumnarValue::Scalar(ScalarValue::Int64(Some(3))), - ColumnarValue::Scalar(ScalarValue::Int64(Some(4))), + Arc::new(Int64Array::from(vec![Some(1)])) as ArrayRef, + Arc::new(Int64Array::from(vec![Some(2)])) as ArrayRef, + Arc::new(Int64Array::from(vec![Some(3)])) as ArrayRef, + Arc::new(Int64Array::from(vec![Some(4)])) as ArrayRef, ]; - let arr3 = array(&args).expect("failed to initialize function array"); + let arr3 = make_array(&args).expect("failed to initialize function array"); let args = [ - ColumnarValue::Scalar(ScalarValue::Int64(Some(9))), - ColumnarValue::Scalar(ScalarValue::Int64(Some(10))), - ColumnarValue::Scalar(ScalarValue::Int64(Some(11))), - ColumnarValue::Scalar(ScalarValue::Int64(Some(12))), + Arc::new(Int64Array::from(vec![Some(9)])) as ArrayRef, + Arc::new(Int64Array::from(vec![Some(10)])) as ArrayRef, + Arc::new(Int64Array::from(vec![Some(11)])) as ArrayRef, + Arc::new(Int64Array::from(vec![Some(12)])) as ArrayRef, ]; - let arr4 = array(&args).expect("failed to initialize function array"); + let arr4 = make_array(&args).expect("failed to initialize function array"); let args = [ - ColumnarValue::Scalar(ScalarValue::Int64(Some(5))), - ColumnarValue::Scalar(ScalarValue::Int64(Some(6))), - ColumnarValue::Scalar(ScalarValue::Int64(Some(7))), - ColumnarValue::Scalar(ScalarValue::Int64(Some(8))), + Arc::new(Int64Array::from(vec![Some(5)])) as ArrayRef, + Arc::new(Int64Array::from(vec![Some(6)])) as ArrayRef, + Arc::new(Int64Array::from(vec![Some(7)])) as ArrayRef, + Arc::new(Int64Array::from(vec![Some(8)])) as ArrayRef, ]; - let arr5 = array(&args).expect("failed to initialize function array"); + let arr5 = make_array(&args).expect("failed to initialize function array"); - let args = [ - ColumnarValue::Array(arr1), - ColumnarValue::Array(arr2), - ColumnarValue::Array(arr3), - ColumnarValue::Array(arr4), - ColumnarValue::Array(arr5), - ]; - let result = array(&args).expect("failed to initialize function array"); - ColumnarValue::Array(result.clone()) + make_array(&[arr1, arr2, arr3, arr4, arr5]) + .expect("failed to initialize function array") } } From b02fe5bbd7854bf186c54c542b80ab75ac323c38 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sat, 28 Oct 2023 11:40:34 -0400 Subject: [PATCH 27/32] Minor: simplify update code (#7943) --- datafusion/sql/src/statement.rs | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/datafusion/sql/src/statement.rs b/datafusion/sql/src/statement.rs index b5196c086638..80a27db6e63d 100644 --- a/datafusion/sql/src/statement.rs +++ b/datafusion/sql/src/statement.rs @@ -36,7 +36,6 @@ use datafusion_common::{ Result, SchemaReference, TableReference, ToDFSchema, }; use datafusion_expr::dml::{CopyOptions, CopyTo}; -use datafusion_expr::expr::Placeholder; use datafusion_expr::expr_rewriter::normalize_col_with_schemas_and_ambiguity_check; use datafusion_expr::logical_plan::builder::project; use datafusion_expr::logical_plan::DdlStatement; @@ -1025,20 +1024,13 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { // 
Projection let mut exprs = vec![]; for (col_name, expr, dt) in values_and_types.into_iter() { - let expr = self.sql_to_expr(expr, &table_schema, &mut planner_context)?; - let expr = match expr { - datafusion_expr::Expr::Placeholder(Placeholder { - ref id, - ref data_type, - }) => match data_type { - None => datafusion_expr::Expr::Placeholder(Placeholder::new( - id.clone(), - Some(dt.clone()), - )), - Some(_) => expr, - }, - _ => expr, - }; + let mut expr = self.sql_to_expr(expr, &table_schema, &mut planner_context)?; + // Update placeholder's datatype to the type of the target column + if let datafusion_expr::Expr::Placeholder(placeholder) = &mut expr { + placeholder.data_type = + placeholder.data_type.take().or_else(|| Some(dt.clone())); + } + // Cast to target column type, if necessary let expr = expr.cast_to(dt, source.schema())?.alias(col_name); exprs.push(expr); } From 9ee055a3f59ec08de3432a3ec3de8ff55d29975a Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sat, 28 Oct 2023 09:45:16 -0600 Subject: [PATCH 28/32] Add some initial content about creating logical plans (#7952) --- Cargo.toml | 1 + .../src/datasource/default_table_source.rs | 6 +- datafusion/core/src/prelude.rs | 2 +- dev/update_datafusion_versions.py | 1 + docs/Cargo.toml | 32 +++++ .../building-logical-plans.md | 129 +++++++++++++++++- docs/src/lib.rs | 19 +++ docs/src/library_logical_plan.rs | 78 +++++++++++ 8 files changed, 264 insertions(+), 4 deletions(-) create mode 100644 docs/Cargo.toml create mode 100644 docs/src/lib.rs create mode 100644 docs/src/library_logical_plan.rs diff --git a/Cargo.toml b/Cargo.toml index 71088e7fc7ad..77e3c6038ea7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -32,6 +32,7 @@ members = [ "datafusion/substrait", "datafusion/wasmtest", "datafusion-examples", + "docs", "test-utils", "benchmarks", ] diff --git a/datafusion/core/src/datasource/default_table_source.rs b/datafusion/core/src/datasource/default_table_source.rs index f93faa50a9b9..00a9c123ceee 100644 --- a/datafusion/core/src/datasource/default_table_source.rs +++ b/datafusion/core/src/datasource/default_table_source.rs @@ -26,10 +26,12 @@ use arrow::datatypes::SchemaRef; use datafusion_common::{internal_err, Constraints, DataFusionError}; use datafusion_expr::{Expr, TableProviderFilterPushDown, TableSource}; -/// DataFusion default table source, wrapping TableProvider +/// DataFusion default table source, wrapping TableProvider. /// /// This structure adapts a `TableProvider` (physical plan trait) to the `TableSource` -/// (logical plan trait) +/// (logical plan trait) and is necessary because the logical plan is contained in +/// the `datafusion_expr` crate, and is not aware of table providers, which exist in +/// the core `datafusion` crate. pub struct DefaultTableSource { /// table provider pub table_provider: Arc, diff --git a/datafusion/core/src/prelude.rs b/datafusion/core/src/prelude.rs index 7689468e5d13..5cd8b3870f81 100644 --- a/datafusion/core/src/prelude.rs +++ b/datafusion/core/src/prelude.rs @@ -13,7 +13,7 @@ // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations -// under the License.pub}, +// under the License. //! DataFusion "prelude" to simplify importing common types. //! 
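The `DefaultTableSource` comment updated above explains that it adapts the physical `TableProvider` trait to the logical `TableSource` trait, because the `datafusion_expr` crate has no knowledge of table providers. As a hedged illustration (not taken from this patch), the sketch below wraps an arbitrary provider so that logical-planning code can reference it; it assumes the `provider_as_source` helper exported from the same `default_table_source` module touched by this diff, and the exact re-export paths may differ between DataFusion versions.

```rust
use std::sync::Arc;

use datafusion::datasource::default_table_source::provider_as_source;
use datafusion::datasource::TableProvider;
use datafusion::logical_expr::TableSource;

/// Adapt a physical `TableProvider` into a logical `TableSource` by wrapping
/// it in a `DefaultTableSource`, so that logical planning code (which lives in
/// `datafusion_expr`) can refer to the table without depending on providers.
fn as_logical_source(provider: Arc<dyn TableProvider>) -> Arc<dyn TableSource> {
    provider_as_source(provider)
}
```

The physical planner later recovers the provider by downcasting the `TableSource` back to a `DefaultTableSource` (see `source_as_provider` in the same module), which is what makes plans built this way executable.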
diff --git a/dev/update_datafusion_versions.py b/dev/update_datafusion_versions.py index 7cbe39fdfb66..19701b813671 100755 --- a/dev/update_datafusion_versions.py +++ b/dev/update_datafusion_versions.py @@ -43,6 +43,7 @@ 'datafusion-wasmtest': 'datafusion/wasmtest/Cargo.toml', 'datafusion-benchmarks': 'benchmarks/Cargo.toml', 'datafusion-examples': 'datafusion-examples/Cargo.toml', + 'datafusion-docs': 'docs/Cargo.toml', } def update_workspace_version(new_version: str): diff --git a/docs/Cargo.toml b/docs/Cargo.toml new file mode 100644 index 000000000000..9caa0bde3608 --- /dev/null +++ b/docs/Cargo.toml @@ -0,0 +1,32 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +[package] +name = "datafusion-docs-tests" +description = "DataFusion Documentation Tests" +publish = false +version = { workspace = true } +edition = { workspace = true } +readme = { workspace = true } +homepage = { workspace = true } +repository = { workspace = true } +license = { workspace = true } +authors = { workspace = true } +rust-version = "1.70" + +[dependencies] +datafusion = { path = "../datafusion/core", version = "32.0.0", default-features = false } diff --git a/docs/source/library-user-guide/building-logical-plans.md b/docs/source/library-user-guide/building-logical-plans.md index 406f4881129c..fe922d8eaeb1 100644 --- a/docs/source/library-user-guide/building-logical-plans.md +++ b/docs/source/library-user-guide/building-logical-plans.md @@ -19,4 +19,131 @@ # Building Logical Plans -Coming Soon +A logical plan is a structured representation of a database query that describes the high-level operations and +transformations needed to retrieve data from a database or data source. It abstracts away specific implementation +details and focuses on the logical flow of the query, including operations like filtering, sorting, and joining tables. + +This logical plan serves as an intermediate step before generating an optimized physical execution plan. This is +explained in more detail in the [Query Planning and Execution Overview] section of the [Architecture Guide]. + +## Building Logical Plans Manually + +DataFusion's [LogicalPlan] is an enum containing variants representing all the supported operators, and also +contains an `Extension` variant that allows projects building on DataFusion to add custom logical operators. + +It is possible to create logical plans by directly creating instances of the [LogicalPlan] enum as follows, but is is +much easier to use the [LogicalPlanBuilder], which is described in the next section. 
+ +Here is an example of building a logical plan directly: + + + +```rust +// create a logical table source +let schema = Schema::new(vec![ + Field::new("id", DataType::Int32, true), + Field::new("name", DataType::Utf8, true), +]); +let table_source = LogicalTableSource::new(SchemaRef::new(schema)); + +// create a TableScan plan +let projection = None; // optional projection +let filters = vec![]; // optional filters to push down +let fetch = None; // optional LIMIT +let table_scan = LogicalPlan::TableScan(TableScan::try_new( + "person", + Arc::new(table_source), + projection, + filters, + fetch, +)?); + +// create a Filter plan that evaluates `id > 500` that wraps the TableScan +let filter_expr = col("id").gt(lit(500)); +let plan = LogicalPlan::Filter(Filter::try_new(filter_expr, Arc::new(table_scan))?); + +// print the plan +println!("{}", plan.display_indent_schema()); +``` + +This example produces the following plan: + +``` +Filter: person.id > Int32(500) [id:Int32;N, name:Utf8;N] + TableScan: person [id:Int32;N, name:Utf8;N] +``` + +## Building Logical Plans with LogicalPlanBuilder + +DataFusion logical plans can be created using the [LogicalPlanBuilder] struct. There is also a [DataFrame] API which is +a higher-level API that delegates to [LogicalPlanBuilder]. + +The following associated functions can be used to create a new builder: + +- `empty` - create an empty plan with no fields +- `values` - create a plan from a set of literal values +- `scan` - create a plan representing a table scan +- `scan_with_filters` - create a plan representing a table scan with filters + +Once the builder is created, transformation methods can be called to declare that further operations should be +performed on the plan. Note that all we are doing at this stage is building up the logical plan structure. No query +execution will be performed. + +Here are some examples of transformation methods, but for a full list, refer to the [LogicalPlanBuilder] API documentation. + +- `filter` +- `limit` +- `sort` +- `distinct` +- `join` + +The following example demonstrates building the same simple query plan as the previous example, with a table scan followed by a filter. + + + +```rust +// create a logical table source +let schema = Schema::new(vec![ + Field::new("id", DataType::Int32, true), + Field::new("name", DataType::Utf8, true), +]); +let table_source = LogicalTableSource::new(SchemaRef::new(schema)); + +// optional projection +let projection = None; + +// create a LogicalPlanBuilder for a table scan +let builder = LogicalPlanBuilder::scan("person", Arc::new(table_source), projection)?; + +// perform a filter operation and build the plan +let plan = builder + .filter(col("id").gt(lit(500)))? // WHERE id > 500 + .build()?; + +// print the plan +println!("{}", plan.display_indent_schema()); +``` + +This example produces the following plan: + +``` +Filter: person.id > Int32(500) [id:Int32;N, name:Utf8;N] + TableScan: person [id:Int32;N, name:Utf8;N] +``` + +## Table Sources + +The previous example used a [LogicalTableSource], which is used for tests and documentation in DataFusion, and is also +suitable if you are using DataFusion to build logical plans but do not use DataFusion's physical planner. However, if you +want to use a [TableSource] that can be executed in DataFusion then you will need to use [DefaultTableSource], which is a +wrapper for a [TableProvider]. 
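The examples above use `LogicalTableSource`, which cannot be executed. As a hedged, end-to-end sketch (not part of this documentation patch), the program below builds the same `id > 500` plan over an in-memory `MemTable` wrapped via `provider_as_source` and then executes it. It assumes a `tokio` runtime and that `MemTable`, `provider_as_source`, and `DataFrame::new` are available under the paths shown for this DataFusion version.

```rust
use std::sync::Arc;

use datafusion::arrow::array::Int32Array;
use datafusion::arrow::datatypes::{DataType, Field, Schema};
use datafusion::arrow::record_batch::RecordBatch;
use datafusion::dataframe::DataFrame;
use datafusion::datasource::default_table_source::provider_as_source;
use datafusion::datasource::MemTable;
use datafusion::error::Result;
use datafusion::logical_expr::LogicalPlanBuilder;
use datafusion::prelude::*;

#[tokio::main]
async fn main() -> Result<()> {
    // An executable provider: a single in-memory batch with an `id` column.
    let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, true)]));
    let batch = RecordBatch::try_new(
        schema.clone(),
        vec![Arc::new(Int32Array::from(vec![100, 501, 999]))],
    )?;
    let provider = Arc::new(MemTable::try_new(schema, vec![vec![batch]])?);

    // Wrap the provider in a DefaultTableSource so the plan can be executed.
    let source = provider_as_source(provider);

    // Build the same `id > 500` plan as above, but over the executable source.
    let plan = LogicalPlanBuilder::scan("person", source, None)?
        .filter(col("id").gt(lit(500)))?
        .build()?;

    // Hand the plan to a SessionContext for physical planning and execution.
    let ctx = SessionContext::new();
    let df = DataFrame::new(ctx.state(), plan);
    df.show().await?;

    Ok(())
}
```

Because the `TableScan` node carries the wrapped provider, the plan can be executed directly, without first registering the table in the context's catalog.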
+ +[query planning and execution overview]: https://docs.rs/datafusion/latest/datafusion/index.html#query-planning-and-execution-overview +[architecture guide]: https://docs.rs/datafusion/latest/datafusion/index.html#architecture +[logicalplan]: https://docs.rs/datafusion-expr/latest/datafusion_expr/logical_plan/enum.LogicalPlan.html +[logicalplanbuilder]: https://docs.rs/datafusion-expr/latest/datafusion_expr/logical_plan/builder/struct.LogicalPlanBuilder.html +[dataframe]: using-the-dataframe-api.md +[logicaltablesource]: https://docs.rs/datafusion-expr/latest/datafusion_expr/logical_plan/builder/struct.LogicalTableSource.html +[defaulttablesource]: https://docs.rs/datafusion/latest/datafusion/datasource/default_table_source/struct.DefaultTableSource.html +[tableprovider]: https://docs.rs/datafusion/latest/datafusion/datasource/provider/trait.TableProvider.html +[tablesource]: https://docs.rs/datafusion-expr/latest/datafusion_expr/trait.TableSource.html diff --git a/docs/src/lib.rs b/docs/src/lib.rs new file mode 100644 index 000000000000..f73132468ec9 --- /dev/null +++ b/docs/src/lib.rs @@ -0,0 +1,19 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#[cfg(test)] +mod library_logical_plan; diff --git a/docs/src/library_logical_plan.rs b/docs/src/library_logical_plan.rs new file mode 100644 index 000000000000..355003941570 --- /dev/null +++ b/docs/src/library_logical_plan.rs @@ -0,0 +1,78 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use datafusion::arrow::datatypes::{DataType, Field, Schema, SchemaRef}; +use datafusion::error::Result; +use datafusion::logical_expr::builder::LogicalTableSource; +use datafusion::logical_expr::{Filter, LogicalPlan, LogicalPlanBuilder, TableScan}; +use datafusion::prelude::*; +use std::sync::Arc; + +#[test] +fn plan_1() -> Result<()> { + // create a logical table source + let schema = Schema::new(vec![ + Field::new("id", DataType::Int32, true), + Field::new("name", DataType::Utf8, true), + ]); + let table_source = LogicalTableSource::new(SchemaRef::new(schema)); + + // create a TableScan plan + let projection = None; // optional projection + let filters = vec![]; // optional filters to push down + let fetch = None; // optional LIMIT + let table_scan = LogicalPlan::TableScan(TableScan::try_new( + "person", + Arc::new(table_source), + projection, + filters, + fetch, + )?); + + // create a Filter plan that evaluates `id > 500` and wraps the TableScan + let filter_expr = col("id").gt(lit(500)); + let plan = LogicalPlan::Filter(Filter::try_new(filter_expr, Arc::new(table_scan))?); + + // print the plan + println!("{}", plan.display_indent_schema()); + + Ok(()) +} + +#[test] +fn plan_builder_1() -> Result<()> { + // create a logical table source + let schema = Schema::new(vec![ + Field::new("id", DataType::Int32, true), + Field::new("name", DataType::Utf8, true), + ]); + let table_source = LogicalTableSource::new(SchemaRef::new(schema)); + + // optional projection + let projection = None; + + // create a LogicalPlanBuilder for a table scan + let builder = LogicalPlanBuilder::scan("person", Arc::new(table_source), projection)?; + + // perform a filter that evaluates `id > 500`, and build the plan + let plan = builder.filter(col("id").gt(lit(500)))?.build()?; + + // print the plan + println!("{}", plan.display_indent_schema()); + + Ok(()) +} From d24228a6b9552f2dce166836e60c1b928116933f Mon Sep 17 00:00:00 2001 From: yi wang <48236141+my-vegetable-has-exploded@users.noreply.github.com> Date: Sun, 29 Oct 2023 19:34:29 +0800 Subject: [PATCH 29/32] Minor: Change from `&mut SessionContext` to `&SessionContext` in substrait (#7965) * Lower &mut SessionContext in substrait * rm mut ctx in tests --- .../substrait/src/logical_plan/consumer.rs | 4 +-- .../substrait/src/physical_plan/consumer.rs | 2 +- .../tests/cases/roundtrip_logical_plan.rs | 30 +++++++++---------- .../tests/cases/roundtrip_physical_plan.rs | 4 +-- datafusion/substrait/tests/cases/serialize.rs | 4 +-- 5 files changed, 22 insertions(+), 22 deletions(-) diff --git a/datafusion/substrait/src/logical_plan/consumer.rs b/datafusion/substrait/src/logical_plan/consumer.rs index ae65a2c7d94a..a15121652452 100644 --- a/datafusion/substrait/src/logical_plan/consumer.rs +++ b/datafusion/substrait/src/logical_plan/consumer.rs @@ -177,7 +177,7 @@ fn split_eq_and_noneq_join_predicate_with_nulls_equality( /// Convert Substrait Plan to DataFusion DataFrame pub async fn from_substrait_plan( - ctx: &mut SessionContext, + ctx: &SessionContext, plan: &Plan, ) -> Result { // Register function extension @@ -219,7 +219,7 @@ pub async fn from_substrait_plan( /// Convert Substrait Rel to DataFusion DataFrame #[async_recursion] pub async fn from_substrait_rel( - ctx: &mut SessionContext, + ctx: &SessionContext, rel: &Rel, extensions: &HashMap, ) -> Result { diff --git a/datafusion/substrait/src/physical_plan/consumer.rs b/datafusion/substrait/src/physical_plan/consumer.rs index 7788ba0a69de..1dab1f9d5e39 100644 --- 
a/datafusion/substrait/src/physical_plan/consumer.rs +++ b/datafusion/substrait/src/physical_plan/consumer.rs @@ -38,7 +38,7 @@ use substrait::proto::{ /// Convert Substrait Rel to DataFusion ExecutionPlan #[async_recursion] pub async fn from_substrait_rel( - _ctx: &mut SessionContext, + _ctx: &SessionContext, rel: &Rel, _extensions: &HashMap, ) -> Result> { diff --git a/datafusion/substrait/tests/cases/roundtrip_logical_plan.rs b/datafusion/substrait/tests/cases/roundtrip_logical_plan.rs index 32416125de24..ca2b4d48c460 100644 --- a/datafusion/substrait/tests/cases/roundtrip_logical_plan.rs +++ b/datafusion/substrait/tests/cases/roundtrip_logical_plan.rs @@ -606,7 +606,7 @@ async fn new_test_grammar() -> Result<()> { #[tokio::test] async fn extension_logical_plan() -> Result<()> { - let mut ctx = create_context().await?; + let ctx = create_context().await?; let validation_bytes = "MockUserDefinedLogicalPlan".as_bytes().to_vec(); let ext_plan = LogicalPlan::Extension(Extension { node: Arc::new(MockUserDefinedLogicalPlan { @@ -617,7 +617,7 @@ async fn extension_logical_plan() -> Result<()> { }); let proto = to_substrait_plan(&ext_plan, &ctx)?; - let plan2 = from_substrait_plan(&mut ctx, &proto).await?; + let plan2 = from_substrait_plan(&ctx, &proto).await?; let plan1str = format!("{ext_plan:?}"); let plan2str = format!("{plan2:?}"); @@ -712,11 +712,11 @@ async fn verify_post_join_filter_value(proto: Box) -> Result<()> { } async fn assert_expected_plan(sql: &str, expected_plan_str: &str) -> Result<()> { - let mut ctx = create_context().await?; + let ctx = create_context().await?; let df = ctx.sql(sql).await?; let plan = df.into_optimized_plan()?; let proto = to_substrait_plan(&plan, &ctx)?; - let plan2 = from_substrait_plan(&mut ctx, &proto).await?; + let plan2 = from_substrait_plan(&ctx, &proto).await?; let plan2 = ctx.state().optimize(&plan2)?; let plan2str = format!("{plan2:?}"); assert_eq!(expected_plan_str, &plan2str); @@ -724,11 +724,11 @@ async fn assert_expected_plan(sql: &str, expected_plan_str: &str) -> Result<()> } async fn roundtrip_fill_na(sql: &str) -> Result<()> { - let mut ctx = create_context().await?; + let ctx = create_context().await?; let df = ctx.sql(sql).await?; let plan1 = df.into_optimized_plan()?; let proto = to_substrait_plan(&plan1, &ctx)?; - let plan2 = from_substrait_plan(&mut ctx, &proto).await?; + let plan2 = from_substrait_plan(&ctx, &proto).await?; let plan2 = ctx.state().optimize(&plan2)?; // Format plan string and replace all None's with 0 @@ -743,15 +743,15 @@ async fn test_alias(sql_with_alias: &str, sql_no_alias: &str) -> Result<()> { // Since we ignore the SubqueryAlias in the producer, the result should be // the same as producing a Substrait plan from the same query without aliases // sql_with_alias -> substrait -> logical plan = sql_no_alias -> substrait -> logical plan - let mut ctx = create_context().await?; + let ctx = create_context().await?; let df_a = ctx.sql(sql_with_alias).await?; let proto_a = to_substrait_plan(&df_a.into_optimized_plan()?, &ctx)?; - let plan_with_alias = from_substrait_plan(&mut ctx, &proto_a).await?; + let plan_with_alias = from_substrait_plan(&ctx, &proto_a).await?; let df = ctx.sql(sql_no_alias).await?; let proto = to_substrait_plan(&df.into_optimized_plan()?, &ctx)?; - let plan = from_substrait_plan(&mut ctx, &proto).await?; + let plan = from_substrait_plan(&ctx, &proto).await?; println!("{plan_with_alias:#?}"); println!("{plan:#?}"); @@ -763,11 +763,11 @@ async fn test_alias(sql_with_alias: &str, sql_no_alias: &str) 
-> Result<()> { } async fn roundtrip(sql: &str) -> Result<()> { - let mut ctx = create_context().await?; + let ctx = create_context().await?; let df = ctx.sql(sql).await?; let plan = df.into_optimized_plan()?; let proto = to_substrait_plan(&plan, &ctx)?; - let plan2 = from_substrait_plan(&mut ctx, &proto).await?; + let plan2 = from_substrait_plan(&ctx, &proto).await?; let plan2 = ctx.state().optimize(&plan2)?; println!("{plan:#?}"); @@ -780,11 +780,11 @@ async fn roundtrip(sql: &str) -> Result<()> { } async fn roundtrip_verify_post_join_filter(sql: &str) -> Result<()> { - let mut ctx = create_context().await?; + let ctx = create_context().await?; let df = ctx.sql(sql).await?; let plan = df.into_optimized_plan()?; let proto = to_substrait_plan(&plan, &ctx)?; - let plan2 = from_substrait_plan(&mut ctx, &proto).await?; + let plan2 = from_substrait_plan(&ctx, &proto).await?; let plan2 = ctx.state().optimize(&plan2)?; println!("{plan:#?}"); @@ -799,11 +799,11 @@ async fn roundtrip_verify_post_join_filter(sql: &str) -> Result<()> { } async fn roundtrip_all_types(sql: &str) -> Result<()> { - let mut ctx = create_all_type_context().await?; + let ctx = create_all_type_context().await?; let df = ctx.sql(sql).await?; let plan = df.into_optimized_plan()?; let proto = to_substrait_plan(&plan, &ctx)?; - let plan2 = from_substrait_plan(&mut ctx, &proto).await?; + let plan2 = from_substrait_plan(&ctx, &proto).await?; let plan2 = ctx.state().optimize(&plan2)?; println!("{plan:#?}"); diff --git a/datafusion/substrait/tests/cases/roundtrip_physical_plan.rs b/datafusion/substrait/tests/cases/roundtrip_physical_plan.rs index 3e5e757e4c39..b64dd2c138fc 100644 --- a/datafusion/substrait/tests/cases/roundtrip_physical_plan.rs +++ b/datafusion/substrait/tests/cases/roundtrip_physical_plan.rs @@ -62,10 +62,10 @@ async fn parquet_exec() -> Result<()> { let substrait_rel = producer::to_substrait_rel(parquet_exec.as_ref(), &mut extension_info)?; - let mut ctx = SessionContext::new(); + let ctx = SessionContext::new(); let parquet_exec_roundtrip = - consumer::from_substrait_rel(&mut ctx, substrait_rel.as_ref(), &HashMap::new()) + consumer::from_substrait_rel(&ctx, substrait_rel.as_ref(), &HashMap::new()) .await?; let expected = format!("{}", displayable(parquet_exec.as_ref()).indent(true)); diff --git a/datafusion/substrait/tests/cases/serialize.rs b/datafusion/substrait/tests/cases/serialize.rs index d6dc5d7e58f2..f6736ca22279 100644 --- a/datafusion/substrait/tests/cases/serialize.rs +++ b/datafusion/substrait/tests/cases/serialize.rs @@ -30,7 +30,7 @@ mod tests { #[tokio::test] async fn serialize_simple_select() -> Result<()> { - let mut ctx = create_context().await?; + let ctx = create_context().await?; let path = "tests/simple_select.bin"; let sql = "SELECT a, b FROM data"; // Test reference @@ -42,7 +42,7 @@ mod tests { // Read substrait plan from file let proto = serializer::deserialize(path).await?; // Check plan equality - let plan = from_substrait_plan(&mut ctx, &proto).await?; + let plan = from_substrait_plan(&ctx, &proto).await?; let plan_str_ref = format!("{plan_ref:?}"); let plan_str = format!("{plan:?}"); assert_eq!(plan_str_ref, plan_str); From f388a2bbce4b50fa41a81613929880af598ccd04 Mon Sep 17 00:00:00 2001 From: Jeffrey <22608443+Jefffrey@users.noreply.github.com> Date: Sun, 29 Oct 2023 23:49:36 +1100 Subject: [PATCH 30/32] Fix crate READMEs (#7964) --- datafusion-examples/Cargo.toml | 2 +- datafusion/common/Cargo.toml | 2 +- datafusion/expr/Cargo.toml | 2 +- datafusion/optimizer/Cargo.toml | 2 +- 
datafusion/physical-expr/Cargo.toml | 2 +- datafusion/proto/Cargo.toml | 2 +- datafusion/sql/Cargo.toml | 2 +- datafusion/sqllogictest/Cargo.toml | 2 +- datafusion/substrait/Cargo.toml | 2 +- datafusion/wasmtest/Cargo.toml | 2 +- 10 files changed, 10 insertions(+), 10 deletions(-) diff --git a/datafusion-examples/Cargo.toml b/datafusion-examples/Cargo.toml index 8d504f834bc5..7350c4ab981f 100644 --- a/datafusion-examples/Cargo.toml +++ b/datafusion-examples/Cargo.toml @@ -20,9 +20,9 @@ name = "datafusion-examples" description = "DataFusion usage examples" keywords = ["arrow", "query", "sql"] publish = false +readme = "README.md" version = { workspace = true } edition = { workspace = true } -readme = { workspace = true } homepage = { workspace = true } repository = { workspace = true } license = { workspace = true } diff --git a/datafusion/common/Cargo.toml b/datafusion/common/Cargo.toml index 490fbeacad85..87087c50a2d2 100644 --- a/datafusion/common/Cargo.toml +++ b/datafusion/common/Cargo.toml @@ -19,9 +19,9 @@ name = "datafusion-common" description = "Common functionality for DataFusion query engine" keywords = ["arrow", "query", "sql"] +readme = "README.md" version = { workspace = true } edition = { workspace = true } -readme = { workspace = true } homepage = { workspace = true } repository = { workspace = true } license = { workspace = true } diff --git a/datafusion/expr/Cargo.toml b/datafusion/expr/Cargo.toml index c5cf6a1ac11f..a7919a557ad7 100644 --- a/datafusion/expr/Cargo.toml +++ b/datafusion/expr/Cargo.toml @@ -19,9 +19,9 @@ name = "datafusion-expr" description = "Logical plan and expression representation for DataFusion query engine" keywords = ["datafusion", "logical", "plan", "expressions"] +readme = "README.md" version = { workspace = true } edition = { workspace = true } -readme = { workspace = true } homepage = { workspace = true } repository = { workspace = true } license = { workspace = true } diff --git a/datafusion/optimizer/Cargo.toml b/datafusion/optimizer/Cargo.toml index bf786686f474..dce05058b826 100644 --- a/datafusion/optimizer/Cargo.toml +++ b/datafusion/optimizer/Cargo.toml @@ -19,9 +19,9 @@ name = "datafusion-optimizer" description = "DataFusion Query Optimizer" keywords = [ "datafusion", "query", "optimizer" ] +readme = "README.md" version = { workspace = true } edition = { workspace = true } -readme = { workspace = true } homepage = { workspace = true } repository = { workspace = true } license = { workspace = true } diff --git a/datafusion/physical-expr/Cargo.toml b/datafusion/physical-expr/Cargo.toml index 6269f27310a6..f7c0221756fd 100644 --- a/datafusion/physical-expr/Cargo.toml +++ b/datafusion/physical-expr/Cargo.toml @@ -19,9 +19,9 @@ name = "datafusion-physical-expr" description = "Physical expression implementation for DataFusion query engine" keywords = ["arrow", "query", "sql"] +readme = "README.md" version = { workspace = true } edition = { workspace = true } -readme = { workspace = true } homepage = { workspace = true } repository = { workspace = true } license = { workspace = true } diff --git a/datafusion/proto/Cargo.toml b/datafusion/proto/Cargo.toml index 32e10e58a7d7..5e52dadc0b18 100644 --- a/datafusion/proto/Cargo.toml +++ b/datafusion/proto/Cargo.toml @@ -19,9 +19,9 @@ name = "datafusion-proto" description = "Protobuf serialization of DataFusion logical plan expressions" keywords = ["arrow", "query", "sql"] +readme = "README.md" version = { workspace = true } edition = { workspace = true } -readme = { workspace = true } homepage = { 
workspace = true } repository = { workspace = true } license = { workspace = true } diff --git a/datafusion/sql/Cargo.toml b/datafusion/sql/Cargo.toml index a00a7f021352..fe60feb6ab1f 100644 --- a/datafusion/sql/Cargo.toml +++ b/datafusion/sql/Cargo.toml @@ -19,9 +19,9 @@ name = "datafusion-sql" description = "DataFusion SQL Query Planner" keywords = ["datafusion", "sql", "parser", "planner"] +readme = "README.md" version = { workspace = true } edition = { workspace = true } -readme = { workspace = true } homepage = { workspace = true } repository = { workspace = true } license = { workspace = true } diff --git a/datafusion/sqllogictest/Cargo.toml b/datafusion/sqllogictest/Cargo.toml index 454f99942f52..f1a730351417 100644 --- a/datafusion/sqllogictest/Cargo.toml +++ b/datafusion/sqllogictest/Cargo.toml @@ -21,7 +21,7 @@ edition = { workspace = true } homepage = { workspace = true } license = { workspace = true } name = "datafusion-sqllogictest" -readme = { workspace = true } +readme = "README.md" repository = { workspace = true } rust-version = { workspace = true } version = { workspace = true } diff --git a/datafusion/substrait/Cargo.toml b/datafusion/substrait/Cargo.toml index 7c4ff868cfcd..67d31caca260 100644 --- a/datafusion/substrait/Cargo.toml +++ b/datafusion/substrait/Cargo.toml @@ -18,9 +18,9 @@ [package] name = "datafusion-substrait" description = "DataFusion Substrait Producer and Consumer" +readme = "README.md" version = { workspace = true } edition = { workspace = true } -readme = { workspace = true } homepage = { workspace = true } repository = { workspace = true } license = { workspace = true } diff --git a/datafusion/wasmtest/Cargo.toml b/datafusion/wasmtest/Cargo.toml index e1a9a5d41a5a..3195c989e223 100644 --- a/datafusion/wasmtest/Cargo.toml +++ b/datafusion/wasmtest/Cargo.toml @@ -18,9 +18,9 @@ [package] name = "datafusion-wasmtest" description = "Test library to compile datafusion crates to wasm" +readme = "README.md" version = { workspace = true } edition = { workspace = true } -readme = { workspace = true } homepage = { workspace = true } repository = { workspace = true } license = { workspace = true } From 4a91ce91e2e82ed33142405a00f619cc714a5a30 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sun, 29 Oct 2023 08:50:10 -0400 Subject: [PATCH 31/32] Minor: Improve `HashJoinExec` documentation (#7953) * Minor: Improve `HashJoinExec` documentation * Apply suggestions from code review Co-authored-by: Liang-Chi Hsieh --------- Co-authored-by: Liang-Chi Hsieh --- .../physical-plan/src/joins/hash_join.rs | 138 ++++++++++++++++-- 1 file changed, 126 insertions(+), 12 deletions(-) diff --git a/datafusion/physical-plan/src/joins/hash_join.rs b/datafusion/physical-plan/src/joins/hash_join.rs index 9aa776fe054c..dc0e81a6f36e 100644 --- a/datafusion/physical-plan/src/joins/hash_join.rs +++ b/datafusion/physical-plan/src/joins/hash_join.rs @@ -15,8 +15,7 @@ // specific language governing permissions and limitations // under the License. -//! Defines the join plan for executing partitions in parallel and then joining the results -//! into a set of partitions. +//! [`HashJoinExec`] Partitioned Hash Join Operator use std::fmt; use std::mem::size_of; @@ -78,29 +77,140 @@ use futures::{ready, Stream, StreamExt, TryStreamExt}; type JoinLeftData = (JoinHashMap, RecordBatch, MemoryReservation); -/// Join execution plan executes partitions in parallel and combines them into a set of -/// partitions. 
+/// Join execution plan: Evaluates equijoin predicates in parallel on multiple
+/// partitions using a hash table and an optional filter list to apply post
+/// join.
 ///
-/// Filter expression expected to contain non-equality predicates that can not be pushed
-/// down to any of join inputs.
-/// In case of outer join, filter applied to only matched rows.
+/// # Join Expressions
+///
+/// This implementation is optimized for evaluating equijoin predicates (
+/// `<col1> = <col2>`) expressions, which are represented as a list of `Columns`
+/// in [`Self::on`].
+///
+/// Non-equality predicates, which cannot be pushed down to the join inputs (e.g.
+/// `<col1> != <col2>`) are known as "filter expressions" and are evaluated
+/// after the equijoin predicates.
+///
+/// # "Build Side" vs "Probe Side"
+///
+/// HashJoin takes two inputs, which are referred to as the "build" and the
+/// "probe". The build side is the first child, and the probe side is the second
+/// child.
+///
+/// The two inputs are treated differently and it is VERY important that the
+/// *smaller* input is placed on the build side to minimize the work of creating
+/// the hash table.
+///
+/// ```text
+///           ┌───────────┐
+///           │ HashJoin  │
+///           │           │
+///           └───────────┘
+///               │   │
+///        ┌──────┘   └──────┐
+///        ▼                 ▼
+///  ┌────────────┐   ┌─────────────┐
+///  │   Input    │   │    Input    │
+///  │    [0]     │   │     [1]     │
+///  └────────────┘   └─────────────┘
+///
+///   "build side"     "probe side"
+/// ```
+///
+/// Execution proceeds in 2 stages:
+///
+/// 1. the **build phase** where a hash table is created from the tuples of the
+/// build side.
+///
+/// 2. the **probe phase** where the tuples of the probe side are streamed
+/// through, checking for matches of the join keys in the hash table.
+///
+/// ```text
+///                  ┌────────────────┐          ┌────────────────┐
+///                  │ ┌─────────┐    │          │ ┌─────────┐    │
+///                  │ │  Hash   │    │          │ │  Hash   │    │
+///                  │ │  Table  │    │          │ │  Table  │    │
+///                  │ │(keys are│    │          │ │(keys are│    │
+///                  │ │equi join│    │          │ │equi join│    │  Stage 2: batches from
+///  Stage 1: the    │ │columns) │    │          │ │columns) │    │  the probe side are
+///  *entire* build  │ │         │    │          │ │         │    │  streamed through, and
+///  side is read    │ └─────────┘    │          │ └─────────┘    │  checked against the
+///  into the hash   │      ▲         │          │          ▲     │  contents of the hash
+///  table           │   HashJoin     │          │   HashJoin     │  table
+///                  └──────┼─────────┘          └──────────┼─────┘
+///                             ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─
+///                         │                               │
+///
+///                         │                               │
+///                  ┌────────────┐                  ┌────────────┐
+///                  │RecordBatch │                  │RecordBatch │
+///                  └────────────┘                  └────────────┘
+///                  ┌────────────┐                  ┌────────────┐
+///                  │RecordBatch │                  │RecordBatch │
+///                  └────────────┘                  └────────────┘
+///                        ...                             ...
+///                  ┌────────────┐                  ┌────────────┐
+///                  │RecordBatch │                  │RecordBatch │
+///                  └────────────┘                  └────────────┘
+///
+///                    build side                      probe side
+///
+/// ```
+///
+/// # Example "Optimal" Plans
+///
+/// The differences in the inputs mean that for classic "Star Schema Query",
+/// the optimal plan will be a **"Right Deep Tree"**. A Star Schema Query is
+/// one where there is one large table and several smaller "dimension" tables,
+/// joined on `Foreign Key = Primary Key` predicates.
+/// +/// A "Right Deep Tree" looks like this large table as the probe side on the +/// lowest join: +/// +/// ```text +/// ┌───────────┐ +/// │ HashJoin │ +/// │ │ +/// └───────────┘ +/// │ │ +/// ┌───────┘ └──────────┐ +/// ▼ ▼ +/// ┌───────────────┐ ┌───────────┐ +/// │ small table 1 │ │ HashJoin │ +/// │ "dimension" │ │ │ +/// └───────────────┘ └───┬───┬───┘ +/// ┌──────────┘ └───────┐ +/// │ │ +/// ▼ ▼ +/// ┌───────────────┐ ┌───────────┐ +/// │ small table 2 │ │ HashJoin │ +/// │ "dimension" │ │ │ +/// └───────────────┘ └───┬───┬───┘ +/// ┌────────┘ └────────┐ +/// │ │ +/// ▼ ▼ +/// ┌───────────────┐ ┌───────────────┐ +/// │ small table 3 │ │ large table │ +/// │ "dimension" │ │ "fact" │ +/// └───────────────┘ └───────────────┘ +/// ``` #[derive(Debug)] pub struct HashJoinExec { /// left (build) side which gets hashed pub left: Arc, /// right (probe) side which are filtered by the hash table pub right: Arc, - /// Set of common columns used to join on + /// Set of equijoin columns from the relations: `(left_col, right_col)` pub on: Vec<(Column, Column)>, /// Filters which are applied while finding matching rows pub filter: Option, - /// How the join is performed + /// How the join is performed (`OUTER`, `INNER`, etc) pub join_type: JoinType, - /// The schema once the join is applied + /// The output schema for the join schema: SchemaRef, /// Build-side data left_fut: OnceAsync, - /// Shares the `RandomState` for the hashing algorithm + /// Shared the `RandomState` for the hashing algorithm random_state: RandomState, /// Output order output_order: Option>, @@ -110,12 +220,16 @@ pub struct HashJoinExec { metrics: ExecutionPlanMetricsSet, /// Information of index and left / right placement of columns column_indices: Vec, - /// If null_equals_null is true, null == null else null != null + /// Null matching behavior: If `null_equals_null` is true, rows that have + /// `null`s in both left and right equijoin columns will be matched. + /// Otherwise, rows that have `null`s in the join columns will not be + /// matched and thus will not appear in the output. pub null_equals_null: bool, } impl HashJoinExec { /// Tries to create a new [HashJoinExec]. + /// /// # Error /// This function errors when it is not possible to join the left and right sides on keys `on`. 
     pub fn try_new(

From 9b45967edc6dba312ea223464dad3e66604d2095 Mon Sep 17 00:00:00 2001
From: Alex Huang
Date: Sun, 29 Oct 2023 22:45:34 +0100
Subject: [PATCH 32/32] chore: clean useless clone based on clippy (#7973)

---
 .../core/src/physical_optimizer/combine_partial_final_agg.rs | 2 +-
 datafusion/core/src/physical_optimizer/enforce_distribution.rs | 2 +-
 datafusion/core/src/physical_optimizer/topk_aggregation.rs | 2 +-
 datafusion/physical-expr/src/array_expressions.rs | 2 +-
 datafusion/substrait/src/logical_plan/producer.rs | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/datafusion/core/src/physical_optimizer/combine_partial_final_agg.rs b/datafusion/core/src/physical_optimizer/combine_partial_final_agg.rs
index 838ae613683e..2c4e929788df 100644
--- a/datafusion/core/src/physical_optimizer/combine_partial_final_agg.rs
+++ b/datafusion/core/src/physical_optimizer/combine_partial_final_agg.rs
@@ -93,7 +93,7 @@ impl PhysicalOptimizerRule for CombinePartialFinalAggregate {
                         input_agg_exec.filter_expr().to_vec(),
                         input_agg_exec.order_by_expr().to_vec(),
                         input_agg_exec.input().clone(),
-                        input_agg_exec.input_schema().clone(),
+                        input_agg_exec.input_schema(),
                     )
                     .ok()
                     .map(Arc::new)
diff --git a/datafusion/core/src/physical_optimizer/enforce_distribution.rs b/datafusion/core/src/physical_optimizer/enforce_distribution.rs
index 7b91dce32aa9..12df9efbbca6 100644
--- a/datafusion/core/src/physical_optimizer/enforce_distribution.rs
+++ b/datafusion/core/src/physical_optimizer/enforce_distribution.rs
@@ -554,7 +554,7 @@ fn reorder_aggregate_keys(
                 agg_exec.filter_expr().to_vec(),
                 agg_exec.order_by_expr().to_vec(),
                 partial_agg,
-                agg_exec.input_schema().clone(),
+                agg_exec.input_schema(),
             )?);
 
             // Need to create a new projection to change the expr ordering back
diff --git a/datafusion/core/src/physical_optimizer/topk_aggregation.rs b/datafusion/core/src/physical_optimizer/topk_aggregation.rs
index 572e796a8ba7..e0a8da82e35f 100644
--- a/datafusion/core/src/physical_optimizer/topk_aggregation.rs
+++ b/datafusion/core/src/physical_optimizer/topk_aggregation.rs
@@ -75,7 +75,7 @@ impl TopKAggregation {
             aggr.filter_expr().to_vec(),
             aggr.order_by_expr().to_vec(),
             aggr.input().clone(),
-            aggr.input_schema().clone(),
+            aggr.input_schema(),
         )
         .expect("Unable to copy Aggregate!")
         .with_limit(Some(limit));
diff --git a/datafusion/physical-expr/src/array_expressions.rs b/datafusion/physical-expr/src/array_expressions.rs
index 7077f8b59860..84fd301b84de 100644
--- a/datafusion/physical-expr/src/array_expressions.rs
+++ b/datafusion/physical-expr/src/array_expressions.rs
@@ -811,7 +811,7 @@ fn concat_internal(args: &[ArrayRef]) -> Result<ArrayRef> {
         }
     }
     // Assume all arrays have the same data type
-    let data_type = list_arrays[0].value_type().clone();
+    let data_type = list_arrays[0].value_type();
     let buffer = valid.finish();
     let elements = arrays
diff --git a/datafusion/substrait/src/logical_plan/producer.rs b/datafusion/substrait/src/logical_plan/producer.rs
index 757bddf9fe58..e3c6f94d43d5 100644
--- a/datafusion/substrait/src/logical_plan/producer.rs
+++ b/datafusion/substrait/src/logical_plan/producer.rs
@@ -326,7 +326,7 @@ pub fn to_substrait_rel(
                 left: Some(left),
                 right: Some(right),
                 r#type: join_type as i32,
-                expression: join_expr.clone(),
+                expression: join_expr,
                 post_join_filter: None,
                 advanced_extension: None,
             }))),
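The clippy cleanup in this last patch removes `.clone()` calls on values that the callee already returns by value: for example, `AggregateExec::input_schema()` hands back an owned `SchemaRef` (an `Arc<Schema>`), so cloning the result again is redundant. A minimal, self-contained sketch of that pattern follows; the `Schema` and `AggregateExec` definitions here are simplified stand-ins for illustration, not DataFusion's real types.

```rust
use std::sync::Arc;

// Simplified stand-ins for DataFusion's `Schema` / `SchemaRef` (= `Arc<Schema>`).
#[derive(Debug)]
struct Schema {
    fields: Vec<String>,
}
type SchemaRef = Arc<Schema>;

struct AggregateExec {
    input_schema: SchemaRef,
}

impl AggregateExec {
    // Mirrors the shape of the real getter: it already returns an owned `Arc`,
    // so the caller receives its own handle to the schema.
    fn input_schema(&self) -> SchemaRef {
        Arc::clone(&self.input_schema)
    }
}

fn main() {
    let exec = AggregateExec {
        input_schema: Arc::new(Schema {
            fields: vec!["a".to_string(), "b".to_string()],
        }),
    };

    // Before the patch the call sites read `exec.input_schema().clone()`, which
    // clones an `Arc` the caller already owns; clippy flags this as a redundant
    // clone. Passing the returned value along directly is enough:
    let schema = exec.input_schema();

    println!(
        "{} fields, {} strong references",
        schema.fields.len(),
        Arc::strong_count(&schema)
    );
}
```

The same reasoning applies to the other hunks in the patch: `value_type()` already yields an owned value, and `join_expr` can simply be moved into the struct literal instead of being cloned first.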