[FEAT] Enable group by keys in aggregation expressions #3399

Merged (4 commits) on Nov 22, 2024
Changes from 3 commits
37 changes: 22 additions & 15 deletions src/daft-dsl/src/resolve_expr/mod.rs
@@ -3,7 +3,7 @@

use std::{
cmp::Ordering,
collections::{BinaryHeap, HashMap},
collections::{BinaryHeap, HashMap, HashSet},
sync::Arc,
};

@@ -208,7 +208,7 @@
}

/// Checks if an expression used in an aggregation is well formed.
Contributor:

nit

Suggested change:
/// Checks if an expression used in an aggregation is well formed.
/// Checks if an expression used in an aggregation is well-formed.

Adding a new line also makes it look better in rustdocs:
https://rust-lang.github.io/rust-clippy/master/index.html#too_long_first_doc_paragraph

/// Expressions for aggregations must be in the form (optional) non-agg expr <- agg exprs or literals <- non-agg exprs
/// Expressions for aggregations must be in the form (optional) non-agg expr <- [(agg exprs <- non-agg exprs) or literals or group by keys]
///
/// # Examples
///
@@ -217,19 +217,24 @@
/// - sum(col("a"))
/// - sum(col("a")) > 0
/// - sum(col("a")) - sum(col("b")) > sum(col("c"))
/// - sum(col("a")) + col("b") when "b" is a group by key
///
/// Not allowed:
/// - col("a")
/// - col("a") when "a" is not a group by key
/// - not an aggregation
/// - sum(col("a")) + col("b")
/// - not all branches are aggregations
fn has_single_agg_layer(expr: &ExprRef) -> bool {
match expr.as_ref() {
Expr::Agg(agg_expr) => !agg_expr.children().iter().any(has_agg),
Expr::Column(_) => false,
Expr::Literal(_) => true,
_ => expr.children().iter().all(has_single_agg_layer),
}
/// - sum(col("a")) + col("b") when "b" is not a group by key
/// - not all branches are aggregations, literals, or group by keys
fn has_single_agg_layer(expr: &ExprRef, groupby: &HashSet<ExprRef>) -> bool {
Contributor:

Could we move has_single_agg_layer and validate_expr_in_agg to be members of ExprResolver? It looks like they're only ever used as part of the expr resolving. (A sketch of this refactor appears after the end of this function's diff below.)

groupby.contains(expr)
|| match expr.as_ref() {
Expr::Agg(agg_expr) => !agg_expr.children().iter().any(has_agg),
Expr::Column(_) => false,

Codecov (codecov/patch) check warning on line 231 in src/daft-dsl/src/resolve_expr/mod.rs: added line #L231 was not covered by tests.
Expr::Literal(_) => true,
_ => expr
.children()
.iter()
.all(|e| has_single_agg_layer(e, groupby)),
}
Contributor:

Hmm, I'm not sure this function really tests "has a single agg layer" anymore. Maybe rename it, rethink the logic, or split it into two separate functions?

Also a potential bug (although I'm unsure it could even occur): if groupby contains an expr with multiple aggs, this returns true (as in a single agg) even though it isn't.

Member (author):

Good point, but I'm not really sure what to name it. I think the logic of the function itself is fairly straightforward, though. I'll give it a thought.

As for the bug: group bys should never have aggregations in them, and they will fail elsewhere in the code if they do. (A sketch of such a guard follows this file's diff below.)

}
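
The reviewer above suggests moving has_single_agg_layer and validate_expr_in_agg onto ExprResolver. That refactor is not part of this PR; a minimal sketch of the shape, assuming the items already in this module (Expr, ExprRef, has_agg) and the groupby: HashSet<ExprRef> field added further down in this diff, could look like this:

```rust
impl ExprResolver {
    /// Same logic as the free function above, but the group-by keys are read
    /// from the resolver itself instead of being threaded through as an argument.
    /// (Whether to also rename it, as raised in the thread above, is left open.)
    fn has_single_agg_layer(&self, expr: &ExprRef) -> bool {
        self.groupby.contains(expr)
            || match expr.as_ref() {
                Expr::Agg(agg_expr) => !agg_expr.children().iter().any(has_agg),
                Expr::Column(_) => false,
                Expr::Literal(_) => true,
                _ => expr
                    .children()
                    .iter()
                    .all(|e| self.has_single_agg_layer(e)),
            }
    }
}
```

validate_expr_in_agg would become a &self method in the same way, dropping its explicit groupby parameter.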

fn convert_udfs_to_map_groups(expr: &ExprRef) -> ExprRef {
@@ -257,10 +262,10 @@
Ok(expr)
}

fn validate_expr_in_agg(expr: ExprRef) -> DaftResult<ExprRef> {
fn validate_expr_in_agg(expr: ExprRef, groupby: &HashSet<ExprRef>) -> DaftResult<ExprRef> {
let converted_expr = convert_udfs_to_map_groups(&expr);

if !has_single_agg_layer(&converted_expr) {
if !has_single_agg_layer(&converted_expr, groupby) {
return Err(DaftError::ValueError(format!(
"Expressions in aggregations must be composed of non-nested aggregation expressions, got {expr}"
)));
@@ -278,6 +283,8 @@
allow_stateful_udf: bool,
#[builder(default)]
in_agg_context: bool,
#[builder(default)]
groupby: HashSet<ExprRef>,
universalmind303 marked this conversation as resolved.
}

impl ExprResolver {
@@ -289,7 +296,7 @@
}

let validated_expr = if self.in_agg_context {
validate_expr_in_agg(expr)
validate_expr_in_agg(expr, &self.groupby)
} else {
validate_expr(expr)
}?;
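The author notes in the thread above that group-by expressions should never themselves contain aggregations and are rejected elsewhere in the codebase. That existing check is not shown in this diff; purely as an illustration, such a guard could look like the hypothetical helper below (validate_groupby_keys is an invented name; the types and helpers are those already used in this module):

```rust
/// Hypothetical guard, for illustration only: reject group-by keys that contain
/// aggregations, so an expression like sum(col("a")) can never satisfy the
/// `groupby.contains(expr)` short-circuit in has_single_agg_layer.
fn validate_groupby_keys(groupby: &HashSet<ExprRef>) -> DaftResult<()> {
    if let Some(bad) = groupby.iter().find(|e| has_agg(e)) {
        return Err(DaftError::ValueError(format!(
            "Group by expressions cannot contain aggregations, got {bad}"
        )));
    }
    Ok(())
}
```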
7 changes: 5 additions & 2 deletions src/daft-logical-plan/src/ops/agg.rs
@@ -1,4 +1,4 @@
use std::sync::Arc;
use std::{collections::HashSet, sync::Arc};

use daft_dsl::{ExprRef, ExprResolver};
use daft_schema::schema::{Schema, SchemaRef};
@@ -37,7 +37,10 @@ impl Aggregate {
let upstream_schema = input.schema();

let groupby_resolver = ExprResolver::default();
let agg_resolver = ExprResolver::builder().in_agg_context(true).build();
let agg_resolver = ExprResolver::builder()
.in_agg_context(true)
.groupby(HashSet::from_iter(groupby.clone()))
.build();

universalmind303 marked this conversation as resolved.
let (groupby, groupby_fields) = groupby_resolver
.resolve(groupby, &upstream_schema)
7 changes: 5 additions & 2 deletions src/daft-logical-plan/src/ops/pivot.rs
@@ -1,4 +1,4 @@
use std::sync::Arc;
use std::{collections::HashSet, sync::Arc};

use common_error::DaftError;
use daft_core::prelude::*;
@@ -35,7 +35,10 @@ impl Pivot {
let upstream_schema = input.schema();

let expr_resolver = ExprResolver::default();
let agg_resolver = ExprResolver::builder().in_agg_context(true).build();
let agg_resolver = ExprResolver::builder()
.in_agg_context(true)
.groupby(HashSet::from_iter(group_by.clone()))
.build();

let (group_by, group_by_fields) = expr_resolver
.resolve(group_by, &upstream_schema)
30 changes: 30 additions & 0 deletions tests/dataframe/test_aggregations.py
@@ -562,3 +562,33 @@ def test_agg_with_literal_groupby(make_df, repartition_nparts, with_morsel_size)
"sum_plus_1": [7, 10, 13],
"1_plus_sum": [9, 12, 15],
}


@pytest.mark.parametrize("repartition_nparts", [1, 2, 4])
def test_agg_with_groupby_key_in_agg(make_df, repartition_nparts, with_morsel_size):
daft_df = make_df(
{
"group": [1, 1, 1, 2, 2, 2, 3, 3, 3],
"id": [1, 2, 3, 2, 3, 4, 3, 4, 5],
"values": [4, 5, 6, 5, 6, 7, 6, 7, 8],
},
repartition=repartition_nparts,
)

daft_df = (
daft_df.groupby("group")
.agg(
col("group").alias("group_alias"),
(col("group") + 1).alias("group_plus_1"),
(col("id").sum() + col("group")).alias("id_plus_group"),
)
.sort("group")
)

res = daft_df.to_pydict()
assert res == {
"group": [1, 2, 3],
"group_alias": [1, 2, 3],
"group_plus_1": [2, 3, 4],
"id_plus_group": [7, 11, 15],
}