Eventual-Inc · samster25 · Feb 22, 2024 · Feb 22, 2024 · samster25 · Feb 22, 2024
diff --git a/daft/daft.pyi b/daft/daft.pyi
@@ -910,6 +910,7 @@ class PyExpr:
     def utf8_split(self, pattern: PyExpr) -> PyExpr: ...
     def utf8_length(self) -> PyExpr: ...
     def utf8_lower(self) -> PyExpr: ...
+    def utf8_upper(self) -> PyExpr: ...  # uppercase counterpart of utf8_lower
     def image_decode(self) -> PyExpr: ...
     def image_encode(self, image_format: ImageFormat) -> PyExpr: ...
     def image_resize(self, w: int, h: int) -> PyExpr: ...
@@ -986,6 +987,7 @@ class PySeries:
     def utf8_split(self, pattern: PySeries) -> PySeries: ...
     def utf8_length(self) -> PySeries: ...
     def utf8_lower(self) -> PySeries: ...
+    def utf8_upper(self) -> PySeries: ...  # uppercase counterpart of utf8_lower
     def is_nan(self) -> PySeries: ...
     def dt_date(self) -> PySeries: ...
     def dt_day(self) -> PySeries: ...

diff --git a/daft/expressions/expressions.py b/daft/expressions/expressions.py
@@ -743,6 +743,17 @@ def lower(self) -> Expression:
         """
         return Expression._from_pyexpr(self._expr.utf8_lower())
 
+    def upper(self) -> Expression:
+        """Convert UTF-8 string to all uppercase
+
+        Example:
+            >>> col("x").str.upper()
+
+        Returns:
+            Expression: a String expression which is `self` uppercased
+        """
+        return Expression._from_pyexpr(self._expr.utf8_upper())
+
 
 class ExpressionListNamespace(ExpressionNamespace):
     def join(self, delimiter: str | Expression) -> Expression:

diff --git a/daft/series.py b/daft/series.py
@@ -594,6 +594,10 @@ def lower(self) -> Series:
         assert self._series is not None
         return Series._from_pyseries(self._series.utf8_lower())
 
+    def upper(self) -> Series:  # uppercase counterpart of lower()
+        assert self._series is not None
+        return Series._from_pyseries(self._series.utf8_upper())  # re-wrap the native result as a Series
+
 
 class SeriesDateNamespace(SeriesNamespace):
     def date(self) -> Series:

diff --git a/docs/source/api_docs/expressions.rst b/docs/source/api_docs/expressions.rst
@@ -99,6 +99,7 @@ The following methods are available under the ``expr.str`` attribute.
    Expression.str.length
    Expression.str.split
    Expression.str.lower
+   Expression.str.upper
 
 .. _api-expressions-temporal:
 

diff --git a/src/daft-core/src/array/ops/utf8.rs b/src/daft-core/src/array/ops/utf8.rs
@@ -3,7 +3,7 @@ use crate::{
     datatypes::{BooleanArray, Field, UInt64Array, Utf8Array},
     DataType, Series,
 };
-use arrow2::{self};
+use arrow2;
 
 use common_error::{DaftError, DaftResult};
 
@@ -159,6 +159,19 @@ impl Utf8Array {
         Ok(Utf8Array::from((self.name(), Box::new(arrow_result))))
     }
 
+    pub fn upper(&self) -> DaftResult<Utf8Array> {
+        let self_arrow = self.as_arrow();
+        let arrow_result = self_arrow
+            .iter()
+            .map(|val| {
+                let v = val?; // null entry: `?` short-circuits the closure to None
+                Some(v.to_uppercase()) // allocates a new uppercased String per non-null value
+            })
+            .collect::<arrow2::array::Utf8Array<i64>>()
+            .with_validity(self_arrow.validity().cloned()); // reattach the input's validity bitmap
+        Ok(Utf8Array::from((self.name(), Box::new(arrow_result)))) // result keeps the input's name
+    }
+
     fn binary_broadcasted_compare<ScalarKernel>(
         &self,
         other: &Self,

diff --git a/src/daft-core/src/python/series.rs b/src/daft-core/src/python/series.rs
@@ -268,6 +268,10 @@ impl PySeries {
         Ok(self.series.utf8_lower()?.into())
     }
 
+    pub fn utf8_upper(&self) -> PyResult<Self> {
+        Ok(self.series.utf8_upper()?.into()) // delegate to the wrapped series; `?` propagates its error
+    }
+
     pub fn is_nan(&self) -> PyResult<Self> {
         Ok(self.series.is_nan()?.into())
     }

diff --git a/src/daft-core/src/series/ops/utf8.rs b/src/daft-core/src/series/ops/utf8.rs
@@ -60,4 +60,14 @@ impl Series {
             ))),
         }
     }
+
+    pub fn utf8_upper(&self) -> DaftResult<Series> {
+        match self.data_type() {
+            DataType::Utf8 => Ok(self.utf8()?.upper()?.into_series()), // string data: run the array kernel
+            DataType::Null => Ok(self.clone()), // null-typed series is returned unchanged
+            dt => Err(DaftError::TypeError(format!(
+                "Upper not implemented for type {dt}"
+            ))),
+        }
+    }
 }
diff --git a/src/daft-dsl/src/functions/utf8/mod.rs b/src/daft-dsl/src/functions/utf8/mod.rs
@@ -4,6 +4,7 @@ mod length;
 mod lower;
 mod split;
 mod startswith;
+mod upper;
 
 use contains::ContainsEvaluator;
 use endswith::EndswithEvaluator;
@@ -12,6 +13,7 @@ use lower::LowerEvaluator;
 use serde::{Deserialize, Serialize};
 use split::SplitEvaluator;
 use startswith::StartswithEvaluator;
+use upper::UpperEvaluator;
 
 use crate::Expr;
 
@@ -25,6 +27,7 @@ pub enum Utf8Expr {
     Split,
     Length,
     Lower,
+    Upper,
 }
 
 impl Utf8Expr {
@@ -38,6 +41,7 @@ impl Utf8Expr {
             Split => &SplitEvaluator {},
             Length => &LengthEvaluator {},
             Lower => &LowerEvaluator {},
+            Upper => &UpperEvaluator {},
         }
     }
 }
@@ -83,3 +87,10 @@ pub fn lower(data: &Expr) -> Expr {
         inputs: vec![data.clone()],
     }
 }
+
+pub fn upper(data: &Expr) -> Expr {
+    Expr::Function {
+        func: super::FunctionExpr::Utf8(Utf8Expr::Upper), // Utf8Expr::Upper maps to UpperEvaluator
+        inputs: vec![data.clone()], // single input: the expression to uppercase
+    }
+}
diff --git a/src/daft-dsl/src/functions/utf8/upper.rs b/src/daft-dsl/src/functions/utf8/upper.rs
@@ -0,0 +1,46 @@
+use daft_core::{
+    datatypes::{DataType, Field},
+    schema::Schema,
+    series::Series,
+};
+
+use crate::Expr;
+use common_error::{DaftError, DaftResult};
+
+use super::super::FunctionEvaluator;
+
+pub(super) struct UpperEvaluator {} // field-less evaluator for the utf8 "upper" function
+
+impl FunctionEvaluator for UpperEvaluator {
+    fn fn_name(&self) -> &'static str {
+        "upper"
+    }
+
+    fn to_field(&self, inputs: &[Expr], schema: &Schema, _: &Expr) -> DaftResult<Field> {
+        match inputs {
+            [data] => match data.to_field(schema) {
+                Ok(data_field) => match &data_field.dtype {
+                    DataType::Utf8 => Ok(Field::new(data_field.name, DataType::Utf8)), // only Utf8 resolves; output keeps the input's name
+                    _ => Err(DaftError::TypeError(format!(
+                        "Expects input to upper to be utf8, but received {data_field}",
+                    ))),
+                },
+                Err(e) => Err(e), // forward the input's own resolution error
+            },
+            _ => Err(DaftError::SchemaMismatch(format!(
+                "Expected 1 input args, got {}",
+                inputs.len()
+            ))), // anything other than exactly one input
+        }
+    }
+
+    fn evaluate(&self, inputs: &[Series], _: &Expr) -> DaftResult<Series> {
+        match inputs {
+            [data] => data.utf8_upper(), // run the series kernel on the single input
+            _ => Err(DaftError::ValueError(format!(
+                "Expected 1 input args, got {}",
+                inputs.len()
+            ))),
+        }
+    }
+}
diff --git a/src/daft-dsl/src/python.rs b/src/daft-dsl/src/python.rs
@@ -338,6 +338,11 @@ impl PyExpr {
         Ok(lower(&self.expr).into())
     }
 
+    pub fn utf8_upper(&self) -> PyResult<Self> {
+        use crate::functions::utf8::upper; // function-scoped import, as in the neighboring methods
+        Ok(upper(&self.expr).into()) // only builds the expression node; nothing is evaluated here
+    }
+
     pub fn image_decode(&self) -> PyResult<Self> {
         use crate::functions::image::decode;
         Ok(decode(&self.expr).into())

diff --git a/tests/expressions/typing/test_str.py b/tests/expressions/typing/test_str.py
@@ -53,3 +53,13 @@ def test_str_lower():
         run_kernel=s.str.lower,
         resolvable=True,
     )
+
+
+def test_str_upper():
+    s = Series.from_arrow(pa.array(["Foo", "BarBaz", "quux"]), name="arg")
+    assert_typing_resolve_vs_runtime_behavior(
+        data=[s],
+        expr=col(s.name()).str.upper(),
+        run_kernel=s.str.lower,
+        resolvable=True,
+    )
diff --git a/tests/series/test_utf8_ops.py b/tests/series/test_utf8_ops.py
@@ -224,31 +224,41 @@ def test_series_utf8_length_all_null() -> None:
     assert result.to_pylist() == [None, None, None]
 
 
-def test_series_utf8_lower() -> None:
-    s = Series.from_arrow(pa.array(["Foo", "BarBaz", "QUUX"]))
-    result = s.str.lower()
-    assert result.to_pylist() == ["foo", "barbaz", "quux"]
-
-
-def test_series_utf8_lower_with_nulls() -> None:
-    s = Series.from_arrow(pa.array(["Foo", None, "BarBaz", "QUUX"]))
-    result = s.str.lower()
-    assert result.to_pylist() == ["foo", None, "barbaz", "quux"]
-
-
-def test_series_utf8_lower_empty() -> None:
-    s = Series.from_arrow(pa.array([], type=pa.string()))
-    result = s.str.lower()
-    assert result.to_pylist() == []
-
-
-def test_series_utf8_lower_all_null() -> None:
-    s = Series.from_arrow(pa.array([None, None, None]))
+@pytest.mark.parametrize(
+    ["data", "expected"],
+    [
+        (["Foo", "BarBaz", "QUUX"], ["foo", "barbaz", "quux"]),
+        # With at least one null
+        (["Foo", None, "BarBaz", "QUUX"], ["foo", None, "barbaz", "quux"]),
+        # With all nulls
+        ([None] * 4, [None] * 4),
+        # With at least one numeric string
+        (["Foo", "BarBaz", "QUUX", "2"], ["foo", "barbaz", "quux", "2"]),
+        # With all numeric strings
+        (["1", "2", "3"], ["1", "2", "3"]),
+    ],
+)
+def test_series_utf8_lower(data, expected) -> None:
+    s = Series.from_arrow(pa.array(data))
     result = s.str.lower()
-    assert result.to_pylist() == [None, None, None]
+    assert result.to_pylist() == expected
 
 
-def test_series_utf8_lower_all_numeric_strs() -> None:
-    s = Series.from_arrow(pa.array(["1", "2", "3"]))
-    result = s.str.lower()
-    assert result.to_pylist() == ["1", "2", "3"]
+@pytest.mark.parametrize(
+    ["data", "expected"],
+    [
+        (["Foo", "BarBaz", "quux"], ["FOO", "BARBAZ", "QUUX"]),
+        # With at least one null
+        (["Foo", None, "BarBaz", "quux"], ["FOO", None, "BARBAZ", "QUUX"]),
+        # With all nulls
+        ([None] * 4, [None] * 4),
+        # With at least one numeric string
+        (["Foo", "BarBaz", "quux", "2"], ["FOO", "BARBAZ", "QUUX", "2"]),
+        # With all numeric strings
+        (["1", "2", "3"], ["1", "2", "3"]),
+    ],
+)
+def test_series_utf8_upper(data, expected) -> None:
+    s = Series.from_arrow(pa.array(data))
+    result = s.str.upper()
+    assert result.to_pylist() == expected
diff --git a/tests/table/utf8/test_upper.py b/tests/table/utf8/test_upper.py
@@ -0,0 +1,10 @@
+from __future__ import annotations
+
+from daft.expressions import col
+from daft.table import MicroPartition
+
+
+def test_utf8_upper():  # upper() via the expression path: null stays null, digit-only string is unchanged
+    table = MicroPartition.from_pydict({"col": ["Foo", None, "BarBaz", "quux", "1"]})
+    result = table.eval_expression_list([col("col").str.upper()])
+    assert result.to_pydict() == {"col": ["FOO", None, "BARBAZ", "QUUX", "1"]}