From 34906d63f82c65b70f232b49a532bf649840a453 Mon Sep 17 00:00:00 2001 From: Nick Salerni Date: Wed, 21 Feb 2024 21:24:00 -0800 Subject: [PATCH] [FEAT] Add str.upper() function * Adding the `upper` function to match https://ibis-project.org/reference/expression-strings#ibis.expr.types.strings.StringValue.upper * Added tests showing example usage * Refactor tests for str.lower to be a single parameterized test Closes #1920 --- daft/daft.pyi | 2 + daft/expressions/expressions.py | 11 +++++ daft/series.py | 4 ++ docs/source/api_docs/expressions.rst | 1 + src/daft-core/src/array/ops/utf8.rs | 15 +++++- src/daft-core/src/python/series.rs | 4 ++ src/daft-core/src/series/ops/utf8.rs | 10 ++++ src/daft-dsl/src/functions/utf8/mod.rs | 11 +++++ src/daft-dsl/src/functions/utf8/upper.rs | 46 ++++++++++++++++++ src/daft-dsl/src/python.rs | 5 ++ tests/expressions/typing/test_str.py | 10 ++++ tests/series/test_utf8_ops.py | 60 ++++++++++++++---------- tests/table/utf8/test_upper.py | 10 ++++ 13 files changed, 163 insertions(+), 26 deletions(-) create mode 100644 src/daft-dsl/src/functions/utf8/upper.rs create mode 100644 tests/table/utf8/test_upper.py diff --git a/daft/daft.pyi b/daft/daft.pyi index 46fc5d9448..fc2ddbd8ac 100644 --- a/daft/daft.pyi +++ b/daft/daft.pyi @@ -910,6 +910,7 @@ class PyExpr: def utf8_split(self, pattern: PyExpr) -> PyExpr: ... def utf8_length(self) -> PyExpr: ... def utf8_lower(self) -> PyExpr: ... + def utf8_upper(self) -> PyExpr: ... def image_decode(self) -> PyExpr: ... def image_encode(self, image_format: ImageFormat) -> PyExpr: ... def image_resize(self, w: int, h: int) -> PyExpr: ... @@ -986,6 +987,7 @@ class PySeries: def utf8_split(self, pattern: PySeries) -> PySeries: ... def utf8_length(self) -> PySeries: ... def utf8_lower(self) -> PySeries: ... + def utf8_upper(self) -> PySeries: ... def is_nan(self) -> PySeries: ... def dt_date(self) -> PySeries: ... def dt_day(self) -> PySeries: ... 
diff --git a/daft/expressions/expressions.py b/daft/expressions/expressions.py index 2901ce690d..964903a97d 100644 --- a/daft/expressions/expressions.py +++ b/daft/expressions/expressions.py @@ -743,6 +743,17 @@ def lower(self) -> Expression: """ return Expression._from_pyexpr(self._expr.utf8_lower()) + def upper(self) -> Expression: + """Convert UTF-8 string to all uppercase + + Example: + >>> col("x").str.upper() + + Returns: + Expression: a String expression which is `self` uppercased + """ + return Expression._from_pyexpr(self._expr.utf8_upper()) + class ExpressionListNamespace(ExpressionNamespace): def join(self, delimiter: str | Expression) -> Expression: diff --git a/daft/series.py b/daft/series.py index a46c0fdd86..f776dd8260 100644 --- a/daft/series.py +++ b/daft/series.py @@ -594,6 +594,10 @@ def lower(self) -> Series: assert self._series is not None return Series._from_pyseries(self._series.utf8_lower()) + def upper(self) -> Series: + assert self._series is not None + return Series._from_pyseries(self._series.utf8_upper()) + class SeriesDateNamespace(SeriesNamespace): def date(self) -> Series: diff --git a/docs/source/api_docs/expressions.rst b/docs/source/api_docs/expressions.rst index d89f6c79ea..f0d5002846 100644 --- a/docs/source/api_docs/expressions.rst +++ b/docs/source/api_docs/expressions.rst @@ -99,6 +99,7 @@ The following methods are available under the ``expr.str`` attribute. Expression.str.length Expression.str.split Expression.str.lower + Expression.str.upper .. 
_api-expressions-temporal: diff --git a/src/daft-core/src/array/ops/utf8.rs b/src/daft-core/src/array/ops/utf8.rs index efc135532c..5f6e3eb127 100644 --- a/src/daft-core/src/array/ops/utf8.rs +++ b/src/daft-core/src/array/ops/utf8.rs @@ -3,7 +3,7 @@ use crate::{ datatypes::{BooleanArray, Field, UInt64Array, Utf8Array}, DataType, Series, }; -use arrow2::{self}; +use arrow2; use common_error::{DaftError, DaftResult}; @@ -159,6 +159,19 @@ impl Utf8Array { Ok(Utf8Array::from((self.name(), Box::new(arrow_result)))) } + pub fn upper(&self) -> DaftResult<Utf8Array> { + let self_arrow = self.as_arrow(); + let arrow_result = self_arrow + .iter() + .map(|val| { + let v = val?; + Some(v.to_uppercase()) + }) + .collect::<arrow2::array::Utf8Array<i64>>() + .with_validity(self_arrow.validity().cloned()); + Ok(Utf8Array::from((self.name(), Box::new(arrow_result)))) + } + fn binary_broadcasted_compare( &self, other: &Self, diff --git a/src/daft-core/src/python/series.rs b/src/daft-core/src/python/series.rs index dc6eb2c5b4..78d4dd9997 100644 --- a/src/daft-core/src/python/series.rs +++ b/src/daft-core/src/python/series.rs @@ -268,6 +268,10 @@ impl PySeries { Ok(self.series.utf8_lower()?.into()) } + pub fn utf8_upper(&self) -> PyResult<Self> { + Ok(self.series.utf8_upper()?.into()) + } + pub fn is_nan(&self) -> PyResult<Self> { Ok(self.series.is_nan()?.into()) } diff --git a/src/daft-core/src/series/ops/utf8.rs b/src/daft-core/src/series/ops/utf8.rs index 951ff6b0d7..d29edafd41 100644 --- a/src/daft-core/src/series/ops/utf8.rs +++ b/src/daft-core/src/series/ops/utf8.rs @@ -60,4 +60,14 @@ impl Series { ))), } } + + pub fn utf8_upper(&self) -> DaftResult<Series> { + match self.data_type() { + DataType::Utf8 => Ok(self.utf8()?.upper()?.into_series()), + DataType::Null => Ok(self.clone()), + dt => Err(DaftError::TypeError(format!( + "Upper not implemented for type {dt}" + ))), + } + } } diff --git a/src/daft-dsl/src/functions/utf8/mod.rs b/src/daft-dsl/src/functions/utf8/mod.rs index 967def3cfa..c4789ff212 100644 --- 
a/src/daft-dsl/src/functions/utf8/mod.rs +++ b/src/daft-dsl/src/functions/utf8/mod.rs @@ -4,6 +4,7 @@ mod length; mod lower; mod split; mod startswith; +mod upper; use contains::ContainsEvaluator; use endswith::EndswithEvaluator; @@ -12,6 +13,7 @@ use lower::LowerEvaluator; use serde::{Deserialize, Serialize}; use split::SplitEvaluator; use startswith::StartswithEvaluator; +use upper::UpperEvaluator; use crate::Expr; @@ -25,6 +27,7 @@ pub enum Utf8Expr { Split, Length, Lower, + Upper, } impl Utf8Expr { @@ -38,6 +41,7 @@ impl Utf8Expr { Split => &SplitEvaluator {}, Length => &LengthEvaluator {}, Lower => &LowerEvaluator {}, + Upper => &UpperEvaluator {}, } } } @@ -83,3 +87,10 @@ pub fn lower(data: &Expr) -> Expr { inputs: vec![data.clone()], } } + +pub fn upper(data: &Expr) -> Expr { + Expr::Function { + func: super::FunctionExpr::Utf8(Utf8Expr::Upper), + inputs: vec![data.clone()], + } +} diff --git a/src/daft-dsl/src/functions/utf8/upper.rs b/src/daft-dsl/src/functions/utf8/upper.rs new file mode 100644 index 0000000000..6c7967561e --- /dev/null +++ b/src/daft-dsl/src/functions/utf8/upper.rs @@ -0,0 +1,46 @@ +use daft_core::{ + datatypes::{DataType, Field}, + schema::Schema, + series::Series, +}; + +use crate::Expr; +use common_error::{DaftError, DaftResult}; + +use super::super::FunctionEvaluator; + +pub(super) struct UpperEvaluator {} + +impl FunctionEvaluator for UpperEvaluator { + fn fn_name(&self) -> &'static str { + "upper" + } + + fn to_field(&self, inputs: &[Expr], schema: &Schema, _: &Expr) -> DaftResult<Field> { + match inputs { + [data] => match data.to_field(schema) { + Ok(data_field) => match &data_field.dtype { + DataType::Utf8 => Ok(Field::new(data_field.name, DataType::Utf8)), + _ => Err(DaftError::TypeError(format!( + "Expects input to upper to be utf8, but received {data_field}", + ))), + }, + Err(e) => Err(e), + }, + _ => Err(DaftError::SchemaMismatch(format!( + "Expected 1 input args, got {}", + inputs.len() + ))), + } + } + + fn evaluate(&self, 
inputs: &[Series], _: &Expr) -> DaftResult<Series> { + match inputs { + [data] => data.utf8_upper(), + _ => Err(DaftError::ValueError(format!( + "Expected 1 input args, got {}", + inputs.len() + ))), + } + } +} diff --git a/src/daft-dsl/src/python.rs b/src/daft-dsl/src/python.rs index fdd8c09401..f61583f411 100644 --- a/src/daft-dsl/src/python.rs +++ b/src/daft-dsl/src/python.rs @@ -338,6 +338,11 @@ impl PyExpr { Ok(lower(&self.expr).into()) } + pub fn utf8_upper(&self) -> PyResult<Self> { + use crate::functions::utf8::upper; + Ok(upper(&self.expr).into()) + } + pub fn image_decode(&self) -> PyResult<Self> { use crate::functions::image::decode; Ok(decode(&self.expr).into()) diff --git a/tests/expressions/typing/test_str.py b/tests/expressions/typing/test_str.py index da31ecbfff..1018dcff4e 100644 --- a/tests/expressions/typing/test_str.py +++ b/tests/expressions/typing/test_str.py @@ -53,3 +53,13 @@ def test_str_lower(): run_kernel=s.str.lower, resolvable=True, ) + + +def test_str_upper(): + s = Series.from_arrow(pa.array(["Foo", "BarBaz", "quux"]), name="arg") + assert_typing_resolve_vs_runtime_behavior( + data=[s], + expr=col(s.name()).str.upper(), + run_kernel=s.str.upper, + resolvable=True, + ) diff --git a/tests/series/test_utf8_ops.py b/tests/series/test_utf8_ops.py index 203f584afb..e1975f0359 100644 --- a/tests/series/test_utf8_ops.py +++ b/tests/series/test_utf8_ops.py @@ -224,31 +224,41 @@ def test_series_utf8_length_all_null() -> None: assert result.to_pylist() == [None, None, None] -def test_series_utf8_lower() -> None: - s = Series.from_arrow(pa.array(["Foo", "BarBaz", "QUUX"])) - result = s.str.lower() - assert result.to_pylist() == ["foo", "barbaz", "quux"] - - -def test_series_utf8_lower_with_nulls() -> None: - s = Series.from_arrow(pa.array(["Foo", None, "BarBaz", "QUUX"])) - result = s.str.lower() - assert result.to_pylist() == ["foo", None, "barbaz", "quux"] - - -def test_series_utf8_lower_empty() -> None: - s = Series.from_arrow(pa.array([], type=pa.string())) - 
result = s.str.lower() - assert result.to_pylist() == [] - - -def test_series_utf8_lower_all_null() -> None: - s = Series.from_arrow(pa.array([None, None, None])) +@pytest.mark.parametrize( + ["data", "expected"], + [ + (["Foo", "BarBaz", "QUUX"], ["foo", "barbaz", "quux"]), + # With at least one null + (["Foo", None, "BarBaz", "QUUX"], ["foo", None, "barbaz", "quux"]), + # With all nulls + ([None] * 4, [None] * 4), + # With at least one numeric string + (["Foo", "BarBaz", "QUUX", "2"], ["foo", "barbaz", "quux", "2"]), + # With all numeric strings + (["1", "2", "3"], ["1", "2", "3"]), + ], +) +def test_series_utf8_lower(data, expected) -> None: + s = Series.from_arrow(pa.array(data)) result = s.str.lower() - assert result.to_pylist() == [None, None, None] + assert result.to_pylist() == expected -def test_series_utf8_lower_all_numeric_strs() -> None: - s = Series.from_arrow(pa.array(["1", "2", "3"])) - result = s.str.lower() - assert result.to_pylist() == ["1", "2", "3"] +@pytest.mark.parametrize( + ["data", "expected"], + [ + (["Foo", "BarBaz", "quux"], ["FOO", "BARBAZ", "QUUX"]), + # With at least one null + (["Foo", None, "BarBaz", "quux"], ["FOO", None, "BARBAZ", "QUUX"]), + # With all nulls + ([None] * 4, [None] * 4), + # With at least one numeric string + (["Foo", "BarBaz", "quux", "2"], ["FOO", "BARBAZ", "QUUX", "2"]), + # With all numeric strings + (["1", "2", "3"], ["1", "2", "3"]), + ], +) +def test_series_utf8_upper(data, expected) -> None: + s = Series.from_arrow(pa.array(data)) + result = s.str.upper() + assert result.to_pylist() == expected diff --git a/tests/table/utf8/test_upper.py b/tests/table/utf8/test_upper.py new file mode 100644 index 0000000000..812afdf7d3 --- /dev/null +++ b/tests/table/utf8/test_upper.py @@ -0,0 +1,10 @@ +from __future__ import annotations + +from daft.expressions import col +from daft.table import MicroPartition + + +def test_utf8_upper(): + table = MicroPartition.from_pydict({"col": ["Foo", None, "BarBaz", "quux", "1"]}) 
+ result = table.eval_expression_list([col("col").str.upper()]) + assert result.to_pydict() == {"col": ["FOO", None, "BARBAZ", "QUUX", "1"]}