Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[FEAT] Add str.upper() function #1942

Merged
merged 1 commit into from
Feb 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions daft/daft.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -910,6 +910,7 @@ class PyExpr:
def utf8_split(self, pattern: PyExpr) -> PyExpr: ...
def utf8_length(self) -> PyExpr: ...
def utf8_lower(self) -> PyExpr: ...
# Stub for the `utf8_upper` binding that `Expression.str.upper()` calls on its underlying expression.
def utf8_upper(self) -> PyExpr: ...
def image_decode(self) -> PyExpr: ...
def image_encode(self, image_format: ImageFormat) -> PyExpr: ...
def image_resize(self, w: int, h: int) -> PyExpr: ...
Expand Down Expand Up @@ -986,6 +987,7 @@ class PySeries:
def utf8_split(self, pattern: PySeries) -> PySeries: ...
def utf8_length(self) -> PySeries: ...
def utf8_lower(self) -> PySeries: ...
# Stub for the `utf8_upper` binding that `Series.str.upper()` calls on its underlying series.
def utf8_upper(self) -> PySeries: ...
def is_nan(self) -> PySeries: ...
def dt_date(self) -> PySeries: ...
def dt_day(self) -> PySeries: ...
Expand Down
11 changes: 11 additions & 0 deletions daft/expressions/expressions.py
Original file line number Diff line number Diff line change
Expand Up @@ -743,6 +743,17 @@ def lower(self) -> Expression:
"""
return Expression._from_pyexpr(self._expr.utf8_lower())

def upper(self) -> Expression:
    """Convert every string in this UTF-8 expression to uppercase.

    Example:
        >>> col("x").str.upper()

    Returns:
        Expression: a String expression which is `self` uppercased
    """
    uppercased = self._expr.utf8_upper()
    return Expression._from_pyexpr(uppercased)


class ExpressionListNamespace(ExpressionNamespace):
def join(self, delimiter: str | Expression) -> Expression:
Expand Down
4 changes: 4 additions & 0 deletions daft/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -594,6 +594,10 @@ def lower(self) -> Series:
assert self._series is not None
return Series._from_pyseries(self._series.utf8_lower())

def upper(self) -> Series:
    """Return a new Series produced by the underlying series' `utf8_upper` kernel."""
    pyseries = self._series
    assert pyseries is not None
    return Series._from_pyseries(pyseries.utf8_upper())


class SeriesDateNamespace(SeriesNamespace):
def date(self) -> Series:
Expand Down
1 change: 1 addition & 0 deletions docs/source/api_docs/expressions.rst
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@ The following methods are available under the ``expr.str`` attribute.
Expression.str.length
Expression.str.split
Expression.str.lower
Expression.str.upper

.. _api-expressions-temporal:

Expand Down
15 changes: 14 additions & 1 deletion src/daft-core/src/array/ops/utf8.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ use crate::{
datatypes::{BooleanArray, Field, UInt64Array, Utf8Array},
DataType, Series,
};
use arrow2::{self};
use arrow2;

use common_error::{DaftError, DaftResult};

Expand Down Expand Up @@ -159,6 +159,19 @@ impl Utf8Array {
Ok(Utf8Array::from((self.name(), Box::new(arrow_result))))
}

/// Returns a new array in which every non-null string is converted to uppercase.
///
/// Null entries stay null, and the result is built under this array's name.
pub fn upper(&self) -> DaftResult<Utf8Array> {
    let source = self.as_arrow();
    // `Option::map` leaves null slots as `None`, so only real values are uppercased.
    let uppercased: arrow2::array::Utf8Array<i64> = source
        .iter()
        .map(|maybe_str| maybe_str.map(str::to_uppercase))
        .collect();
    // Re-apply the input's validity bitmap to the freshly built array.
    let uppercased = uppercased.with_validity(source.validity().cloned());
    Ok(Utf8Array::from((self.name(), Box::new(uppercased))))
}

fn binary_broadcasted_compare<ScalarKernel>(
&self,
other: &Self,
Expand Down
4 changes: 4 additions & 0 deletions src/daft-core/src/python/series.rs
Original file line number Diff line number Diff line change
Expand Up @@ -268,6 +268,10 @@ impl PySeries {
Ok(self.series.utf8_lower()?.into())
}

/// Uppercases the wrapped series by delegating to its `utf8_upper` and converts the
/// result back into `Self`; any error from the call is propagated through `?`.
pub fn utf8_upper(&self) -> PyResult<Self> {
    let uppercased = self.series.utf8_upper()?;
    Ok(uppercased.into())
}

/// Delegates to `is_nan` on the wrapped series and converts the result back into `Self`;
/// any error from the call is propagated through `?`.
pub fn is_nan(&self) -> PyResult<Self> {
    let result = self.series.is_nan()?;
    Ok(result.into())
}
Expand Down
10 changes: 10 additions & 0 deletions src/daft-core/src/series/ops/utf8.rs
Original file line number Diff line number Diff line change
Expand Up @@ -60,4 +60,14 @@ impl Series {
))),
}
}

/// Uppercases a `Utf8` series; a `Null` series is returned as an unchanged clone.
///
/// # Errors
/// Returns `DaftError::TypeError` for every other data type, and propagates any
/// error raised while downcasting to or uppercasing the UTF-8 array.
pub fn utf8_upper(&self) -> DaftResult<Series> {
    match self.data_type() {
        // Nothing to uppercase in an all-null series.
        DataType::Null => Ok(self.clone()),
        DataType::Utf8 => {
            let uppercased = self.utf8()?.upper()?;
            Ok(uppercased.into_series())
        }
        dt => {
            let message = format!("Upper not implemented for type {dt}");
            Err(DaftError::TypeError(message))
        }
    }
}
}
11 changes: 11 additions & 0 deletions src/daft-dsl/src/functions/utf8/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ mod length;
mod lower;
mod split;
mod startswith;
mod upper;

use contains::ContainsEvaluator;
use endswith::EndswithEvaluator;
Expand All @@ -12,6 +13,7 @@ use lower::LowerEvaluator;
use serde::{Deserialize, Serialize};
use split::SplitEvaluator;
use startswith::StartswithEvaluator;
use upper::UpperEvaluator;

use crate::Expr;

Expand All @@ -25,6 +27,7 @@ pub enum Utf8Expr {
Split,
Length,
Lower,
Upper,
}

impl Utf8Expr {
Expand All @@ -38,6 +41,7 @@ impl Utf8Expr {
Split => &SplitEvaluator {},
Length => &LengthEvaluator {},
Lower => &LowerEvaluator {},
Upper => &UpperEvaluator {},
}
}
}
Expand Down Expand Up @@ -83,3 +87,10 @@ pub fn lower(data: &Expr) -> Expr {
inputs: vec![data.clone()],
}
}

/// Builds the function expression that applies `Utf8Expr::Upper` to `data`.
///
/// `data` is cloned, so the caller keeps ownership of the original expression.
pub fn upper(data: &Expr) -> Expr {
    let func = super::FunctionExpr::Utf8(Utf8Expr::Upper);
    let inputs = vec![data.clone()];
    Expr::Function { func, inputs }
}
46 changes: 46 additions & 0 deletions src/daft-dsl/src/functions/utf8/upper.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
use daft_core::{
datatypes::{DataType, Field},
schema::Schema,
series::Series,
};

use crate::Expr;
use common_error::{DaftError, DaftResult};

use super::super::FunctionEvaluator;

/// Field-less evaluator type for the UTF-8 `upper` function.
pub(super) struct UpperEvaluator {}

impl FunctionEvaluator for UpperEvaluator {
    /// Name reported for this function.
    fn fn_name(&self) -> &'static str {
        "upper"
    }

    /// Resolves the output field: exactly one `Utf8` input yields a `Utf8` field that
    /// keeps the input's name.
    ///
    /// # Errors
    /// - `DaftError::SchemaMismatch` when the number of inputs is not exactly one.
    /// - `DaftError::TypeError` when the input does not resolve to `DataType::Utf8`.
    /// - Any error from resolving the input against `schema` is passed through unchanged.
    fn to_field(&self, inputs: &[Expr], schema: &Schema, _: &Expr) -> DaftResult<Field> {
        let data = match inputs {
            [data] => data,
            _ => {
                return Err(DaftError::SchemaMismatch(format!(
                    "Expected 1 input args, got {}",
                    inputs.len()
                )))
            }
        };

        let data_field = data.to_field(schema)?;
        if !matches!(data_field.dtype, DataType::Utf8) {
            return Err(DaftError::TypeError(format!(
                "Expects input to upper to be utf8, but received {data_field}",
            )));
        }
        Ok(Field::new(data_field.name, DataType::Utf8))
    }

    /// Runs `utf8_upper` on the single input series.
    ///
    /// # Errors
    /// `DaftError::ValueError` when the number of inputs is not exactly one; errors from
    /// `utf8_upper` itself are returned as-is.
    fn evaluate(&self, inputs: &[Series], _: &Expr) -> DaftResult<Series> {
        if let [data] = inputs {
            return data.utf8_upper();
        }
        Err(DaftError::ValueError(format!(
            "Expected 1 input args, got {}",
            inputs.len()
        )))
    }
}
5 changes: 5 additions & 0 deletions src/daft-dsl/src/python.rs
Original file line number Diff line number Diff line change
Expand Up @@ -338,6 +338,11 @@ impl PyExpr {
Ok(lower(&self.expr).into())
}

/// Wraps this expression in the UTF-8 `upper` function and converts the result into `Self`.
pub fn utf8_upper(&self) -> PyResult<Self> {
    use crate::functions::utf8::upper;
    let uppercased = upper(&self.expr);
    Ok(uppercased.into())
}

pub fn image_decode(&self) -> PyResult<Self> {
use crate::functions::image::decode;
Ok(decode(&self.expr).into())
Expand Down
10 changes: 10 additions & 0 deletions tests/expressions/typing/test_str.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,3 +53,13 @@ def test_str_lower():
run_kernel=s.str.lower,
resolvable=True,
)


def test_str_upper():
    """Check that type resolution of `str.upper()` agrees with running the upper kernel."""
    s = Series.from_arrow(pa.array(["Foo", "BarBaz", "quux"]), name="arg")
    assert_typing_resolve_vs_runtime_behavior(
        data=[s],
        expr=col(s.name()).str.upper(),
        # The runtime kernel must be the same operation as the expression under test;
        # this previously pointed at `s.str.lower`, a copy-paste slip from test_str_lower.
        run_kernel=s.str.upper,
        resolvable=True,
    )
60 changes: 35 additions & 25 deletions tests/series/test_utf8_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -224,31 +224,41 @@ def test_series_utf8_length_all_null() -> None:
assert result.to_pylist() == [None, None, None]


def test_series_utf8_lower() -> None:
s = Series.from_arrow(pa.array(["Foo", "BarBaz", "QUUX"]))
result = s.str.lower()
assert result.to_pylist() == ["foo", "barbaz", "quux"]


def test_series_utf8_lower_with_nulls() -> None:
s = Series.from_arrow(pa.array(["Foo", None, "BarBaz", "QUUX"]))
result = s.str.lower()
assert result.to_pylist() == ["foo", None, "barbaz", "quux"]


def test_series_utf8_lower_empty() -> None:
s = Series.from_arrow(pa.array([], type=pa.string()))
result = s.str.lower()
assert result.to_pylist() == []


def test_series_utf8_lower_all_null() -> None:
s = Series.from_arrow(pa.array([None, None, None]))
@pytest.mark.parametrize(
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Very nice, refactoring this to be parameterized!

["data", "expected"],
[
(["Foo", "BarBaz", "QUUX"], ["foo", "barbaz", "quux"]),
# With at least one null
(["Foo", None, "BarBaz", "QUUX"], ["foo", None, "barbaz", "quux"]),
# With all nulls
([None] * 4, [None] * 4),
# With at least one numeric strings
(["Foo", "BarBaz", "QUUX", "2"], ["foo", "barbaz", "quux", "2"]),
# With all numeric strings
(["1", "2", "3"], ["1", "2", "3"]),
],
)
def test_series_utf8_lower(data, expected) -> None:
s = Series.from_arrow(pa.array(data))
result = s.str.lower()
assert result.to_pylist() == [None, None, None]
assert result.to_pylist() == expected


def test_series_utf8_lower_all_numeric_strs() -> None:
s = Series.from_arrow(pa.array(["1", "2", "3"]))
result = s.str.lower()
assert result.to_pylist() == ["1", "2", "3"]
@pytest.mark.parametrize(
    ("data", "expected"),
    [
        # Mixed-case strings only
        (["Foo", "BarBaz", "quux"], ["FOO", "BARBAZ", "QUUX"]),
        # With at least one null
        (["Foo", None, "BarBaz", "quux"], ["FOO", None, "BARBAZ", "QUUX"]),
        # With all nulls
        ([None, None, None, None], [None, None, None, None]),
        # With at least one numeric string
        (["Foo", "BarBaz", "quux", "2"], ["FOO", "BARBAZ", "QUUX", "2"]),
        # With all numeric strings
        (["1", "2", "3"], ["1", "2", "3"]),
    ],
)
def test_series_utf8_upper(data, expected) -> None:
    """`Series.str.upper()` should produce `expected` for each parametrized `data` input."""
    series = Series.from_arrow(pa.array(data))
    uppercased = series.str.upper()
    assert uppercased.to_pylist() == expected
10 changes: 10 additions & 0 deletions tests/table/utf8/test_upper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
from __future__ import annotations

from daft.expressions import col
from daft.table import MicroPartition


def test_utf8_upper():
    """Evaluating `col("col").str.upper()` on a table should return the expected column."""
    values = ["Foo", None, "BarBaz", "quux", "1"]
    expected = ["FOO", None, "BARBAZ", "QUUX", "1"]
    table = MicroPartition.from_pydict({"col": values})
    uppercased = table.eval_expression_list([col("col").str.upper()])
    assert uppercased.to_pydict() == {"col": expected}
Loading