Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Add str.normalize() #20483

Open
wants to merge 7 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ strum_macros = "0.26"
thiserror = "2"
tokio = "1.26"
tokio-util = "0.7.8"
unicode-normalization = "0.1.24"
unicode-reverse = "1.0.8"
url = "2.4"
uuid = { version = "1.7.0", features = ["v4"] }
Expand Down
2 changes: 2 additions & 0 deletions crates/polars-lazy/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -242,6 +242,7 @@ python = [
row_hash = ["polars-plan/row_hash"]
reinterpret = ["polars-plan/reinterpret", "polars-ops/reinterpret"]
string_pad = ["polars-plan/string_pad"]
string_normalize = ["polars-plan/string_normalize"]
string_reverse = ["polars-plan/string_reverse"]
string_to_integer = ["polars-plan/string_to_integer"]
arg_where = ["polars-plan/arg_where"]
Expand Down Expand Up @@ -409,6 +410,7 @@ features = [
"sign",
"streaming",
"string_encoding",
"string_normalize",
"string_pad",
"string_reverse",
"string_to_integer",
Expand Down
2 changes: 2 additions & 0 deletions crates/polars-ops/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ regex-syntax = { workspace = true }
serde = { workspace = true, optional = true }
serde_json = { workspace = true, optional = true }
strum_macros = { workspace = true }
unicode-normalization = { workspace = true, optional = true }
unicode-reverse = { workspace = true, optional = true }

[dependencies.jsonpath_lib]
Expand Down Expand Up @@ -104,6 +105,7 @@ diff = []
pct_change = ["diff"]
strings = ["polars-core/strings"]
string_pad = ["polars-core/strings"]
string_normalize = ["polars-core/strings", "unicode-normalization"]
string_reverse = ["polars-core/strings", "unicode-reverse"]
string_to_integer = ["polars-core/strings"]
extract_jsonpath = ["serde_json", "jsonpath_lib", "polars-json"]
Expand Down
4 changes: 4 additions & 0 deletions crates/polars-ops/src/chunked_array/strings/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ mod find_many;
mod json_path;
#[cfg(feature = "strings")]
mod namespace;
#[cfg(feature = "string_normalize")]
mod normalize;
#[cfg(feature = "string_pad")]
mod pad;
#[cfg(feature = "string_reverse")]
Expand All @@ -37,6 +39,8 @@ pub use find_many::*;
pub use json_path::*;
#[cfg(feature = "strings")]
pub use namespace::*;
#[cfg(feature = "string_normalize")]
pub use normalize::*;
use polars_core::prelude::*;
#[cfg(feature = "strings")]
pub use split::*;
Expand Down
10 changes: 10 additions & 0 deletions crates/polars-ops/src/chunked_array/strings/namespace.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ use regex::escape;
use super::*;
#[cfg(feature = "binary_encoding")]
use crate::chunked_array::binary::BinaryNameSpaceImpl;
#[cfg(feature = "string_normalize")]
use crate::prelude::strings::normalize::UnicodeForm;
use crate::prelude::strings::starts_with::starts_with_str;

// We need this to infer the right lifetimes for the match closure.
Expand Down Expand Up @@ -627,6 +629,14 @@ pub trait StringNameSpaceImpl: AsString {
ca + other
}

/// Returns the string values normalized to the requested Unicode `form`.
#[must_use]
#[cfg(feature = "string_normalize")]
fn str_normalize(&self, form: UnicodeForm) -> StringChunked {
    normalize::normalize(self.as_string(), form)
}

/// Reverses the string values
#[must_use]
#[cfg(feature = "string_reverse")]
Expand Down
38 changes: 38 additions & 0 deletions crates/polars-ops/src/chunked_array/strings/normalize.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
use polars_core::prelude::{StringChunked, StringChunkedBuilder};
use unicode_normalization::UnicodeNormalization;

/// The Unicode normalization form to convert strings to.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum UnicodeForm {
    /// Normalization Form C (canonical composition).
    NFC,
    /// Normalization Form KC (compatibility composition).
    NFKC,
    /// Normalization Form D (canonical decomposition).
    NFD,
    /// Normalization Form KD (compatibility decomposition).
    NFKD,
}

/// Runs `normalizer` over every non-null string of `ca` and collects the results.
///
/// `normalizer` is handed the input string together with an emptied scratch
/// buffer and is expected to write its output into that buffer. The buffer is
/// cleared and reused for each value. Null entries are appended as nulls, and
/// the output is built under the same name as `ca`.
pub fn normalize_with<F>(ca: &StringChunked, normalizer: F) -> StringChunked
where
    F: Fn(&str, &mut String),
{
    let mut scratch = String::new();
    let mut out = StringChunkedBuilder::new(ca.name().clone(), ca.len());
    for value in ca.iter() {
        match value {
            Some(text) => {
                scratch.clear();
                normalizer(text, &mut scratch);
                out.append_value(&scratch);
            },
            None => {
                out.append_null();
            },
        }
    }
    out.finish()
}

/// Normalizes every string in `ca` to the requested Unicode `form`.
///
/// Each variant of [`UnicodeForm`] selects the corresponding
/// `UnicodeNormalization` adaptor (`nfc`, `nfkc`, `nfd`, `nfkd`), whose
/// characters are appended to the per-value buffer of [`normalize_with`].
pub fn normalize(ca: &StringChunked, form: UnicodeForm) -> StringChunked {
    match form {
        // Composed forms.
        UnicodeForm::NFC => normalize_with(ca, |src, out| out.extend(src.nfc())),
        UnicodeForm::NFKC => normalize_with(ca, |src, out| out.extend(src.nfkc())),
        // Decomposed forms.
        UnicodeForm::NFD => normalize_with(ca, |src, out| out.extend(src.nfd())),
        UnicodeForm::NFKD => normalize_with(ca, |src, out| out.extend(src.nfkd())),
    }
}
1 change: 1 addition & 0 deletions crates/polars-plan/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,7 @@ array_to_struct = ["polars-ops/array_to_struct"]
row_hash = ["polars-core/row_hash", "polars-ops/hash"]
reinterpret = ["polars-core/reinterpret", "polars-ops/reinterpret"]
string_pad = ["polars-ops/string_pad"]
string_normalize = ["polars-ops/string_normalize"]
string_reverse = ["polars-ops/string_reverse"]
string_to_integer = ["polars-ops/string_to_integer"]
arg_where = []
Expand Down
16 changes: 16 additions & 0 deletions crates/polars-plan/src/dsl/function_expr/strings.rs
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,10 @@ pub enum StringFunction {
n: i64,
literal: bool,
},
#[cfg(feature = "string_normalize")]
Normalize {
form: UnicodeForm,
},
#[cfg(feature = "string_reverse")]
Reverse,
#[cfg(feature = "string_pad")]
Expand Down Expand Up @@ -165,6 +169,8 @@ impl StringFunction {
LenChars => mapper.with_dtype(DataType::UInt32),
#[cfg(feature = "regex")]
Replace { .. } => mapper.with_same_dtype(),
#[cfg(feature = "string_normalize")]
Normalize { .. } => mapper.with_same_dtype(),
#[cfg(feature = "string_reverse")]
Reverse => mapper.with_same_dtype(),
#[cfg(feature = "temporal")]
Expand Down Expand Up @@ -247,6 +253,8 @@ impl Display for StringFunction {
PadStart { .. } => "pad_start",
#[cfg(feature = "regex")]
Replace { .. } => "replace",
#[cfg(feature = "string_normalize")]
Normalize { .. } => "normalize",
#[cfg(feature = "string_reverse")]
Reverse => "reverse",
#[cfg(feature = "string_encoding")]
Expand Down Expand Up @@ -363,6 +371,8 @@ impl From<StringFunction> for SpecialEq<Arc<dyn ColumnsUdf>> {
} => map_as_slice!(strings::concat_hor, &delimiter, ignore_nulls),
#[cfg(feature = "regex")]
Replace { n, literal } => map_as_slice!(strings::replace, literal, n),
#[cfg(feature = "string_normalize")]
Normalize { form } => map!(strings::normalize, form.clone()),
#[cfg(feature = "string_reverse")]
Reverse => map!(strings::reverse),
Uppercase => map!(uppercase),
Expand Down Expand Up @@ -981,6 +991,12 @@ pub(super) fn replace(s: &[Column], literal: bool, n: i64) -> PolarsResult<Colum
.map(|ca| ca.into_column())
}

/// Normalizes the string column `s` to the given Unicode `form`.
///
/// Any error returned by `s.str()` is passed back to the caller unchanged.
#[cfg(feature = "string_normalize")]
pub(super) fn normalize(s: &Column, form: UnicodeForm) -> PolarsResult<Column> {
    s.str()
        .map(|str_ca| str_ca.str_normalize(form).into_column())
}

#[cfg(feature = "string_reverse")]
pub(super) fn reverse(s: &Column) -> PolarsResult<Column> {
let ca = s.str()?;
Expand Down
11 changes: 11 additions & 0 deletions crates/polars-plan/src/dsl/string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -452,6 +452,17 @@ impl StringNameSpace {
)
}

/// Normalize each string to the given Unicode `form`.
#[cfg(feature = "string_normalize")]
pub fn normalize(self, form: UnicodeForm) -> Expr {
    let function = FunctionExpr::StringExpr(StringFunction::Normalize { form });
    self.0.map_many_private(function, &[], false, None)
}

#[cfg(feature = "string_reverse")]
/// Reverse each string
pub fn reverse(self) -> Expr {
Expand Down
3 changes: 3 additions & 0 deletions crates/polars-python/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,7 @@ features = [
"semi_anti_join",
"serde-lazy",
"string_encoding",
"string_normalize",
"string_reverse",
"string_to_integer",
"string_pad",
Expand Down Expand Up @@ -168,6 +169,7 @@ find_many = ["polars/find_many"]
new_streaming = ["polars-lazy/new_streaming"]
bitwise = ["polars/bitwise"]
approx_unique = ["polars/approx_unique"]
string_normalize = ["polars/string_normalize"]

dtype-i8 = []
dtype-i16 = []
Expand Down Expand Up @@ -219,6 +221,7 @@ operations = [
"peaks",
"hist",
"find_many",
"string_normalize",
]

io = [
Expand Down
18 changes: 18 additions & 0 deletions crates/polars-python/src/conversion/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1328,3 +1328,21 @@ impl<'a> FromPyObject<'a> for PyCompatLevel {
}))
}
}

// Converts a Python string into a `UnicodeForm`.
//
// Only the exact spellings "NFC", "NFKC", "NFD" and "NFKD" are accepted; any
// other string yields a `PyValueError`. A failure to extract the object as a
// string is propagated as-is.
#[cfg(feature = "string_normalize")]
impl<'py> FromPyObject<'py> for Wrap<UnicodeForm> {
    fn extract_bound(ob: &Bound<'py, PyAny>) -> PyResult<Self> {
        let name = ob.extract::<PyBackedStr>()?;
        match &*name {
            "NFC" => Ok(Wrap(UnicodeForm::NFC)),
            "NFKC" => Ok(Wrap(UnicodeForm::NFKC)),
            "NFD" => Ok(Wrap(UnicodeForm::NFD)),
            "NFKD" => Ok(Wrap(UnicodeForm::NFKD)),
            v => Err(PyValueError::new_err(format!(
                "`form` must be one of {{'NFC', 'NFKC', 'NFD', 'NFKD'}}, got {v}",
            ))),
        }
    }
}
4 changes: 4 additions & 0 deletions crates/polars-python/src/expr/string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,10 @@ impl PyExpr {
.into()
}

// Normalize each string of the expression to the given Unicode `form`.
//
// Gated on `string_normalize` because the `FromPyObject` impl for
// `Wrap<UnicodeForm>` only exists under that feature; without the gate a
// build that disables the feature fails to compile this method.
#[cfg(feature = "string_normalize")]
fn str_normalize(&self, form: Wrap<UnicodeForm>) -> Self {
    self.inner.clone().str().normalize(form.0).into()
}

fn str_reverse(&self) -> Self {
self.inner.clone().str().reverse().into()
}
Expand Down
13 changes: 13 additions & 0 deletions crates/polars-python/src/lazyframe/visitor/expr_nodes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
use polars::prelude::InequalityOperator;
use polars::series::ops::NullBehavior;
use polars_core::series::IsSorted;
#[cfg(feature = "string_normalize")]
use polars_ops::chunked_array::UnicodeForm;
use polars_ops::series::InterpolationMethod;
#[cfg(feature = "search_sorted")]
use polars_ops::series::SearchSortedSide;
Expand Down Expand Up @@ -171,6 +173,7 @@ pub enum PyStringFunction {
ContainsMany,
ReplaceMany,
EscapeRegex,
Normalize,
}

#[pymethods]
Expand Down Expand Up @@ -865,6 +868,16 @@ pub(crate) fn into_py(py: Python<'_>, expr: &AExpr) -> PyResult<PyObject> {
StringFunction::Replace { n, literal } => {
(PyStringFunction::Replace, n, literal).into_py_any(py)
},
StringFunction::Normalize { form } => (
PyStringFunction::Normalize,
match form {
UnicodeForm::NFC => "nfc",
UnicodeForm::NFKC => "nfkc",
UnicodeForm::NFD => "nfd",
UnicodeForm::NFKD => "nfkd",
},
)
.into_py_any(py),
StringFunction::Reverse => (PyStringFunction::Reverse,).into_py_any(py),
StringFunction::PadStart { length, fill_char } => {
(PyStringFunction::PadStart, length, fill_char).into_py_any(py)
Expand Down
2 changes: 1 addition & 1 deletion crates/polars-sql/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ description = "SQL transpiler for Polars. Converts SQL to Polars logical plans"
arrow = { workspace = true }
polars-core = { workspace = true, features = ["rows"] }
polars-error = { workspace = true }
polars-lazy = { workspace = true, features = ["abs", "binary_encoding", "concat_str", "cross_join", "cum_agg", "dtype-date", "dtype-decimal", "dtype-struct", "is_in", "list_eval", "log", "meta", "regex", "round_series", "sign", "string_reverse", "strings", "timezones", "trigonometry"] }
polars-lazy = { workspace = true, features = ["abs", "binary_encoding", "concat_str", "cross_join", "cum_agg", "dtype-date", "dtype-decimal", "dtype-struct", "is_in", "list_eval", "log", "meta", "regex", "round_series", "sign", "string_normalize", "string_reverse", "strings", "timezones", "trigonometry"] }
polars-ops = { workspace = true }
polars-plan = { workspace = true }
polars-time = { workspace = true }
Expand Down
1 change: 1 addition & 0 deletions crates/polars/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,7 @@ sign = ["polars-lazy?/sign"]
streaming = ["polars-lazy?/streaming"]
string_encoding = ["polars-ops/string_encoding", "polars-lazy?/string_encoding", "polars-core/strings"]
string_pad = ["polars-lazy?/string_pad", "polars-ops/string_pad"]
string_normalize = ["polars-lazy?/string_normalize", "polars-ops/string_normalize"]
string_reverse = ["polars-lazy?/string_reverse", "polars-ops/string_reverse"]
string_to_integer = ["polars-lazy?/string_to_integer", "polars-ops/string_to_integer"]
take_opt_iter = ["polars-core/take_opt_iter"]
Expand Down
1 change: 1 addition & 0 deletions py-polars/docs/source/reference/expressions/string.rst
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ The following methods are available under the `expr.str` attribute.
Expr.str.json_path_match
Expr.str.len_bytes
Expr.str.len_chars
Expr.str.normalize
Expr.str.pad_end
Expr.str.pad_start
Expr.str.replace
Expand Down
1 change: 1 addition & 0 deletions py-polars/docs/source/reference/series/string.rst
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ The following methods are available under the `Series.str` attribute.
Series.str.json_path_match
Series.str.len_bytes
Series.str.len_chars
Series.str.normalize
Series.str.pad_end
Series.str.pad_start
Series.str.replace
Expand Down
1 change: 1 addition & 0 deletions py-polars/polars/_typing.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,7 @@ def __arrow_c_stream__(self, requested_schema: object | None = None) -> object:
"sunday",
]
TimeUnit: TypeAlias = Literal["ns", "us", "ms"]
UnicodeForm: TypeAlias = Literal["NFC", "NFKC", "NFD", "NFKD"]
UniqueKeepStrategy: TypeAlias = Literal["first", "last", "any", "none"]
UnstackDirection: TypeAlias = Literal["vertical", "horizontal"]
MapElementsStrategy: TypeAlias = Literal["thread_local", "threading"]
Expand Down
Loading
Loading