From 8fce5b59f17a1c9cb827fd78d44134cee267e15b Mon Sep 17 00:00:00 2001
From: Conor Kennedy <32619800+Vince7778@users.noreply.github.com>
Date: Mon, 29 Jul 2024 15:42:34 -0700
Subject: [PATCH] [BUG] Fix `.str.length()` on Unicode strings (#2579)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Previously, the `.str.length()` method would count the number of bytes in the UTF-8 string. This is inconsistent with Python's `len()` and pandas' `str.len()` which count Unicode codepoints.

For instance, on the string "😉test", the number of bytes is 8, whereas the number of codepoints is 5. This PR makes Daft consistent with that behavior.

There doesn't seem to be a way now to reproduce the original behavior; maybe we should add a `.byte_length()` method for that.
---
 src/daft-core/src/array/ops/utf8.rs | 2 +-
 tests/series/test_utf8_ops.py       | 6 ++++++
 tests/table/utf8/test_length.py     | 4 ++--
 3 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/src/daft-core/src/array/ops/utf8.rs b/src/daft-core/src/array/ops/utf8.rs
index c7edc3df24..e82e07ffa1 100644
--- a/src/daft-core/src/array/ops/utf8.rs
+++ b/src/daft-core/src/array/ops/utf8.rs
@@ -610,7 +610,7 @@ impl Utf8Array {
             .iter()
             .map(|val| {
                 let v = val?;
-                Some(v.len() as u64)
+                Some(v.chars().count() as u64)
             })
             .collect::<arrow2::array::UInt64Array>()
             .with_validity(self_arrow.validity().cloned());
diff --git a/tests/series/test_utf8_ops.py b/tests/series/test_utf8_ops.py
index e4c7bcb888..4f0df85b4b 100644
--- a/tests/series/test_utf8_ops.py
+++ b/tests/series/test_utf8_ops.py
@@ -265,6 +265,12 @@ def test_series_utf8_length_all_null() -> None:
     assert result.to_pylist() == [None, None, None]
 
 
+def test_series_utf8_length_unicode() -> None:
+    s = Series.from_arrow(pa.array(["😉test", "hey̆"]))
+    result = s.str.length()
+    assert result.to_pylist() == [5, 4]
+
+
 @pytest.mark.parametrize(
     ["data", "expected"],
     [
diff --git a/tests/table/utf8/test_length.py b/tests/table/utf8/test_length.py
index 10e00e065d..c96815ed64 100644
--- a/tests/table/utf8/test_length.py
+++ b/tests/table/utf8/test_length.py
@@ -5,6 +5,6 @@
 
 
 def test_utf8_length():
-    table = MicroPartition.from_pydict({"col": ["foo", None, "barbaz", "quux"]})
+    table = MicroPartition.from_pydict({"col": ["foo", None, "barbaz", "quux", "😉test", ""]})
     result = table.eval_expression_list([col("col").str.length()])
-    assert result.to_pydict() == {"col": [3, None, 6, 4]}
+    assert result.to_pydict() == {"col": [3, None, 6, 4, 5, 0]}