From 8fce5b59f17a1c9cb827fd78d44134cee267e15b Mon Sep 17 00:00:00 2001
From: Conor Kennedy <32619800+Vince7778@users.noreply.github.com>
Date: Mon, 29 Jul 2024 15:42:34 -0700
Subject: [PATCH] [BUG] Fix `.str.length()` on Unicode strings (#2579)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Previously, the `.str.length()` method counted the number of bytes in
the UTF-8 encoding of each string. This is inconsistent with Python's
`len()` and pandas' `str.len()`, which count Unicode code points. For
instance, the string "😉test" is 8 bytes long but contains only 5 code
points. This PR makes `.str.length()` return the number of code points,
consistent with Python and pandas.

With this change, there no longer seems to be a way to get the byte
count (the original behavior); maybe we should add a `.byte_length()`
method for that.
---
 src/daft-core/src/array/ops/utf8.rs | 2 +-
 tests/series/test_utf8_ops.py       | 6 ++++++
 tests/table/utf8/test_length.py     | 4 ++--
 3 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/src/daft-core/src/array/ops/utf8.rs b/src/daft-core/src/array/ops/utf8.rs
index c7edc3df24..e82e07ffa1 100644
--- a/src/daft-core/src/array/ops/utf8.rs
+++ b/src/daft-core/src/array/ops/utf8.rs
@@ -610,7 +610,7 @@ impl Utf8Array {
             .iter()
             .map(|val| {
                 let v = val?;
-                Some(v.len() as u64)
+                Some(v.chars().count() as u64)
             })
             .collect::<arrow2::array::UInt64Array>()
             .with_validity(self_arrow.validity().cloned());
diff --git a/tests/series/test_utf8_ops.py b/tests/series/test_utf8_ops.py
index e4c7bcb888..4f0df85b4b 100644
--- a/tests/series/test_utf8_ops.py
+++ b/tests/series/test_utf8_ops.py
@@ -265,6 +265,12 @@ def test_series_utf8_length_all_null() -> None:
     assert result.to_pylist() == [None, None, None]
 
 
+def test_series_utf8_length_unicode() -> None:
+    s = Series.from_arrow(pa.array(["😉test", "hey̆"]))  # multi-byte emoji; "y" followed by a combining mark
+    result = s.str.length()
+    assert result.to_pylist() == [5, 4]  # lengths are Unicode code points, not UTF-8 bytes
+
+
 @pytest.mark.parametrize(
     ["data", "expected"],
     [
diff --git a/tests/table/utf8/test_length.py b/tests/table/utf8/test_length.py
index 10e00e065d..c96815ed64 100644
--- a/tests/table/utf8/test_length.py
+++ b/tests/table/utf8/test_length.py
@@ -5,6 +5,6 @@
 
 
 def test_utf8_length():
-    table = MicroPartition.from_pydict({"col": ["foo", None, "barbaz", "quux"]})
+    table = MicroPartition.from_pydict({"col": ["foo", None, "barbaz", "quux", "😉test", ""]})
     result = table.eval_expression_list([col("col").str.length()])
-    assert result.to_pydict() == {"col": [3, None, 6, 4]}
+    assert result.to_pydict() == {"col": [3, None, 6, 4, 5, 0]}