From 09687e4aa8787d00157a4a8f7f91b7b262726c5f Mon Sep 17 00:00:00 2001 From: Alexander Beedie Date: Thu, 9 Jan 2025 12:25:27 +0400 Subject: [PATCH] fix: Ensure that SQL `LIKE` and `ILIKE` operators support multi-line matches (#20613) --- crates/polars-sql/src/context.rs | 2 +- crates/polars-sql/src/sql_expr.rs | 6 +++++- py-polars/tests/unit/sql/test_strings.py | 26 ++++++++++++++++++++++++ 3 files changed, 32 insertions(+), 2 deletions(-) diff --git a/crates/polars-sql/src/context.rs b/crates/polars-sql/src/context.rs index 1957e5c62e51..6a76f290d4dd 100644 --- a/crates/polars-sql/src/context.rs +++ b/crates/polars-sql/src/context.rs @@ -1355,7 +1355,7 @@ impl SQLContext { .replace('%', ".*") .replace('_', "."); - modifiers.ilike = Some(regex::Regex::new(format!("^(?i){}$", rx).as_str()).unwrap()); + modifiers.ilike = Some(regex::Regex::new(format!("^(?is){}$", rx).as_str()).unwrap()); } // SELECT * RENAME diff --git a/crates/polars-sql/src/sql_expr.rs b/crates/polars-sql/src/sql_expr.rs index 028677a9506a..4ecb2f8f9ad4 100644 --- a/crates/polars-sql/src/sql_expr.rs +++ b/crates/polars-sql/src/sql_expr.rs @@ -357,7 +357,11 @@ impl SQLExprVisitor<'_> { .replace('%', ".*") .replace('_', "."); - rx = format!("^{}{}$", if case_insensitive { "(?i)" } else { "" }, rx); + rx = format!( + "^{}{}$", + if case_insensitive { "(?is)" } else { "(?s)" }, + rx + ); let expr = self.visit_expr(expr)?; let matches = expr.str().contains(lit(rx), true); diff --git a/py-polars/tests/unit/sql/test_strings.py b/py-polars/tests/unit/sql/test_strings.py index 46e3c645a85a..0405a47e665e 100644 --- a/py-polars/tests/unit/sql/test_strings.py +++ b/py-polars/tests/unit/sql/test_strings.py @@ -249,6 +249,32 @@ def test_string_like(pattern: str, like: str, expected: list[int]) -> None: assert res == expected +def test_string_like_multiline() -> None: + s1 = "Hello World" + s2 = "Hello\nWorld" + s3 = "hello\nWORLD" + + df = pl.DataFrame({"idx": [0, 1, 2], "txt": [s1, s2, s3]}) + + # starts with... + res1 = df.sql("SELECT * FROM self WHERE txt LIKE 'Hello%' ORDER BY idx") + res2 = df.sql("SELECT * FROM self WHERE txt ILIKE 'HELLO%' ORDER BY idx") + + assert res1["txt"].to_list() == [s1, s2] + assert res2["txt"].to_list() == [s1, s2, s3] + + # ends with... + res3 = df.sql("SELECT * FROM self WHERE txt LIKE '%WORLD' ORDER BY idx") + res4 = df.sql("SELECT * FROM self WHERE txt ILIKE '%\nWORLD' ORDER BY idx") + + assert res3["txt"].to_list() == [s3] + assert res4["txt"].to_list() == [s2, s3] + + # exact match + for s in (s1, s2, s3): + assert df.sql(f"SELECT txt FROM self WHERE txt LIKE '{s}'").item() == s + + def test_string_position() -> None: df = pl.Series( name="city",