diff --git a/daft/expressions/expressions.py b/daft/expressions/expressions.py index 99b29e5668..0614124007 100644 --- a/daft/expressions/expressions.py +++ b/daft/expressions/expressions.py @@ -2568,20 +2568,20 @@ def to_datetime(self, format: str, timezone: str | None = None) -> Expression: def normalize( self, *, - remove_punct: bool = True, - lowercase: bool = True, - nfd_unicode: bool = True, - white_space: bool = True, + remove_punct: bool = False, + lowercase: bool = False, + nfd_unicode: bool = False, + white_space: bool = False, ): """Normalizes a string for more useful deduplication. .. NOTE:: - All processing options are on by default. + All processing options are off by default. Example: >>> import daft >>> df = daft.from_pydict({"x": ["hello world", "Hello, world!", "HELLO, \\nWORLD!!!!"]}) - >>> df = df.with_column("normalized", df["x"].str.normalize()) + >>> df = df.with_column("normalized", df["x"].str.normalize(remove_punct=True, lowercase=True, white_space=True)) >>> df.show() ╭───────────────┬─────────────╮ │ x ┆ normalized │ diff --git a/daft/series.py b/daft/series.py index 096e20550f..2b7329986f 100644 --- a/daft/series.py +++ b/daft/series.py @@ -891,10 +891,10 @@ def substr(self, start: Series, length: Series | None = None) -> Series: def normalize( self, *, - remove_punct: bool = True, - lowercase: bool = True, - nfd_unicode: bool = True, - white_space: bool = True, + remove_punct: bool = False, + lowercase: bool = False, + nfd_unicode: bool = False, + white_space: bool = False, ) -> Series: if not isinstance(remove_punct, bool): raise ValueError(f"expected bool for remove_punct but got {type(remove_punct)}")