Fix doc about split (#1591)

* update doc * add example * Update bindings/python/src/pre_tokenizers.rs Co-authored-by: Nicolas Patry <[email protected]> * stub --------- Co-authored-by: Nicolas Patry <[email protected]>
huggingface · Aug 7, 2024 · eea8e1a · eea8e1a
1 parent 6a5fce9
commit eea8e1a
Show file tree

Hide file tree

Showing 2 changed files with 10 additions and 4 deletions.
diff --git a/bindings/python/py_src/tokenizers/pre_tokenizers/__init__.pyi b/bindings/python/py_src/tokenizers/pre_tokenizers/__init__.pyi
@@ -421,8 +421,11 @@ class Split(PreTokenizer):
 
     Args:
         pattern (:obj:`str` or :class:`~tokenizers.Regex`):
-            A pattern used to split the string. Usually a string or a a regex built with `tokenizers.Regex`
-
+            A pattern used to split the string. Usually a string or a regex built with `tokenizers.Regex`.
+            If you want to use a regex pattern, it has to be wrapped in a `tokenizers.Regex`,
+            otherwise we consider it as a string pattern. For example `pattern="|"`
+            means you want to split on `|` (imagine a csv file for example), while
+            `pattern=tokenizers.Regex("1|2")` means you split on either '1' or '2'.
         behavior (:class:`~tokenizers.SplitDelimiterBehavior`):
             The behavior to use when splitting.
             Choices: "removed", "isolated", "merged_with_previous", "merged_with_next",

diff --git a/bindings/python/src/pre_tokenizers.rs b/bindings/python/src/pre_tokenizers.rs
@@ -334,8 +334,11 @@ impl PyWhitespaceSplit {
 ///
 /// Args:
 ///     pattern (:obj:`str` or :class:`~tokenizers.Regex`):
-///         A pattern used to split the string. Usually a string or a a regex built with `tokenizers.Regex`
-///
+///         A pattern used to split the string. Usually a string or a regex built with `tokenizers.Regex`.
+///         If you want to use a regex pattern, it has to be wrapped in a `tokenizers.Regex`,
+///         otherwise we consider it as a string pattern. For example `pattern="|"`
+///         means you want to split on `|` (imagine a csv file for example), while
+///         `pattern=tokenizers.Regex("1|2")` means you split on either '1' or '2'.
 ///     behavior (:class:`~tokenizers.SplitDelimiterBehavior`):
 ///         The behavior to use when splitting.
 ///         Choices: "removed", "isolated", "merged_with_previous", "merged_with_next",