From 7da739745cbbf277cd6e07d16d8f97c7c9f4d094 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Mon, 5 Aug 2024 10:31:12 +0200 Subject: [PATCH 1/4] update doc --- bindings/python/src/pre_tokenizers.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/bindings/python/src/pre_tokenizers.rs b/bindings/python/src/pre_tokenizers.rs index a2bd9b39c..9835a2ed7 100644 --- a/bindings/python/src/pre_tokenizers.rs +++ b/bindings/python/src/pre_tokenizers.rs @@ -324,8 +324,7 @@ impl PyWhitespaceSplit { /// /// Args: /// pattern (:obj:`str` or :class:`~tokenizers.Regex`): -/// A pattern used to split the string. Usually a string or a a regex built with `tokenizers.Regex` -/// +/// A pattern used to split the string. Usually a string or a regex built with `tokenizers.Regex`. If you want to use a regex pattern, it has to be wrapped around a `tokenizers.Regex`, otherwise we consider it as a string pattern. /// behavior (:class:`~tokenizers.SplitDelimiterBehavior`): /// The behavior to use when splitting. /// Choices: "removed", "isolated", "merged_with_previous", "merged_with_next", From bce0545af306944765995b5856998b171b5a0b90 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Mon, 5 Aug 2024 10:38:07 +0200 Subject: [PATCH 2/4] add example --- bindings/python/src/pre_tokenizers.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/bindings/python/src/pre_tokenizers.rs b/bindings/python/src/pre_tokenizers.rs index 9835a2ed7..66f4646f5 100644 --- a/bindings/python/src/pre_tokenizers.rs +++ b/bindings/python/src/pre_tokenizers.rs @@ -324,7 +324,11 @@ impl PyWhitespaceSplit { /// /// Args: /// pattern (:obj:`str` or :class:`~tokenizers.Regex`): -/// A pattern used to split the string. Usually a string or a regex built with `tokenizers.Regex`. If you want to use a regex pattern, it has to be wrapped around a `tokenizers.Regex`, otherwise we consider it as a string pattern. +/// A pattern used to split the string. 
Usually a string or a regex built with `tokenizers.Regex`. +/// If you want to use a regex pattern, it has to be wrapped around a `tokenizers.Regex`, +/// otherwise we consider it as a string pattern. For example `pattern="1|2"` +/// means you want to split on `1|2` (imagine a csv file for example), while +/// `pattern=tokenizers.Regex("1|2")` means you split on either '1' or '2'. /// behavior (:class:`~tokenizers.SplitDelimiterBehavior`): /// The behavior to use when splitting. /// Choices: "removed", "isolated", "merged_with_previous", "merged_with_next", From 0321308b53c393556c2beda6731d99c36bb51c37 Mon Sep 17 00:00:00 2001 From: Arthur <48595927+ArthurZucker@users.noreply.github.com> Date: Wed, 7 Aug 2024 12:13:44 +0200 Subject: [PATCH 3/4] Update bindings/python/src/pre_tokenizers.rs Co-authored-by: Nicolas Patry --- bindings/python/src/pre_tokenizers.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bindings/python/src/pre_tokenizers.rs b/bindings/python/src/pre_tokenizers.rs index 66f4646f5..d8b9a112f 100644 --- a/bindings/python/src/pre_tokenizers.rs +++ b/bindings/python/src/pre_tokenizers.rs @@ -326,8 +326,8 @@ impl PyWhitespaceSplit { /// pattern (:obj:`str` or :class:`~tokenizers.Regex`): /// A pattern used to split the string. Usually a string or a regex built with `tokenizers.Regex`. /// If you want to use a regex pattern, it has to be wrapped around a `tokenizers.Regex`, -/// otherwise we consider it as a string pattern. For example `pattern="1|2"` -/// means you want to split on `1|2` (imagine a csv file for example), while +/// otherwise we consider it as a string pattern. For example `pattern="|"` +/// means you want to split on `|` (imagine a csv file for example), while /// `pattern=tokenizers.Regex("1|2")` means you split on either '1' or '2'. /// behavior (:class:`~tokenizers.SplitDelimiterBehavior`): /// The behavior to use when splitting. 
From cf610ed8ab6e532447349a90a9357d1d805efe25 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Wed, 7 Aug 2024 12:26:06 +0200 Subject: [PATCH 4/4] stub --- .../python/py_src/tokenizers/pre_tokenizers/__init__.pyi | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/bindings/python/py_src/tokenizers/pre_tokenizers/__init__.pyi b/bindings/python/py_src/tokenizers/pre_tokenizers/__init__.pyi index d81d3802b..ea1b4954e 100644 --- a/bindings/python/py_src/tokenizers/pre_tokenizers/__init__.pyi +++ b/bindings/python/py_src/tokenizers/pre_tokenizers/__init__.pyi @@ -421,8 +421,11 @@ class Split(PreTokenizer): Args: pattern (:obj:`str` or :class:`~tokenizers.Regex`): - A pattern used to split the string. Usually a string or a a regex built with `tokenizers.Regex` - + A pattern used to split the string. Usually a string or a regex built with `tokenizers.Regex`. + If you want to use a regex pattern, it has to be wrapped around a `tokenizers.Regex`, + otherwise we consider it as a string pattern. For example `pattern="|"` + means you want to split on `|` (imagine a csv file for example), while + `pattern=tokenizers.Regex("1|2")` means you split on either '1' or '2'. behavior (:class:`~tokenizers.SplitDelimiterBehavior`): The behavior to use when splitting. Choices: "removed", "isolated", "merged_with_previous", "merged_with_next",