From 7da739745cbbf277cd6e07d16d8f97c7c9f4d094 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Mon, 5 Aug 2024 10:31:12 +0200 Subject: [PATCH 1/4] update doc --- bindings/python/src/pre_tokenizers.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/bindings/python/src/pre_tokenizers.rs b/bindings/python/src/pre_tokenizers.rs index a2bd9b39c..9835a2ed7 100644 --- a/bindings/python/src/pre_tokenizers.rs +++ b/bindings/python/src/pre_tokenizers.rs @@ -324,8 +324,7 @@ impl PyWhitespaceSplit { /// /// Args: /// pattern (:obj:`str` or :class:`~tokenizers.Regex`): -/// A pattern used to split the string. Usually a string or a a regex built with `tokenizers.Regex` -/// +/// A pattern used to split the string. Usually a string or a regex built with `tokenizers.Regex`. If you want to use a regex pattern, it has to be wrapped around a `tokenizers.Regex`, otherwise we consider it as a string pattern. /// behavior (:class:`~tokenizers.SplitDelimiterBehavior`): /// The behavior to use when splitting. /// Choices: "removed", "isolated", "merged_with_previous", "merged_with_next", From bce0545af306944765995b5856998b171b5a0b90 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Mon, 5 Aug 2024 10:38:07 +0200 Subject: [PATCH 2/4] add example --- bindings/python/src/pre_tokenizers.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/bindings/python/src/pre_tokenizers.rs b/bindings/python/src/pre_tokenizers.rs index 9835a2ed7..66f4646f5 100644 --- a/bindings/python/src/pre_tokenizers.rs +++ b/bindings/python/src/pre_tokenizers.rs @@ -324,7 +324,11 @@ impl PyWhitespaceSplit { /// /// Args: /// pattern (:obj:`str` or :class:`~tokenizers.Regex`): -/// A pattern used to split the string. Usually a string or a regex built with `tokenizers.Regex`. If you want to use a regex pattern, it has to be wrapped around a `tokenizers.Regex`, otherwise we consider it as a string pattern. +/// A pattern used to split the string. 
Usually a string or a regex built with `tokenizers.Regex`. +/// If you want to use a regex pattern, it has to be wrapped around a `tokenizers.Regex`, +/// otherwise we consider it as a string pattern. For example `pattern="1|2"` +/// means you want to split on `1|2` (imagine a csv file for example), while +/// `pattern=tokenizers.Regex("1|2")` means you split on either '1' or '2'. /// behavior (:class:`~tokenizers.SplitDelimiterBehavior`): /// The behavior to use when splitting. /// Choices: "removed", "isolated", "merged_with_previous", "merged_with_next", From 0321308b53c393556c2beda6731d99c36bb51c37 Mon Sep 17 00:00:00 2001 From: Arthur <48595927+ArthurZucker@users.noreply.github.com> Date: Wed, 7 Aug 2024 12:13:44 +0200 Subject: [PATCH 3/4] Update bindings/python/src/pre_tokenizers.rs Co-authored-by: Nicolas Patry --- bindings/python/src/pre_tokenizers.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bindings/python/src/pre_tokenizers.rs b/bindings/python/src/pre_tokenizers.rs index 66f4646f5..d8b9a112f 100644 --- a/bindings/python/src/pre_tokenizers.rs +++ b/bindings/python/src/pre_tokenizers.rs @@ -326,8 +326,8 @@ impl PyWhitespaceSplit { /// pattern (:obj:`str` or :class:`~tokenizers.Regex`): /// A pattern used to split the string. Usually a string or a regex built with `tokenizers.Regex`. /// If you want to use a regex pattern, it has to be wrapped around a `tokenizers.Regex`, -/// otherwise we consider it as a string pattern. For example `pattern="1|2"` -/// means you want to split on `1|2` (imagine a csv file for example), while +/// otherwise we consider it as a string pattern. For example `pattern="|"` +/// means you want to split on `|` (imagine a csv file for example), while /// `pattern=tokenizers.Regex("1|2")` means you split on either '1' or '2'. /// behavior (:class:`~tokenizers.SplitDelimiterBehavior`): /// The behavior to use when splitting. 
From cf610ed8ab6e532447349a90a9357d1d805efe25 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Wed, 7 Aug 2024 12:26:06 +0200 Subject: [PATCH 4/4] stub --- .../python/py_src/tokenizers/pre_tokenizers/__init__.pyi | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/bindings/python/py_src/tokenizers/pre_tokenizers/__init__.pyi b/bindings/python/py_src/tokenizers/pre_tokenizers/__init__.pyi index d81d3802b..ea1b4954e 100644 --- a/bindings/python/py_src/tokenizers/pre_tokenizers/__init__.pyi +++ b/bindings/python/py_src/tokenizers/pre_tokenizers/__init__.pyi @@ -421,8 +421,11 @@ class Split(PreTokenizer): Args: pattern (:obj:`str` or :class:`~tokenizers.Regex`): - A pattern used to split the string. Usually a string or a a regex built with `tokenizers.Regex` - + A pattern used to split the string. Usually a string or a regex built with `tokenizers.Regex`. + If you want to use a regex pattern, it has to be wrapped around a `tokenizers.Regex`, + otherwise we consider it as a string pattern. For example `pattern="|"` + means you want to split on `|` (imagine a csv file for example), while + `pattern=tokenizers.Regex("1|2")` means you split on either '1' or '2'. behavior (:class:`~tokenizers.SplitDelimiterBehavior`): The behavior to use when splitting. Choices: "removed", "isolated", "merged_with_previous", "merged_with_next",