From 91393ef75e3d6075a689a2898a5a844b369d1b09 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Wed, 17 Apr 2024 09:32:40 +0200 Subject: [PATCH] Fixing doc. (#1499) * Fixing doc. * SentencePieceUnigram and Convert.py still used sentencepiece * stub --------- Co-authored-by: Arthur Zucker --- bindings/python/py_src/tokenizers/decoders/__init__.pyi | 4 +++- .../tokenizers/implementations/sentencepiece_unigram.py | 5 +++-- .../python/py_src/tokenizers/pre_tokenizers/__init__.pyi | 5 ++++- bindings/python/scripts/convert.py | 6 +++--- bindings/python/src/decoders.rs | 4 +++- bindings/python/src/pre_tokenizers.rs | 5 ++++- bindings/python/tests/bindings/test_pre_tokenizers.py | 2 -- 7 files changed, 20 insertions(+), 11 deletions(-) diff --git a/bindings/python/py_src/tokenizers/decoders/__init__.pyi b/bindings/python/py_src/tokenizers/decoders/__init__.pyi index 7d8e2334a..b967fbd14 100644 --- a/bindings/python/py_src/tokenizers/decoders/__init__.pyi +++ b/bindings/python/py_src/tokenizers/decoders/__init__.pyi @@ -152,9 +152,11 @@ class Metaspace(Decoder): The replacement character. Must be exactly one character. By default we use the `▁` (U+2581) meta symbol (Same as in SentencePiece). - add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`True`): + prepend_scheme (:obj:`str`, `optional`, defaults to :obj:`"always"`): Whether to add a space to the first word if there isn't already one. This lets us treat `hello` exactly like `say hello`. + Choices: "always", "never", "first". First means the space is only added on the first + token (relevant when special tokens are used or other pre_tokenizers are used). 
""" def __init__(self, replacement="▁", prepend_scheme="always", split=True): pass diff --git a/bindings/python/py_src/tokenizers/implementations/sentencepiece_unigram.py b/bindings/python/py_src/tokenizers/implementations/sentencepiece_unigram.py index 6eb3ceb05..1237e85eb 100644 --- a/bindings/python/py_src/tokenizers/implementations/sentencepiece_unigram.py +++ b/bindings/python/py_src/tokenizers/implementations/sentencepiece_unigram.py @@ -183,8 +183,9 @@ def from_spm(filename: str): ) else: tokenizer.normalizer = normalizers.Sequence([normalizers.Replace(Regex(" {2,}"), " ")]) - tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space) - tokenizer.decoder = decoders.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space) + prepend_scheme = "always" if add_prefix_space else "never" + tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme) + tokenizer.decoder = decoders.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme) parameters = { "model": "SentencePieceUnigram", diff --git a/bindings/python/py_src/tokenizers/pre_tokenizers/__init__.pyi b/bindings/python/py_src/tokenizers/pre_tokenizers/__init__.pyi index d2019e6fb..d81d3802b 100644 --- a/bindings/python/py_src/tokenizers/pre_tokenizers/__init__.pyi +++ b/bindings/python/py_src/tokenizers/pre_tokenizers/__init__.pyi @@ -270,9 +270,12 @@ class Metaspace(PreTokenizer): The replacement character. Must be exactly one character. By default we use the `▁` (U+2581) meta symbol (Same as in SentencePiece). - add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`True`): + prepend_scheme (:obj:`str`, `optional`, defaults to :obj:`"always"`): Whether to add a space to the first word if there isn't already one. This lets us treat `hello` exactly like `say hello`. + Choices: "always", "never", "first". 
First means the space is only added on the first + token (relevant when special tokens are used or other pre_tokenizers are used). + """ def __init__(self, replacement="_", prepend_scheme="always", split=True): pass diff --git a/bindings/python/scripts/convert.py b/bindings/python/scripts/convert.py index e6df5ad8a..50c13862b 100644 --- a/bindings/python/scripts/convert.py +++ b/bindings/python/scripts/convert.py @@ -102,9 +102,9 @@ def converted(self): tokenizer.normalizer = self.normalizer(self.proto) replacement = "▁" - add_prefix_space = True - tokenizer.pre_tokenizer = Metaspace(replacement=replacement, add_prefix_space=add_prefix_space) - tokenizer.decoder = decoders.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space) + prepend_scheme = "always" + tokenizer.pre_tokenizer = Metaspace(replacement=replacement, prepend_scheme=prepend_scheme) + tokenizer.decoder = decoders.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme) post_processor = self.post_processor(tokenizer) if post_processor: tokenizer.post_processor = post_processor diff --git a/bindings/python/src/decoders.rs b/bindings/python/src/decoders.rs index 1344aa3f8..ed21f3469 100644 --- a/bindings/python/src/decoders.rs +++ b/bindings/python/src/decoders.rs @@ -304,9 +304,11 @@ impl PyStrip { /// The replacement character. Must be exactly one character. By default we /// use the `▁` (U+2581) meta symbol (Same as in SentencePiece). /// -/// add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`True`): +/// prepend_scheme (:obj:`str`, `optional`, defaults to :obj:`"always"`): /// Whether to add a space to the first word if there isn't already one. This /// lets us treat `hello` exactly like `say hello`. +/// Choices: "always", "never", "first". First means the space is only added on the first +/// token (relevant when special tokens are used or other pre_tokenizers are used). 
#[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name = "Metaspace")] pub struct PyMetaspaceDec {} #[pymethods] diff --git a/bindings/python/src/pre_tokenizers.rs b/bindings/python/src/pre_tokenizers.rs index d6d332f05..a2bd9b39c 100644 --- a/bindings/python/src/pre_tokenizers.rs +++ b/bindings/python/src/pre_tokenizers.rs @@ -477,9 +477,12 @@ pub(crate) fn from_string(string: String) -> Result { /// The replacement character. Must be exactly one character. By default we /// use the `▁` (U+2581) meta symbol (Same as in SentencePiece). /// -/// add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`True`): +/// prepend_scheme (:obj:`str`, `optional`, defaults to :obj:`"always"`): /// Whether to add a space to the first word if there isn't already one. This /// lets us treat `hello` exactly like `say hello`. +/// Choices: "always", "never", "first". First means the space is only added on the first +/// token (relevant when special tokens are used or other pre_tokenizers are used). +/// #[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name = "Metaspace")] pub struct PyMetaspace {} #[pymethods] diff --git a/bindings/python/tests/bindings/test_pre_tokenizers.py b/bindings/python/tests/bindings/test_pre_tokenizers.py index e6b74548d..fda9adb2a 100644 --- a/bindings/python/tests/bindings/test_pre_tokenizers.py +++ b/bindings/python/tests/bindings/test_pre_tokenizers.py @@ -109,8 +109,6 @@ def test_can_modify(self): # Modify these pretok.replacement = "%" assert pretok.replacement == "%" - pretok.add_prefix_space = True - assert pretok.add_prefix_space == True pretok.prepend_scheme = "first" assert pretok.prepend_scheme == "first" pretok.split = True