From eb4579cf43bf8911b5556506ad1379d566a2ef4d Mon Sep 17 00:00:00 2001
From: Arthur <48595927+ArthurZucker@users.noreply.github.com>
Date: Thu, 9 Jan 2025 15:34:43 +0100
Subject: [PATCH] `tokenizer` train from iterator without pre_tokenizers
 (#35396)

* fix if else issues

* add a test

* fix the test

* style
---
 src/transformers/tokenization_utils_fast.py   | 22 +++++++++----------
 tests/tokenization/test_tokenization_utils.py | 19 ++++++++++++++++
 2 files changed, 30 insertions(+), 11 deletions(-)

diff --git a/src/transformers/tokenization_utils_fast.py b/src/transformers/tokenization_utils_fast.py
index d1353adfd22589..cc7edbd5328523 100644
--- a/src/transformers/tokenization_utils_fast.py
+++ b/src/transformers/tokenization_utils_fast.py
@@ -813,17 +813,17 @@ def train_new_from_iterator(
                 kwargs["end_of_word_suffix"] = tokenizer_json["model"]["end_of_word_suffix"]
         if tokenizer_json["model"]["type"] == "Unigram" and unk_token is not None:
             kwargs["unk_token"] = unk_token
-        if (
-            tokenizer_json["pre_tokenizer"] is not None
-            and tokenizer_json["pre_tokenizer"]["type"] == "ByteLevel"
-            or tokenizer_json["pre_tokenizer"]["type"] == "Sequence"
-            and "pretokenizers" in tokenizer_json["pre_tokenizer"]
-            and any(
-                pretokenizer["type"] == "ByteLevel"
-                for pretokenizer in tokenizer_json["pre_tokenizer"]["pretokenizers"]
-            )
-        ):
-            kwargs["initial_alphabet"] = pre_tokenizers_fast.ByteLevel.alphabet()
+        if tokenizer_json["pre_tokenizer"] is not None:
+            if (
+                tokenizer_json["pre_tokenizer"]["type"] == "ByteLevel"
+                or tokenizer_json["pre_tokenizer"]["type"] == "Sequence"
+                and "pretokenizers" in tokenizer_json["pre_tokenizer"]
+                and any(
+                    pretokenizer["type"] == "ByteLevel"
+                    for pretokenizer in tokenizer_json["pre_tokenizer"]["pretokenizers"]
+                )
+            ):
+                kwargs["initial_alphabet"] = pre_tokenizers_fast.ByteLevel.alphabet()
 
         trainer_class = MODEL_TO_TRAINER_MAPPING[tokenizer_json["model"]["type"]]
         trainer = trainer_class(vocab_size=vocab_size, special_tokens=special_tokens, **kwargs)
diff --git a/tests/tokenization/test_tokenization_utils.py b/tests/tokenization/test_tokenization_utils.py
index 5171af67300813..a9f2b1cd9b75e9 100644
--- a/tests/tokenization/test_tokenization_utils.py
+++ b/tests/tokenization/test_tokenization_utils.py
@@ -48,6 +48,7 @@
 
 
 if is_tokenizers_available():
+    import tokenizers
     from tokenizers import Tokenizer
     from tokenizers.models import WordPiece
 
@@ -428,3 +429,21 @@ def test_sentencepiece_cohabitation(self):
         # Now this will try to import sentencepiece_model_pb2_new.py. This should not fail even if the protobuf
         # was already imported.
         import_protobuf()
+
+    def test_training_new_tokenizer_edge_cases(self):
+        _tokenizer = Tokenizer(tokenizers.models.BPE(vocab={"a": 1, "b": 2, "ab": 3}, merges=[("a", "b")]))
+        _tokenizer.pre_tokenizer = None
+
+        tokenizer = PreTrainedTokenizerFast(tokenizer_object=_tokenizer)
+        toy_text_iterator = ("a" for _ in range(1000))
+        tokenizer.train_new_from_iterator(text_iterator=toy_text_iterator, length=1000, vocab_size=50)
+
+        _tokenizer.normalizer = None
+        tokenizer = PreTrainedTokenizerFast(tokenizer_object=_tokenizer)
+        toy_text_iterator = ("a" for _ in range(1000))
+        tokenizer.train_new_from_iterator(text_iterator=toy_text_iterator, length=1000, vocab_size=50)
+
+        _tokenizer.post_processor = None
+        tokenizer = PreTrainedTokenizerFast(tokenizer_object=_tokenizer)
+        toy_text_iterator = ("a" for _ in range(1000))
+        tokenizer.train_new_from_iterator(text_iterator=toy_text_iterator, length=1000, vocab_size=50)
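
Usage note (not part of the patch): the snippet below is a minimal standalone sketch of the scenario the new test exercises, assuming `tokenizers` and a `transformers` build that includes this fix are installed; the variable names (`backend`, `fast_tokenizer`, `corpus`, `new_tokenizer`) are illustrative only.

# Minimal sketch (assumption: `tokenizers` and a patched `transformers` are installed).
import tokenizers
from tokenizers import Tokenizer
from transformers import PreTrainedTokenizerFast

# Toy BPE backend with no pre_tokenizer, mirroring the added test case.
backend = Tokenizer(tokenizers.models.BPE(vocab={"a": 1, "b": 2, "ab": 3}, merges=[("a", "b")]))
backend.pre_tokenizer = None

fast_tokenizer = PreTrainedTokenizerFast(tokenizer_object=backend)
corpus = ("a" for _ in range(1000))  # any iterator of strings works

# Before this fix, `and`/`or` precedence in the ByteLevel check meant
# tokenizer_json["pre_tokenizer"]["type"] was still evaluated when the
# pre_tokenizer was None, raising a TypeError; with the outer guard added
# by this patch, training proceeds normally.
new_tokenizer = fast_tokenizer.train_new_from_iterator(text_iterator=corpus, length=1000, vocab_size=50)
print(len(new_tokenizer))  # size of the retrained vocabulary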