tokenizer train from iterator without pre_tokenizers (#35396)
* fix if else issues

* add a test

* fix the test

* style
ArthurZucker authored Jan 9, 2025
Parent: 320512d · Commit: eb4579c
Showing 2 changed files with 30 additions and 11 deletions.
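The "fix if else issues" bullet in the commit message refers to Python's boolean-operator precedence in the old condition: `A and B or C and D` groups as `(A and B) or (C and D)`, so when `tokenizer_json["pre_tokenizer"]` is None the right-hand branch still subscripts None and raises a TypeError. A minimal, self-contained sketch of the pitfall and of the shape of the fix (the `pre_tok` name below is illustrative, not from the repository):

# Minimal sketch of the precedence pitfall (illustrative names, not from the repo).
# `A and B or C` groups as `(A and B) or C`, so `C` is still evaluated when
# `pre_tok` is None, and subscripting None raises TypeError.
pre_tok = None  # stands in for tokenizer_json["pre_tokenizer"] when no pre_tokenizer is set

try:
    if pre_tok is not None and pre_tok["type"] == "ByteLevel" or pre_tok["type"] == "Sequence":
        print("byte-level branch")
except TypeError as err:
    print(err)  # 'NoneType' object is not subscriptable

# The fix guards the None case first, then inspects the type.
if pre_tok is not None:
    if pre_tok["type"] in ("ByteLevel", "Sequence"):
        print("byte-level branch")

The new test added in tests/tokenization/test_tokenization_utils.py exercises exactly this path by setting pre_tokenizer (and then normalizer and post_processor) to None before calling train_new_from_iterator.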
22 changes: 11 additions & 11 deletions src/transformers/tokenization_utils_fast.py
@@ -813,17 +813,17 @@ def train_new_from_iterator(
                 kwargs["end_of_word_suffix"] = tokenizer_json["model"]["end_of_word_suffix"]
         if tokenizer_json["model"]["type"] == "Unigram" and unk_token is not None:
             kwargs["unk_token"] = unk_token
-        if (
-            tokenizer_json["pre_tokenizer"] is not None
-            and tokenizer_json["pre_tokenizer"]["type"] == "ByteLevel"
-            or tokenizer_json["pre_tokenizer"]["type"] == "Sequence"
-            and "pretokenizers" in tokenizer_json["pre_tokenizer"]
-            and any(
-                pretokenizer["type"] == "ByteLevel"
-                for pretokenizer in tokenizer_json["pre_tokenizer"]["pretokenizers"]
-            )
-        ):
-            kwargs["initial_alphabet"] = pre_tokenizers_fast.ByteLevel.alphabet()
+        if tokenizer_json["pre_tokenizer"] is not None:
+            if (
+                tokenizer_json["pre_tokenizer"]["type"] == "ByteLevel"
+                or tokenizer_json["pre_tokenizer"]["type"] == "Sequence"
+                and "pretokenizers" in tokenizer_json["pre_tokenizer"]
+                and any(
+                    pretokenizer["type"] == "ByteLevel"
+                    for pretokenizer in tokenizer_json["pre_tokenizer"]["pretokenizers"]
+                )
+            ):
+                kwargs["initial_alphabet"] = pre_tokenizers_fast.ByteLevel.alphabet()
 
         trainer_class = MODEL_TO_TRAINER_MAPPING[tokenizer_json["model"]["type"]]
         trainer = trainer_class(vocab_size=vocab_size, special_tokens=special_tokens, **kwargs)
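When a ByteLevel pre-tokenizer is detected (either directly or nested inside a Sequence), the guarded branch seeds the trainer's `initial_alphabet` with the 256 byte-level symbols so the retrained vocabulary can encode any input byte. A quick sketch of what that call returns, assuming the `tokenizers` library is installed:

# Sketch: the ByteLevel alphabet that the guarded branch passes as `initial_alphabet`.
from tokenizers import pre_tokenizers

alphabet = pre_tokenizers.ByteLevel.alphabet()
print(len(alphabet))  # 256 symbols, one per possible byte value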
19 changes: 19 additions & 0 deletions tests/tokenization/test_tokenization_utils.py
@@ -48,6 +48,7 @@
 
 
 if is_tokenizers_available():
+    import tokenizers
     from tokenizers import Tokenizer
     from tokenizers.models import WordPiece
 
@@ -428,3 +429,21 @@ def test_sentencepiece_cohabitation(self):
         # Now this will try to import sentencepiece_model_pb2_new.py. This should not fail even if the protobuf
         # was already imported.
         import_protobuf()
+
+    def test_training_new_tokenizer_edge_cases(self):
+        _tokenizer = Tokenizer(tokenizers.models.BPE(vocab={"a": 1, "b": 2, "ab": 3}, merges=[("a", "b")]))
+        _tokenizer.pre_tokenizer = None
+
+        tokenizer = PreTrainedTokenizerFast(tokenizer_object=_tokenizer)
+        toy_text_iterator = ("a" for _ in range(1000))
+        tokenizer.train_new_from_iterator(text_iterator=toy_text_iterator, length=1000, vocab_size=50)
+
+        _tokenizer.normalizer = None
+        tokenizer = PreTrainedTokenizerFast(tokenizer_object=_tokenizer)
+        toy_text_iterator = ("a" for _ in range(1000))
+        tokenizer.train_new_from_iterator(text_iterator=toy_text_iterator, length=1000, vocab_size=50)
+
+        _tokenizer.post_processor = None
+        tokenizer = PreTrainedTokenizerFast(tokenizer_object=_tokenizer)
+        toy_text_iterator = ("a" for _ in range(1000))
+        tokenizer.train_new_from_iterator(text_iterator=toy_text_iterator, length=1000, vocab_size=50)