diff --git a/bindings/python/py_src/tokenizers/implementations/sentencepiece_unigram.py b/bindings/python/py_src/tokenizers/implementations/sentencepiece_unigram.py index 3d26a85e5..57e0783c8 100644 --- a/bindings/python/py_src/tokenizers/implementations/sentencepiece_unigram.py +++ b/bindings/python/py_src/tokenizers/implementations/sentencepiece_unigram.py @@ -173,12 +173,15 @@ def from_spm(filename: str): tokenizer = Tokenizer(Unigram(vocab, unk_id, byte_fallback)) - tokenizer.normalizer = normalizers.Sequence( - [ - normalizers.Precompiled(precompiled_charsmap), - normalizers.Replace(Regex(" {2,}"), " "), - ] - ) + if precompiled_charsmap: + tokenizer.normalizer = normalizers.Sequence( + [ + normalizers.Precompiled(precompiled_charsmap), + normalizers.Replace(Regex(" {2,}"), " "), + ] + ) + else: + tokenizer.normalizer = normalizers.Sequence([normalizers.Replace(Regex(" {2,}"), " ")]) tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space) tokenizer.decoder = decoders.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space)