diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index cd5dd9435bece..06e3016cc9e5c 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -529,8 +529,10 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]:
                 # used for `\n` / `\t` have been manually added in the added tokens
                 # To avoid unexpected issues - we make sure to encode single-char tokens
                 if len(token) == 1:
-                    logger.info("Ecode-Decode special characters using AutoTokenizer")
+                    previous_token = token
                     token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))
+                    if previous_token != token:
+                        logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer")
 
                 if tokenizer.added_tokens_decoder[i].special or self.does_token_look_special(token):
                     toktypes.append(gguf.TokenType.CONTROL)
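
For reference, a minimal standalone sketch of the encode/decode round trip whose result the patch now logs. It assumes a Hugging Face tokenizer can be loaded locally; the model path and the example token are placeholders, not part of the patch.

    # Illustrative sketch only: reproduces the round trip from the hunk above.
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("path/to/hf-model")  # placeholder path

    token = "\n"  # example single-char added token
    previous_token = token
    token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))

    # Only report when the round trip actually changed the token, as the patch does.
    if previous_token != token:
        print(f"{previous_token!r} is encoded and decoded back to {token!r} using AutoTokenizer")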