Skip to content

Commit

Permalink
[CodeLlamaTokenizer] Nit, update __init__ to make sure the AddedTok…
Browse files Browse the repository at this point in the history
…ens are not normalized because they are special (huggingface#27359)

* make sure tokens are properly initialized for codellama slow

* add m ore pretrained models

* style

* test more tokenizers checkpoints
  • Loading branch information
ArthurZucker authored and EduardoPach committed Nov 19, 2023
1 parent 5f18f03 commit 541cc06
Show file tree
Hide file tree
Showing 2 changed files with 5 additions and 3 deletions.
6 changes: 3 additions & 3 deletions src/transformers/models/code_llama/tokenization_code_llama.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,9 +149,9 @@ def __init__(
):
requires_backends(self, "protobuf")
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
bos_token = AddedToken(bos_token, normalized=False, special=True) if isinstance(bos_token, str) else bos_token
eos_token = AddedToken(eos_token, normalized=False, special=True) if isinstance(eos_token, str) else eos_token
unk_token = AddedToken(unk_token, normalized=False, special=True) if isinstance(unk_token, str) else unk_token

self.use_default_system_prompt = use_default_system_prompt
# mark tokens special to skip them
Expand Down
2 changes: 2 additions & 0 deletions tests/models/code_llama/test_tokenization_code_llama.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,8 @@ def test_save_pretrained(self):
self.tokenizers_list = [
(self.rust_tokenizer_class, "hf-internal-testing/llama-code-tokenizer", {}),
(self.tokenizer_class, "hf-internal-testing/llama-code-tokenizer", {}),
(self.tokenizer_class, "codellama/CodeLlama-34b-Instruct-hf", {}),
(self.rust_tokenizer_class, "codellama/CodeLlama-34b-Instruct-hf", {}),
]
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
Expand Down

0 comments on commit 541cc06

Please sign in to comment.