diff --git a/src/transformers/models/code_llama/tokenization_code_llama.py b/src/transformers/models/code_llama/tokenization_code_llama.py index 1dbe6731852eed..964e68c9611379 100644 --- a/src/transformers/models/code_llama/tokenization_code_llama.py +++ b/src/transformers/models/code_llama/tokenization_code_llama.py @@ -68,6 +68,11 @@ class CodeLlamaTokenizer(PreTrainedTokenizer): Args: vocab_file (`str`): Path to the vocabulary file. + unk_token (`str`, *optional*, defaults to `""`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + bos_token (`str`, *optional*, defaults to `""`): + The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token. eos_token (`str`, *optional*, defaults to `""`): The end of sequence token. @@ -78,23 +83,18 @@ class CodeLlamaTokenizer(PreTrainedTokenizer): - unk_token (`str`, *optional*, defaults to `""`): - The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this - token instead. prefix_token (`str`, *optional*, defaults to `"▁
"`):
             Prefix token used for infilling.
-        suffix_token (`str`, *optional*, defaults to `"▁"`):
-            Suffix token used for infilling.
         middle_token (`str`, *optional*, defaults to `"▁"`):
             Middle token used for infilling.
+        suffix_token (`str`, *optional*, defaults to `"▁"`):
+            Suffix token used for infilling.
         eot_token (`str`, *optional*, defaults to `"▁"`):
             End of text token used for infilling.
         fill_token (`str`, *optional*, defaults to `""`):
             The token used to split the input between the prefix and suffix.
-        suffix_first (`bool`, *optional*, default to `False`):
+        suffix_first (`bool`, *optional*, defaults to `False`):
             Whether the input prompt and suffix should be formatted with the suffix first.
-        additional_special_tokens (`List[str]`, *optional*):
-            Additional special tokens used by the tokenizer.
         sp_model_kwargs (`dict`, *optional*):
             Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
             SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
@@ -110,6 +110,14 @@ class CodeLlamaTokenizer(PreTrainedTokenizer):
 
             - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
               BPE-dropout.
+        add_bos_token (`bool`, *optional*, defaults to `True`):
+            Whether to add a beginning of sequence token at the start of sequences.
+        add_eos_token (`bool`, *optional*, defaults to `False`):
+            Whether to add an end of sequence token at the end of sequences.
+        clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
+            Whether or not to clean up the tokenization spaces.
+        additional_special_tokens (`List[str]`, *optional*):
+            Additional special tokens used by the tokenizer.
         use_default_system_prompt (`bool`, *optional*, defaults to `False`):
             Whether or not the default system prompt for Llama should be used.
     """
diff --git a/utils/check_docstrings.py b/utils/check_docstrings.py
index c86d0ae8e64c23..ef4562aa9a1e94 100644
--- a/utils/check_docstrings.py
+++ b/utils/check_docstrings.py
@@ -132,7 +132,6 @@
     "CodeGenConfig",
     "CodeGenTokenizer",
     "CodeGenTokenizerFast",
-    "CodeLlamaTokenizer",
     "CodeLlamaTokenizerFast",
     "ConditionalDetrConfig",
     "ConditionalDetrImageProcessor",