
Commit

[docstring] Fix docstring typo at LlamaTokenizer and `LlamaTokenizerFast`
minhoryang committed Oct 11, 2023
1 parent 1231d41 commit d34fa35
Showing 2 changed files with 5 additions and 4 deletions.
src/transformers/models/llama/tokenization_llama.py (7 changes: 4 additions & 3 deletions)
@@ -72,11 +72,12 @@ class LlamaTokenizer(PreTrainedTokenizer):
         vocab_file (`str`):
             Path to the vocabulary file.
         unk_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<unk>"`):
-            A special token representing an out-of-vocabulary token.
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
         bos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<s>"`):
-            A special token representing the beginning of a sentence.
+            The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
         eos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"</s>"`):
-            A special token representing the end of a sentence.
+            The end of sequence token.
         pad_token (`str` or `tokenizers.AddedToken`, *optional*):
             A special token used to make arrays of tokens the same size for batching purpose. Will then be ignored by
             attention mechanisms or loss computation.
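For reference, a minimal sketch of how the parameters documented above are passed when constructing the slow tokenizer. This is not part of the commit, and the vocabulary path is a placeholder for a real SentencePiece model file:

    from transformers import LlamaTokenizer

    tokenizer = LlamaTokenizer(
        vocab_file="path/to/tokenizer.model",  # placeholder: a SentencePiece vocab file
        unk_token="<unk>",   # out-of-vocabulary text is mapped to this token's ID
        bos_token="<s>",     # beginning-of-sequence token used during pretraining
        eos_token="</s>",    # end-of-sequence token
        pad_token=None,      # unset by default; assign one before padding batches
    )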
src/transformers/models/llama/tokenization_llama_fast.py (2 changes: 1 addition & 1 deletion)
@@ -84,7 +84,7 @@ class LlamaTokenizerFast(PreTrainedTokenizerFast):
         tokenizer_file (`str`, *optional*):
             [tokenizers](https://github.com/huggingface/tokenizers) file (generally has a .json extension) that
             contains everything needed to load the tokenizer.
-        clean_up_tokenization_spaces (`str`, *optional*, defaults to `False`):
+        clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
             Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like
             extra spaces.
         unk_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<unk>"`):
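The corrected annotation reflects how the flag is actually consumed: clean_up_tokenization_spaces is a boolean, read both at construction and at decode time. A short sketch under that assumption; the checkpoint name below is illustrative, not taken from this commit:

    from transformers import LlamaTokenizerFast

    tok = LlamaTokenizerFast.from_pretrained(
        "hf-internal-testing/llama-tokenizer",  # illustrative repo with a LLaMA tokenizer
        clean_up_tokenization_spaces=False,     # bool, not str: keep decoded text as-is
    )
    ids = tok("Hello world").input_ids
    # The same boolean can also be overridden per call when decoding.
    print(tok.decode(ids, clean_up_tokenization_spaces=False))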
