From d34fa3586868727757a41300df9c4aa98db3d7b8 Mon Sep 17 00:00:00 2001
From: Minho Ryang
Date: Wed, 11 Oct 2023 23:26:00 +0900
Subject: [PATCH] [docstring] Fix docstring typo at `LlamaTokenizer` and `LlamaTokenizerFast`

---
 src/transformers/models/llama/tokenization_llama.py      | 7 ++++---
 src/transformers/models/llama/tokenization_llama_fast.py | 2 +-
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/transformers/models/llama/tokenization_llama.py b/src/transformers/models/llama/tokenization_llama.py
index d357616d5fc78e..dcf1d8660f04db 100644
--- a/src/transformers/models/llama/tokenization_llama.py
+++ b/src/transformers/models/llama/tokenization_llama.py
@@ -72,11 +72,12 @@ class LlamaTokenizer(PreTrainedTokenizer):
         vocab_file (`str`):
             Path to the vocabulary file.
         unk_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<unk>"`):
-            A special token representing an out-of-vocabulary token.
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
         bos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<s>"`):
-            A special token representing the beginning of a sentence.
+            The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
         eos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"</s>"`):
-            A special token representing the end of a sentence.
+            The end of sequence token.
         pad_token (`str` or `tokenizers.AddedToken`, *optional*):
             A special token used to make arrays of tokens the same size for batching purpose. Will then be ignored by
             attention mechanisms or loss computation.
diff --git a/src/transformers/models/llama/tokenization_llama_fast.py b/src/transformers/models/llama/tokenization_llama_fast.py
index bf433fd3b9be86..229272e0045f43 100644
--- a/src/transformers/models/llama/tokenization_llama_fast.py
+++ b/src/transformers/models/llama/tokenization_llama_fast.py
@@ -84,7 +84,7 @@ class LlamaTokenizerFast(PreTrainedTokenizerFast):
         tokenizer_file (`str`, *optional*):
             [tokenizers](https://github.com/huggingface/tokenizers) file (generally has a .json extension) that
             contains everything needed to load the tokenizer.
-        clean_up_tokenization_spaces (`str`, *optional*, defaults to `False`):
+        clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
             Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like
             extra spaces.
         unk_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<unk>"`):
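
For context, a minimal sketch of how the parameters touched by these docstring fixes surface at runtime. This snippet is not part of the patch; the checkpoint name below is only an illustrative assumption, and any Llama checkpoint that ships a tokenizer should behave the same way.

# A minimal sketch, assuming "huggyllama/llama-7b" is available; swap in any Llama checkpoint.
from transformers import LlamaTokenizer, LlamaTokenizerFast

tok = LlamaTokenizer.from_pretrained("huggyllama/llama-7b")
# The special-token defaults the docstrings describe.
print(tok.unk_token, tok.bos_token, tok.eos_token)  # expected: <unk> <s> </s>

# A token that is not in the vocabulary maps to the unk token id instead of failing.
print(tok.convert_tokens_to_ids("not-a-real-piece") == tok.unk_token_id)

# clean_up_tokenization_spaces is a bool (default False), as the corrected type hint says.
fast_tok = LlamaTokenizerFast.from_pretrained(
    "huggyllama/llama-7b", clean_up_tokenization_spaces=False
)
ids = fast_tok("Hello world").input_ids
print(fast_tok.decode(ids))  # typically "<s> Hello world", since bos is prepended by default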