diff --git a/src/transformers/models/llama/tokenization_llama.py b/src/transformers/models/llama/tokenization_llama.py index d357616d5fc78e..dcf1d8660f04db 100644 --- a/src/transformers/models/llama/tokenization_llama.py +++ b/src/transformers/models/llama/tokenization_llama.py @@ -72,11 +72,12 @@ class LlamaTokenizer(PreTrainedTokenizer): vocab_file (`str`): Path to the vocabulary file. unk_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `""`): - A special token representing an out-of-vocabulary token. + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. bos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `""`): - A special token representing the beginning of a sentence. + The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token. eos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `""`): - A special token representing the end of a sentence. + The end of sequence token. pad_token (`str` or `tokenizers.AddedToken`, *optional*): A special token used to make arrays of tokens the same size for batching purpose. Will then be ignored by attention mechanisms or loss computation. diff --git a/src/transformers/models/llama/tokenization_llama_fast.py b/src/transformers/models/llama/tokenization_llama_fast.py index bf433fd3b9be86..229272e0045f43 100644 --- a/src/transformers/models/llama/tokenization_llama_fast.py +++ b/src/transformers/models/llama/tokenization_llama_fast.py @@ -84,7 +84,7 @@ class LlamaTokenizerFast(PreTrainedTokenizerFast): tokenizer_file (`str`, *optional*): [tokenizers](https://github.com/huggingface/tokenizers) file (generally has a .json extension) that contains everything needed to load the tokenizer. 
- clean_up_tokenization_spaces (`str`, *optional*, defaults to `False`): + clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`): Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like extra spaces. unk_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `""`):