From 6c6066a76027f8f3b582072e937144ccf74568a5 Mon Sep 17 00:00:00 2001
From: Vijay
Date: Mon, 28 Oct 2024 21:44:07 +0530
Subject: [PATCH] [docs] update input documentation for MAMBA2 and MISTRAL
 models to include cache_position and attention_mask details (#34322)

* [docs] update input documentation for MAMBA2 and MISTRAL models to include
  cache_position and attention_mask details

* [docs] correct input documentation for MISTRAL model to reference
  `input_ids` instead of `decoder_input_ids`

* [docs] clarify cache_position description in MISTRAL model documentation
---
 src/transformers/models/mamba2/modeling_mamba2.py   | 10 ++++++++++
 src/transformers/models/mistral/modeling_mistral.py |  6 +++++-
 2 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/src/transformers/models/mamba2/modeling_mamba2.py b/src/transformers/models/mamba2/modeling_mamba2.py
index 110ae09a388704..c312b9b94351d2 100644
--- a/src/transformers/models/mamba2/modeling_mamba2.py
+++ b/src/transformers/models/mamba2/modeling_mamba2.py
@@ -805,6 +805,16 @@ class Mamba2CausalLMOutput(ModelOutput):
             more detail.
         return_dict (`bool`, *optional*):
             Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+        cache_position (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            The position of the current input in the cache. This is used to ensure that the cache is correctly updated.
+            If `cache_params` is passed, `cache_position` should also be passed.
+        attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
 """


diff --git a/src/transformers/models/mistral/modeling_mistral.py b/src/transformers/models/mistral/modeling_mistral.py
index 321d3dc0daf378..3b0fb75a4cb3ba 100644
--- a/src/transformers/models/mistral/modeling_mistral.py
+++ b/src/transformers/models/mistral/modeling_mistral.py
@@ -619,7 +619,7 @@ def _init_weights(self, module):
             Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
             [`PreTrainedTokenizer.__call__`] for details.

-            If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
+            If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
             `past_key_values`).

             If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
@@ -666,6 +666,10 @@ def _init_weights(self, module):
             more detail.
         return_dict (`bool`, *optional*):
             Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+        cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
+            Indices indicating the position of the input sequence tokens in the sequence. Unlike `position_ids`,
+            this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
+            the complete sequence length.
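
As a reviewer aid, here is a minimal sketch of the `cache_params`/`cache_position` contract the MAMBA2 hunk documents: a manual prefill followed by one decode step. The checkpoint name is a placeholder, and `generate` normally manages `cache_position` itself, so treat this as an illustration of the documented arguments rather than reference usage.

```python
import torch
from transformers import AutoTokenizer, Mamba2ForCausalLM

# Placeholder checkpoint: substitute any Mamba2 checkpoint you actually use.
model_id = "<your-mamba2-checkpoint>"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = Mamba2ForCausalLM.from_pretrained(model_id)

inputs = tokenizer("The capital of France is", return_tensors="pt")
prompt_len = inputs.input_ids.shape[1]

# Prefill: no cache is passed in, so the model builds one and returns it
# (the attention_mask from the tokenizer handles any padding).
out = model(**inputs, use_cache=True)

# Decode step: feed only the new token, and pass `cache_position` together
# with `cache_params`, as the docstring requires, so the cached conv/SSM
# states are advanced at the right position.
next_token = out.logits[:, -1].argmax(dim=-1, keepdim=True)
step = model(
    input_ids=next_token,
    cache_params=out.cache_params,
    cache_position=torch.tensor([prompt_len]),
    use_cache=True,
)
```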
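
Similarly, a sketch of what the MISTRAL `cache_position` description means in practice: positions are absolute indices into the sequence, independent of padding, covering the whole prompt at prefill and only the new slot at each decode step. The checkpoint name is just an example; any Mistral checkpoint behaves the same way.

```python
import torch
from transformers import AutoTokenizer, MistralForCausalLM

model_id = "mistralai/Mistral-7B-v0.1"  # example checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = MistralForCausalLM.from_pretrained(model_id)

inputs = tokenizer("Paris is the capital of", return_tensors="pt")
prompt_len = inputs.input_ids.shape[1]

# Prefill: cache_position spans every prompt token (0 .. prompt_len - 1).
out = model(**inputs, use_cache=True, cache_position=torch.arange(prompt_len))

# Decode step: only the last `input_ids` token is fed (see the corrected
# docstring), and cache_position points at the single index it occupies so
# the KV cache is written to the right slot.
next_token = out.logits[:, -1].argmax(dim=-1, keepdim=True)
step = model(
    input_ids=next_token,
    past_key_values=out.past_key_values,
    use_cache=True,
    cache_position=torch.tensor([prompt_len]),
)
```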