diff --git a/docs/source/en/internal/generation_utils.md b/docs/source/en/internal/generation_utils.md index ce7fcc5319d13e..b4531e9c957c9f 100644 --- a/docs/source/en/internal/generation_utils.md +++ b/docs/source/en/internal/generation_utils.md @@ -45,7 +45,7 @@ inputs = tokenizer("Hello, my dog is cute and ", return_tensors="pt") generation_output = model.generate(**inputs, return_dict_in_generate=True, output_scores=True) ``` -The `generation_output` object is a [`~generation.GreedySearchDecoderOnlyOutput`], as we can +The `generation_output` object is a [`~generation.GenerateDecoderOnlyOutput`], as we can see in the documentation of that class below, it means it has the following attributes: - `sequences`: the generated sequences of tokens @@ -77,25 +77,13 @@ We document here all output types. ### PyTorch -[[autodoc]] generation.GreedySearchEncoderDecoderOutput +[[autodoc]] generation.GenerateDecoderOnlyOutput -[[autodoc]] generation.GreedySearchDecoderOnlyOutput +[[autodoc]] generation.GenerateEncoderDecoderOutput -[[autodoc]] generation.SampleEncoderDecoderOutput +[[autodoc]] generation.GenerateBeamDecoderOnlyOutput -[[autodoc]] generation.SampleDecoderOnlyOutput - -[[autodoc]] generation.BeamSearchEncoderDecoderOutput - -[[autodoc]] generation.BeamSearchDecoderOnlyOutput - -[[autodoc]] generation.BeamSampleEncoderDecoderOutput - -[[autodoc]] generation.BeamSampleDecoderOnlyOutput - -[[autodoc]] generation.ContrastiveSearchEncoderDecoderOutput - -[[autodoc]] generation.ContrastiveSearchDecoderOnlyOutput +[[autodoc]] generation.GenerateBeamEncoderDecoderOutput ### TensorFlow diff --git a/docs/source/ja/internal/generation_utils.md b/docs/source/ja/internal/generation_utils.md index df3860410bc676..96624971104d2b 100644 --- a/docs/source/ja/internal/generation_utils.md +++ b/docs/source/ja/internal/generation_utils.md @@ -45,7 +45,7 @@ inputs = tokenizer("Hello, my dog is cute and ", return_tensors="pt") generation_output = model.generate(**inputs, return_dict_in_generate=True, output_scores=True) ``` -`generation_output` オブジェクトは、できる限り [`~generation.GreedySearchDecoderOnlyOutput`] です。 +`generation_output` オブジェクトは、できる限り [`~generation.GenerateDecoderOnlyOutput`] です。 以下のそのクラスのドキュメントを参照してください。これは、次の属性があることを意味します。 - `sequences`: 生成されたトークンのシーケンス @@ -76,25 +76,13 @@ generation_output[:2] ### PyTorch -[[autodoc]] generation.GreedySearchEncoderDecoderOutput +[[autodoc]] generation.GenerateDecoderOnlyOutput -[[autodoc]] generation.GreedySearchDecoderOnlyOutput +[[autodoc]] generation.GenerateEncoderDecoderOutput -[[autodoc]] generation.SampleEncoderDecoderOutput +[[autodoc]] generation.GenerateBeamDecoderOnlyOutput -[[autodoc]] generation.SampleDecoderOnlyOutput - -[[autodoc]] generation.BeamSearchEncoderDecoderOutput - -[[autodoc]] generation.BeamSearchDecoderOnlyOutput - -[[autodoc]] generation.BeamSampleEncoderDecoderOutput - -[[autodoc]] generation.BeamSampleDecoderOnlyOutput - -[[autodoc]] generation.ContrastiveSearchEncoderDecoderOutput - -[[autodoc]] generation.ContrastiveSearchDecoderOnlyOutput +[[autodoc]] generation.GenerateBeamEncoderDecoderOutput ### TensorFlow diff --git a/docs/source/zh/internal/generation_utils.md b/docs/source/zh/internal/generation_utils.md index d8013ac87dcb21..a8e191f1ca9978 100644 --- a/docs/source/zh/internal/generation_utils.md +++ b/docs/source/zh/internal/generation_utils.md @@ -43,7 +43,7 @@ inputs = tokenizer("Hello, my dog is cute and ", return_tensors="pt") generation_output = model.generate(**inputs, return_dict_in_generate=True, output_scores=True) 
```

-`generation_output` 的对象是 [`~generation.GreedySearchDecoderOnlyOutput`] 的一个实例，从该类的文档中我们可以看到，这意味着它具有以下属性：
+`generation_output` 的对象是 [`~generation.GenerateDecoderOnlyOutput`] 的一个实例，从该类的文档中我们可以看到，这意味着它具有以下属性：
 
 - `sequences`: 生成的tokens序列
 - `scores`(可选): 每个生成步骤的语言建模头的预测分数
@@ -70,25 +70,13 @@ generation_output[:2]
 
 ### PyTorch
 
-[[autodoc]] generation.GreedySearchEncoderDecoderOutput
+[[autodoc]] generation.GenerateDecoderOnlyOutput
 
-[[autodoc]] generation.GreedySearchDecoderOnlyOutput
+[[autodoc]] generation.GenerateEncoderDecoderOutput
 
-[[autodoc]] generation.SampleEncoderDecoderOutput
+[[autodoc]] generation.GenerateBeamDecoderOnlyOutput
 
-[[autodoc]] generation.SampleDecoderOnlyOutput
-
-[[autodoc]] generation.BeamSearchEncoderDecoderOutput
-
-[[autodoc]] generation.BeamSearchDecoderOnlyOutput
-
-[[autodoc]] generation.BeamSampleEncoderDecoderOutput
-
-[[autodoc]] generation.BeamSampleDecoderOnlyOutput
-
-[[autodoc]] generation.ContrastiveSearchEncoderDecoderOutput
-
-[[autodoc]] generation.ContrastiveSearchDecoderOnlyOutput
+[[autodoc]] generation.GenerateBeamEncoderDecoderOutput
 
 ### TensorFlow
diff --git a/src/transformers/generation/__init__.py b/src/transformers/generation/__init__.py
index a46cb4fa910ada..d1e81cffca67ed 100644
--- a/src/transformers/generation/__init__.py
+++ b/src/transformers/generation/__init__.py
@@ -94,6 +94,10 @@
         "BeamSampleDecoderOnlyOutput",
         "ContrastiveSearchEncoderDecoderOutput",
         "ContrastiveSearchDecoderOnlyOutput",
+        "GenerateBeamDecoderOnlyOutput",
+        "GenerateBeamEncoderDecoderOutput",
+        "GenerateDecoderOnlyOutput",
+        "GenerateEncoderDecoderOutput",
     ]
 
 try:
@@ -222,6 +226,10 @@
         BeamSearchEncoderDecoderOutput,
         ContrastiveSearchDecoderOnlyOutput,
         ContrastiveSearchEncoderDecoderOutput,
+        GenerateBeamDecoderOnlyOutput,
+        GenerateBeamEncoderDecoderOutput,
+        GenerateDecoderOnlyOutput,
+        GenerateEncoderDecoderOutput,
         GenerationMixin,
         GreedySearchDecoderOnlyOutput,
         GreedySearchEncoderDecoderOutput,
diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py
index f36f76a27a390a..9df52bee165685 100644
--- a/src/transformers/generation/utils.py
+++ b/src/transformers/generation/utils.py
@@ -94,10 +94,9 @@
 @dataclass
-class GreedySearchDecoderOnlyOutput(ModelOutput):
+class GenerateDecoderOnlyOutput(ModelOutput):
     """
-    Base class for outputs of decoder-only generation models using greedy search.
-
+    Outputs of decoder-only generation models, when using non-beam methods.
 
     Args:
         sequences (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
@@ -130,9 +129,9 @@ class GreedySearchDecoderOnlyOutput(ModelOutput):
 @dataclass
-class ContrastiveSearchEncoderDecoderOutput(ModelOutput):
+class GenerateEncoderDecoderOutput(ModelOutput):
     """
-    Base class for outputs of decoder-only generation models using contrastive search.
+    Outputs of encoder-decoder generation models, when using non-beam methods.
 
     Args:
         sequences (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
@@ -177,184 +176,9 @@
 @dataclass
-class ContrastiveSearchDecoderOnlyOutput(ModelOutput):
+class GenerateBeamDecoderOnlyOutput(ModelOutput):
     """
-    Base class for outputs of decoder-only generation models using contrastive search.
-
-    Args:
-        sequences (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
-            The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter
-            if all batches finished early due to the `eos_token_id`.
- scores (`tuple(torch.FloatTensor)` *optional*, returned when `output_scores=True` is passed or when - `config.output_scores=True`): - Processed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax) - at each generation step. Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for - each generated token), with each tensor of shape `(batch_size, config.vocab_size)`. - attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`): - Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of - `torch.FloatTensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`. - hidden_states (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_hidden_states=True` is - passed or when `config.output_hidden_states=True`): Tuple (one element for each generated token) of tuples - (one element for each layer of the decoder) of `torch.FloatTensor` of shape `(batch_size, generated_length, - hidden_size)`. - past_key_values (`tuple(tuple(torch.FloatTensor)))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): - NOTE: some models have a different `past_key_values` format, confirm with the model's documentation. - Usually a Tuple (one element for each layer of the decoder) of tuples (two elements, key tensor and value - tensor). The first Tuple is of length `config.n_layers`, with each tuple having 2 tensors of shape - `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if - `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads, - encoder_sequence_length, embed_size_per_head)`. - """ - - sequences: torch.LongTensor = None - scores: Optional[Tuple[torch.FloatTensor]] = None - attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None - hidden_states: Optional[Tuple[Tuple[torch.FloatTensor]]] = None - past_key_values: Optional[Tuple[Tuple[Tuple[torch.FloatTensor]]]] = None - - -@dataclass -class GreedySearchEncoderDecoderOutput(ModelOutput): - """ - Base class for outputs of encoder-decoder generation models using greedy search. Hidden states and attention - weights of the decoder (respectively the encoder) can be accessed via the encoder_attentions and the - encoder_hidden_states attributes (respectively the decoder_attentions and the decoder_hidden_states attributes) - - - Args: - sequences (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter - if all batches finished early due to the `eos_token_id`. - scores (`tuple(torch.FloatTensor)` *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`): - Processed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax) - at each generation step. Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for - each generated token), with each tensor of shape `(batch_size, config.vocab_size)`. - encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer of the decoder) of shape `(batch_size, num_heads, - sequence_length, sequence_length)`. 
- encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of - shape `(batch_size, sequence_length, hidden_size)`. - decoder_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`): - Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of - `torch.FloatTensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`. - cross_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`): - Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of - `torch.FloatTensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`. - decoder_hidden_states (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of - `torch.FloatTensor` of shape `(batch_size, generated_length, hidden_size)`. - past_key_values (`tuple(tuple(torch.FloatTensor)))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): - NOTE: some models have a different `past_key_values` format, confirm with the model's documentation. - Usually a Tuple (one element for each layer of the decoder) of tuples (two elements, key tensor and value - tensor). The first Tuple is of length `config.n_layers`, with each tuple having 2 tensors of shape - `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if - `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads, - encoder_sequence_length, embed_size_per_head)`. - """ - - sequences: torch.LongTensor = None - scores: Optional[Tuple[torch.FloatTensor]] = None - encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None - encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None - decoder_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None - cross_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None - decoder_hidden_states: Optional[Tuple[Tuple[torch.FloatTensor]]] = None - past_key_values: Optional[Tuple[Tuple[Tuple[torch.FloatTensor]]]] = None - - -@dataclass -class SampleDecoderOnlyOutput(ModelOutput): - """ - Base class for outputs of decoder-only generation models using sampling. - - - Args: - sequences (`torch.LongTensor` of shape `(batch_size*num_return_sequences, sequence_length)`): - The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter - if all batches finished early due to the `eos_token_id`. - scores (`tuple(torch.FloatTensor)` *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`): - Processed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax) - at each generation step. Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for - each generated token), with each tensor of shape `(batch_size*num_return_sequences, config.vocab_size)`. 
- attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`): - Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of - `torch.FloatTensor` of shape `(num_return_sequences*batch_size, num_heads, generated_length, - sequence_length)`. - hidden_states (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of - `torch.FloatTensor` of shape `(num_return_sequences*batch_size, generated_length, hidden_size)`. - past_key_values (`tuple(tuple(torch.FloatTensor)))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): - NOTE: some models have a different `past_key_values` format, confirm with the model's documentation. - Usually a Tuple (one element for each layer of the decoder) of tuples (two elements, key tensor and value - tensor). The first Tuple is of length `config.n_layers`, with each tuple having 2 tensors of shape - `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if - `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads, - encoder_sequence_length, embed_size_per_head)`. - """ - - sequences: torch.LongTensor = None - scores: Optional[Tuple[torch.FloatTensor]] = None - attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None - hidden_states: Optional[Tuple[Tuple[torch.FloatTensor]]] = None - past_key_values: Optional[Tuple[Tuple[Tuple[torch.FloatTensor]]]] = None - - -@dataclass -class SampleEncoderDecoderOutput(ModelOutput): - """ - Base class for outputs of encoder-decoder generation models using sampling. Hidden states and attention weights of - the decoder (respectively the encoder) can be accessed via the encoder_attentions and the encoder_hidden_states - attributes (respectively the decoder_attentions and the decoder_hidden_states attributes) - - - Args: - sequences (`torch.LongTensor` of shape `(batch_size*num_return_sequences, sequence_length)`): - The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter - if all batches finished early due to the `eos_token_id`. - scores (`tuple(torch.FloatTensor)` *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`): - Processed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax) - at each generation step. Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for - each generated token), with each tensor of shape `(batch_size*num_return_sequences, config.vocab_size)`. - encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer of the decoder) of shape - `(batch_size*num_return_sequences, num_heads, sequence_length, sequence_length)`. - encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of - shape `(batch_size*num_return_sequences, sequence_length, hidden_size)`. 
- decoder_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`): - Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of - `torch.FloatTensor` of shape `(batch_size*num_return_sequences, num_heads, generated_length, - sequence_length)`. - cross_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`): - Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of - `torch.FloatTensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`. - decoder_hidden_states (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of - `torch.FloatTensor` of shape `(batch_size*num_return_sequences, generated_length, hidden_size)`. - past_key_values (`tuple(tuple(torch.FloatTensor)))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): - NOTE: some models have a different `past_key_values` format, confirm with the model's documentation. - Usually a Tuple (one element for each layer of the decoder) of tuples (two elements, key tensor and value - tensor). The first Tuple is of length `config.n_layers`, with each tuple having 2 tensors of shape - `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if - `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads, - encoder_sequence_length, embed_size_per_head)`. - """ - - sequences: torch.LongTensor = None - scores: Optional[Tuple[torch.FloatTensor]] = None - encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None - encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None - decoder_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None - cross_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None - decoder_hidden_states: Optional[Tuple[Tuple[torch.FloatTensor]]] = None - past_key_values: Optional[Tuple[Tuple[Tuple[torch.FloatTensor]]]] = None - - -@dataclass -class BeamSearchDecoderOnlyOutput(ModelOutput): - """ - Base class for outputs of decoder-only generation models using beam search. + Outputs of decoder-only generation models, when using beam methods. Args: sequences (`torch.LongTensor` of shape `(batch_size*num_return_sequences, sequence_length)`): @@ -395,11 +219,9 @@ class BeamSearchDecoderOnlyOutput(ModelOutput): @dataclass -class BeamSearchEncoderDecoderOutput(ModelOutput): +class GenerateBeamEncoderDecoderOutput(ModelOutput): """ - Base class for outputs of encoder-decoder generation models using beam search. Hidden states and attention weights - of the decoder (respectively the encoder) can be accessed via the encoder_attentions and the encoder_hidden_states - attributes (respectively the decoder_attentions and the decoder_hidden_states attributes) + Outputs of encoder-decoder generation models, when using beam methods. 
Args: sequences (`torch.LongTensor` of shape `(batch_size*num_return_sequences, sequence_length)`): @@ -452,112 +274,26 @@ class BeamSearchEncoderDecoderOutput(ModelOutput): past_key_values: Optional[Tuple[Tuple[Tuple[torch.FloatTensor]]]] = None -@dataclass -class BeamSampleDecoderOnlyOutput(ModelOutput): - """ - Base class for outputs of decoder-only generation models using beam sample. - - Args: - sequences (`torch.LongTensor` of shape `(batch_size*num_return_sequences, sequence_length)`): - The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter - if all batches finished early due to the `eos_token_id`. - sequences_scores (`torch.FloatTensor` of shape `(batch_size * num_return_sequence)`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`): - Final beam scores of the generated `sequences`. - scores (`tuple(torch.FloatTensor)` *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`): - Beam transition scores for each vocabulary token at each generation step. Beam transition scores consisting - of log probabilities of tokens conditioned on log softmax of previously generated tokens in this beam. - Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for each generated token), - with each tensor of shape `(batch_size*num_beams*num_return_sequences, config.vocab_size)`. - beam_indices (`torch.LongTensor`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`): - Beam indices of generated token id at each generation step. `torch.LongTensor` of shape - `(batch_size*num_return_sequences, sequence_length)`. - attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`): - Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of - `torch.FloatTensor` of shape `(batch_size*num_beams, num_heads, generated_length, sequence_length)`. - hidden_states (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of - `torch.FloatTensor` of shape `(batch_size*num_beams, generated_length, hidden_size)`. - past_key_values (`tuple(tuple(torch.FloatTensor)))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): - NOTE: some models have a different `past_key_values` format, confirm with the model's documentation. - Usually a Tuple (one element for each layer of the decoder) of tuples (two elements, key tensor and value - tensor). The first Tuple is of length `config.n_layers`, with each tuple having 2 tensors of shape - `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if - `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads, - encoder_sequence_length, embed_size_per_head)`. 
- """ - - sequences: torch.LongTensor = None - sequences_scores: Optional[torch.FloatTensor] = None - scores: Optional[Tuple[torch.FloatTensor]] = None - beam_indices: Optional[torch.LongTensor] = None - attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None - hidden_states: Optional[Tuple[Tuple[torch.FloatTensor]]] = None - past_key_values: Optional[Tuple[Tuple[Tuple[torch.FloatTensor]]]] = None - +# Equivalent classes (kept for retrocompatibility purposes) +GreedySearchDecoderOnlyOutput = GenerateDecoderOnlyOutput +ContrastiveSearchDecoderOnlyOutput = GenerateDecoderOnlyOutput +SampleDecoderOnlyOutput = GenerateDecoderOnlyOutput -@dataclass -class BeamSampleEncoderDecoderOutput(ModelOutput): - """ - Base class for outputs of encoder-decoder generation models using beam sampling. Hidden states and attention - weights of the decoder (respectively the encoder) can be accessed via the encoder_attentions and the - encoder_hidden_states attributes (respectively the decoder_attentions and the decoder_hidden_states attributes) +ContrastiveSearchEncoderDecoderOutput = GenerateEncoderDecoderOutput +GreedySearchEncoderDecoderOutput = GenerateEncoderDecoderOutput +SampleEncoderDecoderOutput = GenerateEncoderDecoderOutput - Args: - sequences (`torch.LongTensor` of shape `(batch_size*num_beams, sequence_length)`): - The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter - if all batches finished early due to the `eos_token_id`. - sequences_scores (`torch.FloatTensor` of shape `(batch_size * num_return_sequence)`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`): - Final beam scores of the generated `sequences`. - scores (`tuple(torch.FloatTensor)` *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`): - Beam transition scores for each vocabulary token at each generation step. Beam transition scores consisting - of log probabilities of tokens conditioned on log softmax of previously generated tokens in this beam. - Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for each generated token), - with each tensor of shape `(batch_size*num_beams, config.vocab_size)`). - beam_indices (`torch.LongTensor`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`): - Beam indices of generated token id at each generation step. `torch.LongTensor` of shape - `(batch_size*num_return_sequences, sequence_length)`. - encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer of the decoder) of shape `(batch_size, num_heads, - sequence_length, sequence_length)`. - encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of - shape `(batch_size*num_beams, sequence_length, hidden_size)`. - decoder_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`): - Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of - `torch.FloatTensor` of shape `(batch_size*num_beams, num_heads, generated_length, sequence_length)`. 
- cross_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`): - Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of - `torch.FloatTensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`. - decoder_hidden_states (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of - `torch.FloatTensor` of shape `(batch_size*num_beams, generated_length, hidden_size)`. - past_key_values (`tuple(tuple(torch.FloatTensor)))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): - NOTE: some models have a different `past_key_values` format, confirm with the model's documentation. - Usually a Tuple (one element for each layer of the decoder) of tuples (two elements, key tensor and value - tensor). The first Tuple is of length `config.n_layers`, with each tuple having 2 tensors of shape - `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if - `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads, - encoder_sequence_length, embed_size_per_head)`. - """ +BeamSearchDecoderOnlyOutput = GenerateBeamDecoderOnlyOutput +BeamSampleDecoderOnlyOutput = GenerateBeamDecoderOnlyOutput - sequences: torch.LongTensor = None - sequences_scores: Optional[torch.FloatTensor] = None - scores: Optional[Tuple[torch.FloatTensor]] = None - beam_indices: Optional[torch.LongTensor] = None - encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None - encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None - decoder_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None - cross_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None - decoder_hidden_states: Optional[Tuple[Tuple[torch.FloatTensor]]] = None - past_key_values: Optional[Tuple[Tuple[Tuple[torch.FloatTensor]]]] = None +BeamSearchEncoderDecoderOutput = GenerateBeamEncoderDecoderOutput +BeamSampleEncoderDecoderOutput = GenerateBeamEncoderDecoderOutput -GreedySearchOutput = Union[GreedySearchEncoderDecoderOutput, GreedySearchDecoderOnlyOutput] -SampleOutput = Union[SampleEncoderDecoderOutput, SampleDecoderOnlyOutput] -BeamSearchOutput = Union[BeamSearchEncoderDecoderOutput, BeamSearchDecoderOnlyOutput] -BeamSampleOutput = Union[BeamSampleEncoderDecoderOutput, BeamSampleDecoderOnlyOutput] -ContrastiveSearchOutput = Union[ContrastiveSearchEncoderDecoderOutput, ContrastiveSearchDecoderOnlyOutput] -GenerateOutput = Union[GreedySearchOutput, SampleOutput, BeamSearchOutput, BeamSampleOutput, ContrastiveSearchOutput] +# Typing shortcuts +GenerateNonBeamOutput = Union[GenerateDecoderOnlyOutput, GenerateEncoderDecoderOutput] +GenerateBeamOutput = Union[GenerateBeamDecoderOnlyOutput, GenerateBeamEncoderDecoderOutput] +GenerateOutput = Union[GenerateNonBeamOutput, GenerateBeamOutput] class GenerationMode(ExplicitEnum): @@ -1516,18 +1252,14 @@ def generate( If the model is *not* an encoder-decoder model (`model.config.is_encoder_decoder=False`), the possible [`~utils.ModelOutput`] types are: - - [`~generation.GreedySearchDecoderOnlyOutput`], - - [`~generation.SampleDecoderOnlyOutput`], - - [`~generation.BeamSearchDecoderOnlyOutput`], - - [`~generation.BeamSampleDecoderOnlyOutput`] + - [`~generation.GenerateDecoderOnlyOutput`], 
+ - [`~generation.GenerateBeamDecoderOnlyOutput`] If the model is an encoder-decoder model (`model.config.is_encoder_decoder=True`), the possible [`~utils.ModelOutput`] types are: - - [`~generation.GreedySearchEncoderDecoderOutput`], - - [`~generation.SampleEncoderDecoderOutput`], - - [`~generation.BeamSearchEncoderDecoderOutput`], - - [`~generation.BeamSampleEncoderDecoderOutput`] + - [`~generation.GenerateEncoderDecoderOutput`], + - [`~generation.GenerateBeamEncoderDecoderOutput`] """ if synced_gpus is None: @@ -1989,7 +1721,7 @@ def contrastive_search( streamer: Optional["BaseStreamer"] = None, sequential: Optional[bool] = None, **model_kwargs, - ) -> Union[ContrastiveSearchOutput, torch.LongTensor]: + ) -> Union[GenerateNonBeamOutput, torch.LongTensor]: r""" Generates sequences of token ids for models with a language modeling head using **contrastive search** and can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models. @@ -2045,10 +1777,10 @@ def contrastive_search( If model is an encoder-decoder model the kwargs should include `encoder_outputs`. Return: - [`~generation.ContrastiveSearchDecoderOnlyOutput`], [`~generation.ContrastiveSearchEncoderDecoderOutput`] + [`~generation.GenerateDecoderOnlyOutput`], [`~generation.GenerateEncoderDecoderOutput`] or `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a - [`~generation.ContrastiveSearchDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and - `return_dict_in_generate=True` or a [`~generation.ContrastiveSearchEncoderDecoderOutput`] if + [`~generation.GenerateDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and + `return_dict_in_generate=True` or a [`~generation.GenerateEncoderDecoderOutput`] if `model.config.is_encoder_decoder=True`. Examples: @@ -2406,7 +2138,7 @@ def contrastive_search( model_kwargs["past_key_values"] = tuple(past_key_values) if self.config.is_encoder_decoder: - return ContrastiveSearchEncoderDecoderOutput( + return GenerateEncoderDecoderOutput( sequences=input_ids, scores=scores, encoder_attentions=encoder_attentions, @@ -2417,7 +2149,7 @@ def contrastive_search( past_key_values=model_kwargs.get("past_key_values"), ) else: - return ContrastiveSearchDecoderOnlyOutput( + return GenerateDecoderOnlyOutput( sequences=input_ids, scores=scores, attentions=decoder_attentions, @@ -2442,7 +2174,7 @@ def greedy_search( synced_gpus: bool = False, streamer: Optional["BaseStreamer"] = None, **model_kwargs, - ) -> Union[GreedySearchOutput, torch.LongTensor]: + ) -> Union[GenerateNonBeamOutput, torch.LongTensor]: r""" Generates sequences of token ids for models with a language modeling head using **greedy decoding** and can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models. @@ -2493,10 +2225,10 @@ def greedy_search( If model is an encoder-decoder model the kwargs should include `encoder_outputs`. 
Return: - [`~generation.GreedySearchDecoderOnlyOutput`], [`~generation.GreedySearchEncoderDecoderOutput`] or + [`~generation.GenerateDecoderOnlyOutput`], [`~generation.GenerateEncoderDecoderOutput`] or `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a - [`~generation.GreedySearchDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and - `return_dict_in_generate=True` or a [`~generation.GreedySearchEncoderDecoderOutput`] if + [`~generation.GenerateDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and + `return_dict_in_generate=True` or a [`~generation.GenerateEncoderDecoderOutput`] if `model.config.is_encoder_decoder=True`. Examples: @@ -2667,7 +2399,7 @@ def greedy_search( if return_dict_in_generate: if self.config.is_encoder_decoder: - return GreedySearchEncoderDecoderOutput( + return GenerateEncoderDecoderOutput( sequences=input_ids, scores=scores, encoder_attentions=encoder_attentions, @@ -2678,7 +2410,7 @@ def greedy_search( past_key_values=model_kwargs.get("past_key_values"), ) else: - return GreedySearchDecoderOnlyOutput( + return GenerateDecoderOnlyOutput( sequences=input_ids, scores=scores, attentions=decoder_attentions, @@ -2704,7 +2436,7 @@ def sample( synced_gpus: bool = False, streamer: Optional["BaseStreamer"] = None, **model_kwargs, - ) -> Union[SampleOutput, torch.LongTensor]: + ) -> Union[GenerateNonBeamOutput, torch.LongTensor]: r""" Generates sequences of token ids for models with a language modeling head using **multinomial sampling** and can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models. @@ -2757,10 +2489,10 @@ def sample( an encoder-decoder model the kwargs should include `encoder_outputs`. Return: - [`~generation.SampleDecoderOnlyOutput`], [`~generation.SampleEncoderDecoderOutput`] or `torch.LongTensor`: + [`~generation.GenerateDecoderOnlyOutput`], [`~generation.GenerateEncoderDecoderOutput`] or `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a - [`~generation.SampleDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and - `return_dict_in_generate=True` or a [`~generation.SampleEncoderDecoderOutput`] if + [`~generation.GenerateDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and + `return_dict_in_generate=True` or a [`~generation.GenerateEncoderDecoderOutput`] if `model.config.is_encoder_decoder=True`. Examples: @@ -2951,7 +2683,7 @@ def sample( if return_dict_in_generate: if self.config.is_encoder_decoder: - return SampleEncoderDecoderOutput( + return GenerateEncoderDecoderOutput( sequences=input_ids, scores=scores, encoder_attentions=encoder_attentions, @@ -2962,7 +2694,7 @@ def sample( past_key_values=model_kwargs.get("past_key_values"), ) else: - return SampleDecoderOnlyOutput( + return GenerateDecoderOnlyOutput( sequences=input_ids, scores=scores, attentions=decoder_attentions, @@ -3013,7 +2745,7 @@ def beam_search( return_dict_in_generate: Optional[bool] = None, synced_gpus: bool = False, **model_kwargs, - ) -> Union[BeamSearchOutput, torch.LongTensor]: + ) -> Union[GenerateBeamOutput, torch.LongTensor]: r""" Generates sequences of token ids for models with a language modeling head using **beam search decoding** and can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models. @@ -3062,10 +2794,10 @@ def beam_search( an encoder-decoder model the kwargs should include `encoder_outputs`. 
Return:
-            [`generation.BeamSearchDecoderOnlyOutput`], [`~generation.BeamSearchEncoderDecoderOutput`] or
+            [`~generation.GenerateBeamDecoderOnlyOutput`], [`~generation.GenerateBeamEncoderDecoderOutput`] or
             `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a
-            [`~generation.BeamSearchDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and
-            `return_dict_in_generate=True` or a [`~generation.BeamSearchEncoderDecoderOutput`] if
+            [`~generation.GenerateBeamDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and
+            `return_dict_in_generate=True` or a [`~generation.GenerateBeamEncoderDecoderOutput`] if
             `model.config.is_encoder_decoder=True`.
 
@@ -3304,7 +3036,7 @@ def beam_search(
             sequence_outputs["sequence_scores"] = None
 
         if self.config.is_encoder_decoder:
-            return BeamSearchEncoderDecoderOutput(
+            return GenerateBeamEncoderDecoderOutput(
                 sequences=sequence_outputs["sequences"],
                 sequences_scores=sequence_outputs["sequence_scores"],
                 scores=scores,
@@ -3317,7 +3049,7 @@ def beam_search(
                 past_key_values=model_kwargs.get("past_key_values"),
             )
         else:
-            return BeamSearchDecoderOnlyOutput(
+            return GenerateBeamDecoderOnlyOutput(
                 sequences=sequence_outputs["sequences"],
                 sequences_scores=sequence_outputs["sequence_scores"],
                 scores=scores,
@@ -3345,7 +3077,7 @@ def beam_sample(
         return_dict_in_generate: Optional[bool] = None,
         synced_gpus: bool = False,
         **model_kwargs,
-    ) -> Union[BeamSampleOutput, torch.LongTensor]:
+    ) -> Union[GenerateBeamOutput, torch.LongTensor]:
         r"""
         Generates sequences of token ids for models with a language modeling head using **beam search multinomial
         sampling** and can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models.
@@ -3398,10 +3130,10 @@ def beam_sample(
             an encoder-decoder model the kwargs should include `encoder_outputs`.
 
         Return:
-            [`~generation.BeamSampleDecoderOnlyOutput`], [`~generation.BeamSampleEncoderDecoderOutput`] or
+            [`~generation.GenerateBeamDecoderOnlyOutput`], [`~generation.GenerateBeamEncoderDecoderOutput`] or
             `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a
-            [`~generation.BeamSampleDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and
-            `return_dict_in_generate=True` or a [`~generation.BeamSampleEncoderDecoderOutput`] if
+            [`~generation.GenerateBeamDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and
+            `return_dict_in_generate=True` or a [`~generation.GenerateBeamEncoderDecoderOutput`] if
             `model.config.is_encoder_decoder=True`.
 
         Examples:
@@ -3641,7 +3373,7 @@ def beam_sample(
             sequence_outputs["sequence_scores"] = None
 
         if self.config.is_encoder_decoder:
-            return BeamSampleEncoderDecoderOutput(
+            return GenerateBeamEncoderDecoderOutput(
                 sequences=sequence_outputs["sequences"],
                 sequences_scores=sequence_outputs["sequence_scores"],
                 scores=scores,
@@ -3654,7 +3386,7 @@ def beam_sample(
                 past_key_values=model_kwargs.get("past_key_values"),
             )
         else:
-            return BeamSampleDecoderOnlyOutput(
+            return GenerateBeamDecoderOnlyOutput(
                 sequences=sequence_outputs["sequences"],
                 sequences_scores=sequence_outputs["sequence_scores"],
                 scores=scores,
@@ -3731,11 +3463,11 @@ def group_beam_search(
             model is an encoder-decoder model the kwargs should include `encoder_outputs`.
Return: - [`~generation.BeamSearchDecoderOnlyOutput`], [`~generation.BeamSearchEncoderDecoderOutput`] or + [`~generation.GenerateBeamDecoderOnlyOutput`], [`~generation.GenerateBeamEncoderDecoderOutput`] or `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a - [`~generation.BeamSearchDecoderOnlyOutput`] if [`~generation.BeamSearchDecoderOnlyOutput`] if - `model.config.is_encoder_decoder=False` and `return_dict_in_generate=True` or a - [`~generation.BeamSearchEncoderDecoderOutput`] if `model.config.is_encoder_decoder=True`. + [`~generation.GenerateBeamDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and + `return_dict_in_generate=True` or a [`~generation.GenerateBeamEncoderDecoderOutput`] if + `model.config.is_encoder_decoder=True`. Examples: @@ -4026,7 +3758,7 @@ def group_beam_search( sequence_outputs["sequence_scores"] = None if self.config.is_encoder_decoder: - return BeamSearchEncoderDecoderOutput( + return GenerateBeamEncoderDecoderOutput( sequences=sequence_outputs["sequences"], sequences_scores=sequence_outputs["sequence_scores"], scores=scores, @@ -4039,7 +3771,7 @@ def group_beam_search( past_key_values=model_kwargs.get("past_key_values"), ) else: - return BeamSearchDecoderOnlyOutput( + return GenerateBeamDecoderOnlyOutput( sequences=sequence_outputs["sequences"], sequences_scores=sequence_outputs["sequence_scores"], scores=scores, @@ -4066,7 +3798,7 @@ def constrained_beam_search( return_dict_in_generate: Optional[bool] = None, synced_gpus: Optional[bool] = None, **model_kwargs, - ) -> Union[BeamSearchOutput, torch.LongTensor]: + ) -> Union[GenerateBeamOutput, torch.LongTensor]: r""" Generates sequences of token ids for models with a language modeling head using **constrained beam search decoding** and can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models. @@ -4120,10 +3852,10 @@ def constrained_beam_search( an encoder-decoder model the kwargs should include `encoder_outputs`. Return: - [`generation.BeamSearchDecoderOnlyOutput`], [`~generation.BeamSearchEncoderDecoderOutput`] or + [`~generation.GenerateBeamDecoderOnlyOutput`], [`~generation.GenerateBeamEncoderDecoderOutput`] or `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a - [`~generation.BeamSearchDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and - `return_dict_in_generate=True` or a [`~generation.BeamSearchEncoderDecoderOutput`] if + [`~generation.GenerateBeamDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and + `return_dict_in_generate=True` or a [`~generation.GenerateBeamEncoderDecoderOutput`] if `model.config.is_encoder_decoder=True`. 
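Taken together, the hunks above give every decoding method one of two return annotations: `GenerateNonBeamOutput` for greedy, sampling, contrastive, and assisted decoding, and `GenerateBeamOutput` for all beam variants. A minimal sketch of what this unification looks like from the caller's side — the `gpt2` checkpoint is used purely for illustration and is not part of this diff:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation import (
    GenerateBeamDecoderOnlyOutput,
    GenerateDecoderOnlyOutput,
)

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")
inputs = tokenizer("Hello, my dog is cute and ", return_tensors="pt")

# Every non-beam method (greedy, sampling, contrastive, assisted) now returns
# the same class for a decoder-only model...
greedy_out = model.generate(**inputs, return_dict_in_generate=True, output_scores=True)
assert isinstance(greedy_out, GenerateDecoderOnlyOutput)

# ...and every beam method returns the beam variant, which additionally
# carries `sequences_scores` and `beam_indices`.
beam_out = model.generate(
    **inputs, num_beams=2, return_dict_in_generate=True, output_scores=True
)
assert isinstance(beam_out, GenerateBeamDecoderOnlyOutput)
assert beam_out.sequences_scores is not None
```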
@@ -4369,7 +4101,7 @@ def constrained_beam_search( if not output_scores: sequence_outputs["sequence_scores"] = None if self.config.is_encoder_decoder: - return BeamSearchEncoderDecoderOutput( + return GenerateBeamEncoderDecoderOutput( sequences=sequence_outputs["sequences"], sequences_scores=sequence_outputs["sequence_scores"], scores=scores, @@ -4382,7 +4114,7 @@ def constrained_beam_search( past_key_values=model_kwargs.get("past_key_values"), ) else: - return BeamSearchDecoderOnlyOutput( + return GenerateBeamDecoderOnlyOutput( sequences=sequence_outputs["sequences"], sequences_scores=sequence_outputs["sequence_scores"], scores=scores, @@ -4412,7 +4144,7 @@ def assisted_decoding( synced_gpus: bool = False, streamer: Optional["BaseStreamer"] = None, **model_kwargs, - ): + ) -> Union[GenerateNonBeamOutput, torch.LongTensor]: r""" Generates sequences of token ids for models with a language modeling head using **greedy decoding** or **sample** (depending on `do_sample`), assisted by candidate sequences. Assisted generation is an example of a @@ -4474,10 +4206,10 @@ def assisted_decoding( If model is an encoder-decoder model the kwargs should include `encoder_outputs`. Return: - [`~generation.GreedySearchDecoderOnlyOutput`], [`~generation.GreedySearchEncoderDecoderOutput`] or + [`~generation.GenerateDecoderOnlyOutput`], [`~generation.GenerateEncoderDecoderOutput`] or `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a - [`~generation.GreedySearchDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and - `return_dict_in_generate=True` or a [`~generation.GreedySearchEncoderDecoderOutput`] if + [`~generation.GenerateDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and + `return_dict_in_generate=True` or a [`~generation.GenerateEncoderDecoderOutput`] if `model.config.is_encoder_decoder=True`. 
Examples: @@ -4758,7 +4490,7 @@ def assisted_decoding( if return_dict_in_generate: if self.config.is_encoder_decoder: - return GreedySearchEncoderDecoderOutput( + return GenerateEncoderDecoderOutput( sequences=input_ids, scores=scores, encoder_attentions=encoder_attentions, @@ -4769,7 +4501,7 @@ def assisted_decoding( past_key_values=model_kwargs.get("past_key_values"), ) else: - return GreedySearchDecoderOnlyOutput( + return GenerateDecoderOnlyOutput( sequences=input_ids, scores=scores, attentions=decoder_attentions, diff --git a/src/transformers/models/musicgen/modeling_musicgen.py b/src/transformers/models/musicgen/modeling_musicgen.py index bfd459841d50af..a60159c7a003f2 100644 --- a/src/transformers/models/musicgen/modeling_musicgen.py +++ b/src/transformers/models/musicgen/modeling_musicgen.py @@ -1197,18 +1197,14 @@ def generate( If the model is *not* an encoder-decoder model (`model.config.is_encoder_decoder=False`), the possible [`~utils.ModelOutput`] types are: - - [`~generation.GreedySearchDecoderOnlyOutput`], - - [`~generation.SampleDecoderOnlyOutput`], - - [`~generation.BeamSearchDecoderOnlyOutput`], - - [`~generation.BeamSampleDecoderOnlyOutput`] + - [`~generation.GenerateDecoderOnlyOutput`], + - [`~generation.GenerateBeamDecoderOnlyOutput`] If the model is an encoder-decoder model (`model.config.is_encoder_decoder=True`), the possible [`~utils.ModelOutput`] types are: - - [`~generation.GreedySearchEncoderDecoderOutput`], - - [`~generation.SampleEncoderDecoderOutput`], - - [`~generation.BeamSearchEncoderDecoderOutput`], - - [`~generation.BeamSampleEncoderDecoderOutput`] + - [`~generation.GenerateEncoderDecoderOutput`], + - [`~generation.GenerateBeamEncoderDecoderOutput`] """ # 1. Handle `generation_config` and kwargs that might update it, and validate the resulting objects if generation_config is None: @@ -2244,18 +2240,14 @@ def generate( If the model is *not* an encoder-decoder model (`model.config.is_encoder_decoder=False`), the possible [`~utils.ModelOutput`] types are: - - [`~generation.GreedySearchDecoderOnlyOutput`], - - [`~generation.SampleDecoderOnlyOutput`], - - [`~generation.BeamSearchDecoderOnlyOutput`], - - [`~generation.BeamSampleDecoderOnlyOutput`] + - [`~generation.GenerateDecoderOnlyOutput`], + - [`~generation.GenerateBeamDecoderOnlyOutput`] If the model is an encoder-decoder model (`model.config.is_encoder_decoder=True`), the possible [`~utils.ModelOutput`] types are: - - [`~generation.GreedySearchEncoderDecoderOutput`], - - [`~generation.SampleEncoderDecoderOutput`], - - [`~generation.BeamSearchEncoderDecoderOutput`], - - [`~generation.BeamSampleEncoderDecoderOutput`] + - [`~generation.GenerateEncoderDecoderOutput`], + - [`~generation.GenerateBeamEncoderDecoderOutput`] """ # 1. Handle `generation_config` and kwargs that might update it, and validate the resulting objects if generation_config is None: diff --git a/src/transformers/models/pop2piano/modeling_pop2piano.py b/src/transformers/models/pop2piano/modeling_pop2piano.py index d9f9ee3aa11ae8..d3638d25b97a0d 100644 --- a/src/transformers/models/pop2piano/modeling_pop2piano.py +++ b/src/transformers/models/pop2piano/modeling_pop2piano.py @@ -1264,10 +1264,8 @@ def generate( or when `config.return_dict_in_generate=True`) or a `torch.FloatTensor`. 
Since Pop2Piano is an encoder-decoder model (`model.config.is_encoder_decoder=True`), the possible [`~utils.ModelOutput`] types are: - - [`~generation.GreedySearchEncoderDecoderOutput`], - - [`~generation.SampleEncoderDecoderOutput`], - - [`~generation.BeamSearchEncoderDecoderOutput`], - - [`~generation.BeamSampleEncoderDecoderOutput`] + - [`~generation.GenerateEncoderDecoderOutput`], + - [`~generation.GenerateBeamEncoderDecoderOutput`] """ if generation_config is None: diff --git a/src/transformers/models/seamless_m4t/modeling_seamless_m4t.py b/src/transformers/models/seamless_m4t/modeling_seamless_m4t.py index 8da05bc9b16609..4410a18bd1bcbe 100755 --- a/src/transformers/models/seamless_m4t/modeling_seamless_m4t.py +++ b/src/transformers/models/seamless_m4t/modeling_seamless_m4t.py @@ -2845,11 +2845,8 @@ def generate( [`~utils.ModelOutput`] or `torch.LongTensor`: A [`~utils.ModelOutput`] (if `return_dict_in_generate=True` or when `config.return_dict_in_generate=True`) or a `torch.FloatTensor`. The possible [`~utils.ModelOutput`] types are: - - - [`~generation.GreedySearchEncoderDecoderOutput`], - - [`~generation.SampleEncoderDecoderOutput`], - - [`~generation.BeamSearchEncoderDecoderOutput`], - - [`~generation.BeamSampleEncoderDecoderOutput`] + - [`~generation.GenerateEncoderDecoderOutput`], + - [`~generation.GenerateBeamEncoderDecoderOutput`] """ # prepare text_decoder_input_ids text_decoder_input_ids = kwargs.pop("decoder_input_ids", None) @@ -3134,11 +3131,8 @@ def generate( [`~utils.ModelOutput`] or `torch.LongTensor`: A [`~utils.ModelOutput`] (if `return_dict_in_generate=True` or when `config.return_dict_in_generate=True`) or a `torch.FloatTensor`. The possible [`~utils.ModelOutput`] types are: - - - [`~generation.GreedySearchEncoderDecoderOutput`], - - [`~generation.SampleEncoderDecoderOutput`], - - [`~generation.BeamSearchEncoderDecoderOutput`], - - [`~generation.BeamSampleEncoderDecoderOutput`] + - [`~generation.GenerateEncoderDecoderOutput`], + - [`~generation.GenerateBeamEncoderDecoderOutput`] """ text_decoder_input_ids = kwargs.pop("decoder_input_ids", None) # overwrite text_decoder_input_ids if tgt_lang is passed. The latter gets priority over decoder_input_ids. diff --git a/src/transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py b/src/transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py index 6713be99aaa0ad..62d6c9d55d38c2 100644 --- a/src/transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +++ b/src/transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py @@ -3110,11 +3110,8 @@ def generate( [`~utils.ModelOutput`] or `torch.LongTensor`: A [`~utils.ModelOutput`] (if `return_dict_in_generate=True` or when `config.return_dict_in_generate=True`) or a `torch.FloatTensor`. The possible [`~utils.ModelOutput`] types are: - - - [`~generation.GreedySearchEncoderDecoderOutput`], - - [`~generation.SampleEncoderDecoderOutput`], - - [`~generation.BeamSearchEncoderDecoderOutput`], - - [`~generation.BeamSampleEncoderDecoderOutput`] + - [`~generation.GenerateEncoderDecoderOutput`], + - [`~generation.GenerateBeamEncoderDecoderOutput`] """ # prepare text_decoder_input_ids text_decoder_input_ids = kwargs.pop("decoder_input_ids", None) @@ -3409,11 +3406,8 @@ def generate( [`~utils.ModelOutput`] or `torch.LongTensor`: A [`~utils.ModelOutput`] (if `return_dict_in_generate=True` or when `config.return_dict_in_generate=True`) or a `torch.FloatTensor`. 
The possible [`~utils.ModelOutput`] types are: - - - [`~generation.GreedySearchEncoderDecoderOutput`], - - [`~generation.SampleEncoderDecoderOutput`], - - [`~generation.BeamSearchEncoderDecoderOutput`], - - [`~generation.BeamSampleEncoderDecoderOutput`] + - [`~generation.GenerateEncoderDecoderOutput`], + - [`~generation.GenerateBeamEncoderDecoderOutput`] """ text_decoder_input_ids = kwargs.pop("decoder_input_ids", None) # overwrite text_decoder_input_ids if tgt_lang is passed. The latter gets priority over decoder_input_ids. diff --git a/src/transformers/models/whisper/modeling_whisper.py b/src/transformers/models/whisper/modeling_whisper.py index 6e016517d8b6e8..a3550c791c7318 100644 --- a/src/transformers/models/whisper/modeling_whisper.py +++ b/src/transformers/models/whisper/modeling_whisper.py @@ -1968,10 +1968,8 @@ def generate( else if the passed input is <= 30 seconds / >= 3000 mel input features, the possible [`~utils.ModelOutput`] types are: - - [`~generation.GreedySearchEncoderDecoderOutput`], - - [`~generation.SampleEncoderDecoderOutput`], - - [`~generation.BeamSearchEncoderDecoderOutput`], - - [`~generation.BeamSampleEncoderDecoderOutput`] + - [`~generation.GenerateEncoderDecoderOutput`], + - [`~generation.GenerateBeamEncoderDecoderOutput`] else only the generated output sequence ids are returned. diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py index bfae5e882778a8..c41bc3b21a4ee3 100644 --- a/tests/generation/test_utils.py +++ b/tests/generation/test_utils.py @@ -65,6 +65,10 @@ DisjunctiveConstraint, ForcedBOSTokenLogitsProcessor, ForcedEOSTokenLogitsProcessor, + GenerateBeamDecoderOnlyOutput, + GenerateBeamEncoderDecoderOutput, + GenerateDecoderOnlyOutput, + GenerateEncoderDecoderOutput, GreedySearchDecoderOnlyOutput, GreedySearchEncoderDecoderOutput, HammingDiversityLogitsProcessor, @@ -730,9 +734,15 @@ def test_greedy_generate_dict_outputs(self): ) if model.config.is_encoder_decoder: + self.assertIsInstance(output_greedy, GenerateEncoderDecoderOutput) + self.assertIsInstance(output_generate, GenerateEncoderDecoderOutput) + # Retrocompatibility check self.assertIsInstance(output_greedy, GreedySearchEncoderDecoderOutput) self.assertIsInstance(output_generate, GreedySearchEncoderDecoderOutput) else: + self.assertIsInstance(output_greedy, GenerateDecoderOnlyOutput) + self.assertIsInstance(output_generate, GenerateDecoderOnlyOutput) + # Retrocompatibility check self.assertIsInstance(output_greedy, GreedySearchDecoderOnlyOutput) self.assertIsInstance(output_generate, GreedySearchDecoderOnlyOutput) @@ -848,9 +858,15 @@ def test_sample_generate_dict_output(self): ) if model.config.is_encoder_decoder: + self.assertIsInstance(output_sample, GenerateEncoderDecoderOutput) + self.assertIsInstance(output_generate, GenerateEncoderDecoderOutput) + # Retrocompatibility check self.assertIsInstance(output_sample, SampleEncoderDecoderOutput) self.assertIsInstance(output_generate, SampleEncoderDecoderOutput) else: + self.assertIsInstance(output_sample, GenerateDecoderOnlyOutput) + self.assertIsInstance(output_generate, GenerateDecoderOnlyOutput) + # Retrocompatibility check self.assertIsInstance(output_sample, SampleDecoderOnlyOutput) self.assertIsInstance(output_generate, SampleDecoderOnlyOutput) @@ -952,9 +968,15 @@ def test_beam_search_generate_dict_output(self): return_dict_in_generate=True, ) if model.config.is_encoder_decoder: + self.assertIsInstance(output_beam_search, GenerateBeamEncoderDecoderOutput) + self.assertIsInstance(output_generate, 
GenerateBeamEncoderDecoderOutput) + # Retrocompatibility check self.assertIsInstance(output_beam_search, BeamSearchEncoderDecoderOutput) self.assertIsInstance(output_generate, BeamSearchEncoderDecoderOutput) else: + self.assertIsInstance(output_beam_search, GenerateBeamDecoderOnlyOutput) + self.assertIsInstance(output_generate, GenerateBeamDecoderOnlyOutput) + # Retrocompatibility check self.assertIsInstance(output_beam_search, BeamSearchDecoderOnlyOutput) self.assertIsInstance(output_generate, BeamSearchDecoderOnlyOutput) @@ -1109,9 +1131,15 @@ def test_beam_sample_generate_dict_output(self): ) if model.config.is_encoder_decoder: + self.assertIsInstance(output_beam_sample, GenerateBeamEncoderDecoderOutput) + self.assertIsInstance(output_generate, GenerateBeamEncoderDecoderOutput) + # Retrocompatibility check self.assertIsInstance(output_beam_sample, BeamSampleEncoderDecoderOutput) self.assertIsInstance(output_generate, BeamSampleEncoderDecoderOutput) else: + self.assertIsInstance(output_beam_sample, GenerateBeamDecoderOnlyOutput) + self.assertIsInstance(output_generate, GenerateBeamDecoderOnlyOutput) + # Retrocompatibility check self.assertIsInstance(output_beam_sample, BeamSampleDecoderOnlyOutput) self.assertIsInstance(output_generate, BeamSampleDecoderOnlyOutput) @@ -1238,9 +1266,15 @@ def test_group_beam_search_generate_dict_output(self): return_dict_in_generate=True, ) if model.config.is_encoder_decoder: + self.assertIsInstance(output_group_beam_search, GenerateBeamEncoderDecoderOutput) + self.assertIsInstance(output_generate, GenerateBeamEncoderDecoderOutput) + # Retrocompatibility check self.assertIsInstance(output_group_beam_search, BeamSearchEncoderDecoderOutput) self.assertIsInstance(output_generate, BeamSearchEncoderDecoderOutput) else: + self.assertIsInstance(output_group_beam_search, GenerateBeamDecoderOnlyOutput) + self.assertIsInstance(output_generate, GenerateBeamDecoderOnlyOutput) + # Retrocompatibility check self.assertIsInstance(output_group_beam_search, BeamSearchDecoderOnlyOutput) self.assertIsInstance(output_generate, BeamSearchDecoderOnlyOutput) @@ -1390,9 +1424,15 @@ def test_constrained_beam_search_generate_dict_output(self): ) if model.config.is_encoder_decoder: + self.assertIsInstance(output_beam_search, GenerateBeamEncoderDecoderOutput) + self.assertIsInstance(output_generate, GenerateBeamEncoderDecoderOutput) + # Retrocompatibility check self.assertIsInstance(output_beam_search, BeamSearchEncoderDecoderOutput) self.assertIsInstance(output_generate, BeamSearchEncoderDecoderOutput) else: + self.assertIsInstance(output_beam_search, GenerateBeamDecoderOnlyOutput) + self.assertIsInstance(output_generate, GenerateBeamDecoderOnlyOutput) + # Retrocompatibility check self.assertIsInstance(output_beam_search, BeamSearchDecoderOnlyOutput) self.assertIsInstance(output_generate, BeamSearchDecoderOnlyOutput) diff --git a/tests/models/musicgen/test_modeling_musicgen.py b/tests/models/musicgen/test_modeling_musicgen.py index 5e1d9ccdf29839..b7952d27a71592 100644 --- a/tests/models/musicgen/test_modeling_musicgen.py +++ b/tests/models/musicgen/test_modeling_musicgen.py @@ -53,12 +53,10 @@ set_seed, ) from transformers.generation import ( - GreedySearchDecoderOnlyOutput, - GreedySearchEncoderDecoderOutput, + GenerateDecoderOnlyOutput, + GenerateEncoderDecoderOutput, InfNanRemoveLogitsProcessor, LogitsProcessorList, - SampleDecoderOnlyOutput, - SampleEncoderDecoderOutput, ) @@ -282,8 +280,8 @@ def test_greedy_generate_dict_outputs(self): return_dict_in_generate=True, ) - 
self.assertIsInstance(output_greedy, GreedySearchDecoderOnlyOutput) - self.assertIsInstance(output_generate, GreedySearchDecoderOnlyOutput) + self.assertIsInstance(output_greedy, GenerateDecoderOnlyOutput) + self.assertIsInstance(output_generate, GenerateDecoderOnlyOutput) self.assertNotIn(config.pad_token_id, output_generate) @@ -308,8 +306,8 @@ def test_greedy_generate_dict_outputs_use_cache(self): return_dict_in_generate=True, ) - self.assertIsInstance(output_greedy, GreedySearchDecoderOnlyOutput) - self.assertIsInstance(output_generate, GreedySearchDecoderOnlyOutput) + self.assertIsInstance(output_greedy, GenerateDecoderOnlyOutput) + self.assertIsInstance(output_generate, GenerateDecoderOnlyOutput) # override since we don't expect the outputs of `.generate` and `.sample` to be the same, since we perform # additional post-processing in the former @@ -376,8 +374,8 @@ def test_sample_generate_dict_output(self): return_dict_in_generate=True, ) - self.assertIsInstance(output_sample, SampleDecoderOnlyOutput) - self.assertIsInstance(output_generate, SampleDecoderOnlyOutput) + self.assertIsInstance(output_sample, GenerateDecoderOnlyOutput) + self.assertIsInstance(output_generate, GenerateDecoderOnlyOutput) def test_greedy_generate_stereo_outputs(self): for model_class in self.greedy_sample_model_classes: @@ -395,8 +393,8 @@ def test_greedy_generate_stereo_outputs(self): return_dict_in_generate=True, ) - self.assertIsInstance(output_greedy, GreedySearchDecoderOnlyOutput) - self.assertIsInstance(output_generate, GreedySearchDecoderOnlyOutput) + self.assertIsInstance(output_greedy, GenerateDecoderOnlyOutput) + self.assertIsInstance(output_generate, GenerateDecoderOnlyOutput) self.assertNotIn(config.pad_token_id, output_generate) @@ -1001,8 +999,8 @@ def test_greedy_generate_dict_outputs(self): return_dict_in_generate=True, ) - self.assertIsInstance(output_greedy, GreedySearchEncoderDecoderOutput) - self.assertIsInstance(output_generate, GreedySearchEncoderDecoderOutput) + self.assertIsInstance(output_greedy, GenerateEncoderDecoderOutput) + self.assertIsInstance(output_generate, GenerateEncoderDecoderOutput) self.assertNotIn(config.pad_token_id, output_generate) @@ -1026,8 +1024,8 @@ def test_greedy_generate_dict_outputs_use_cache(self): return_dict_in_generate=True, ) - self.assertIsInstance(output_greedy, GreedySearchEncoderDecoderOutput) - self.assertIsInstance(output_generate, GreedySearchEncoderDecoderOutput) + self.assertIsInstance(output_greedy, GenerateEncoderDecoderOutput) + self.assertIsInstance(output_generate, GenerateEncoderDecoderOutput) def test_sample_generate(self): for model_class in self.greedy_sample_model_classes: @@ -1092,8 +1090,8 @@ def test_sample_generate_dict_output(self): return_dict_in_generate=True, ) - self.assertIsInstance(output_sample, SampleEncoderDecoderOutput) - self.assertIsInstance(output_generate, SampleEncoderDecoderOutput) + self.assertIsInstance(output_sample, GenerateEncoderDecoderOutput) + self.assertIsInstance(output_generate, GenerateEncoderDecoderOutput) def test_generate_without_input_ids(self): config, _, _, _, max_length = self._get_input_ids_and_config() @@ -1141,8 +1139,8 @@ def test_greedy_generate_stereo_outputs(self): return_dict_in_generate=True, ) - self.assertIsInstance(output_greedy, GreedySearchEncoderDecoderOutput) - self.assertIsInstance(output_generate, GreedySearchEncoderDecoderOutput) + self.assertIsInstance(output_greedy, GenerateEncoderDecoderOutput) + self.assertIsInstance(output_generate, GenerateEncoderDecoderOutput) 
self.assertNotIn(config.pad_token_id, output_generate)
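For reference, the retrocompatibility layer exercised by the new test assertions is plain aliasing: each legacy name is bound to the corresponding new class rather than kept as a subclass. A small self-contained sketch of the resulting behaviour:

```python
import torch

from transformers.generation import (
    BeamSearchDecoderOnlyOutput,
    GenerateBeamDecoderOnlyOutput,
    GenerateDecoderOnlyOutput,
    GreedySearchDecoderOnlyOutput,
    SampleDecoderOnlyOutput,
)

# Each legacy name is the very same class object, so pre-existing isinstance
# checks (like the ones retained in the tests above) keep passing unchanged.
assert GreedySearchDecoderOnlyOutput is GenerateDecoderOnlyOutput
assert SampleDecoderOnlyOutput is GenerateDecoderOnlyOutput
assert BeamSearchDecoderOnlyOutput is GenerateBeamDecoderOnlyOutput

# An output built under the new name still satisfies the old one.
out = GenerateDecoderOnlyOutput(sequences=torch.tensor([[101, 102, 103]]))
assert isinstance(out, GreedySearchDecoderOnlyOutput)
```

One side effect worth noting: because the legacy names no longer distinguish decoding strategies, `isinstance(out, SampleDecoderOnlyOutput)` is now also true for an output produced by greedy search.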