From 7b06473b8f6b7c440c65459f3dc0a2f2454c91e7 Mon Sep 17 00:00:00 2001 From: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Date: Fri, 11 Oct 2024 15:24:01 +0200 Subject: [PATCH 001/385] avoid many failures for ImageGPT (#34071) * skip * [run-slow] imagegpt * skip * [run-slow] imagegpt * [run-slow] imagegpt,video_llava * skip * [run-slow] imagegpt,video_llava --------- Co-authored-by: ydshieh --- tests/models/imagegpt/test_modeling_imagegpt.py | 6 ++++++ tests/models/video_llava/test_modeling_video_llava.py | 3 +++ 2 files changed, 9 insertions(+) diff --git a/tests/models/imagegpt/test_modeling_imagegpt.py b/tests/models/imagegpt/test_modeling_imagegpt.py index d8ceed6885f0c8..079726755289fe 100644 --- a/tests/models/imagegpt/test_modeling_imagegpt.py +++ b/tests/models/imagegpt/test_modeling_imagegpt.py @@ -257,6 +257,12 @@ def _check_scores(self, batch_size, scores, length, config): self.assertEqual(len(scores), length) self.assertListEqual([iter_scores.shape for iter_scores in scores], [expected_shape] * len(scores)) + @unittest.skip( + reason="After #33632, this test still passes, but many subsequential tests fail with `device-side assert triggered`" + ) + def test_beam_search_generate_dict_outputs_use_cache(self): + pass + def setUp(self): self.model_tester = ImageGPTModelTester(self) self.config_tester = ConfigTester(self, config_class=ImageGPTConfig, n_embd=37) diff --git a/tests/models/video_llava/test_modeling_video_llava.py b/tests/models/video_llava/test_modeling_video_llava.py index df8fe0b5dca2bf..492dcb9bae1f92 100644 --- a/tests/models/video_llava/test_modeling_video_llava.py +++ b/tests/models/video_llava/test_modeling_video_llava.py @@ -237,6 +237,9 @@ def test_sdpa_can_compile_dynamic(self): def test_sdpa_can_dispatch_on_flash(self): pass + @unittest.skip( + reason="After #33533, this still passes, but many subsequential tests fail with `device-side assert triggered`" + ) def test_mixed_input(self): config, inputs = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: From 3a24ba82ad570e58e100b3739babddd7eaad419e Mon Sep 17 00:00:00 2001 From: Dmytro Mishkin Date: Fri, 11 Oct 2024 15:35:55 +0200 Subject: [PATCH 002/385] Fix NaNs in cost_matrix for mask2former (#34074) Fix NaNs in cost_matrix Sometimes that happens :( --- src/transformers/models/mask2former/modeling_mask2former.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/transformers/models/mask2former/modeling_mask2former.py b/src/transformers/models/mask2former/modeling_mask2former.py index 6b94caf355d994..f4aea415adf5e6 100644 --- a/src/transformers/models/mask2former/modeling_mask2former.py +++ b/src/transformers/models/mask2former/modeling_mask2former.py @@ -474,6 +474,7 @@ def forward( # eliminate infinite values in cost_matrix to avoid the error ``ValueError: cost matrix is infeasible`` cost_matrix = torch.minimum(cost_matrix, torch.tensor(1e10)) cost_matrix = torch.maximum(cost_matrix, torch.tensor(-1e10)) + cost_matrix = torch.nan_to_num(cost_matrix, 0) # do the assigmented using the hungarian algorithm in scipy assigned_indices: Tuple[np.array] = linear_sum_assignment(cost_matrix.cpu()) indices.append(assigned_indices) From fd70464fa74c101ab3bc60e0c3db7d5e3b75fe90 Mon Sep 17 00:00:00 2001 From: Raushan Turganbay Date: Fri, 11 Oct 2024 15:41:46 +0200 Subject: [PATCH 003/385] Fix flaky tests (#34069) * fix mllama only * allow image token index --- src/transformers/models/mllama/modeling_mllama.py | 2 +- utils/check_config_attributes.py | 1 + 2 files 
changed, 2 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/mllama/modeling_mllama.py b/src/transformers/models/mllama/modeling_mllama.py index 0bc77eaeec3324..e486e149e3e660 100644 --- a/src/transformers/models/mllama/modeling_mllama.py +++ b/src/transformers/models/mllama/modeling_mllama.py @@ -2214,7 +2214,7 @@ def prepare_inputs_for_generation( # If we're in pre-fill or cacheless decoding step, then we need pixel_values and aspect ratios # to compute image hidden states, otherwise they are cached within each cross attn layer - if (input_ids == self.config.image_token_index).any(): + if cache_position[0] == 0: model_inputs["pixel_values"] = pixel_values model_inputs["aspect_ratio_ids"] = aspect_ratio_ids model_inputs["aspect_ratio_mask"] = aspect_ratio_mask diff --git a/utils/check_config_attributes.py b/utils/check_config_attributes.py index 7bd937963651d5..83fe07fef2eda0 100644 --- a/utils/check_config_attributes.py +++ b/utils/check_config_attributes.py @@ -243,6 +243,7 @@ def check_attribute_being_used(config_class, attributes, default_value, source_s "pad_index", "unk_index", "mask_index", + "image_token_index", # for VLMs "image_size", "use_cache", "out_features", From 37ac07853539216baef9abadfda5782e7b20dc87 Mon Sep 17 00:00:00 2001 From: Joao Gante Date: Fri, 11 Oct 2024 16:11:18 +0100 Subject: [PATCH 004/385] Generate: move `prepare_inputs_for_generation` in encoder-decoder llms (#34048) --- src/transformers/generation/utils.py | 13 ++++--- src/transformers/models/bart/modeling_bart.py | 39 ------------------- .../modeling_bigbird_pegasus.py | 39 ------------------- .../models/blenderbot/modeling_blenderbot.py | 37 ------------------ .../modeling_blenderbot_small.py | 37 ------------------ .../models/blip/modeling_blip_text.py | 2 + src/transformers/models/fsmt/modeling_fsmt.py | 26 +------------ src/transformers/models/led/modeling_led.py | 30 -------------- .../models/longt5/modeling_longt5.py | 36 ----------------- .../models/m2m_100/modeling_m2m_100.py | 37 ------------------ .../models/marian/modeling_marian.py | 39 +------------------ .../models/mbart/modeling_mbart.py | 37 ------------------ src/transformers/models/mt5/modeling_mt5.py | 39 ------------------- src/transformers/models/mvp/modeling_mvp.py | 37 ------------------ .../models/nllb_moe/modeling_nllb_moe.py | 38 ------------------ .../models/pegasus/modeling_pegasus.py | 37 ------------------ .../models/pegasus_x/modeling_pegasus_x.py | 31 --------------- .../models/plbart/modeling_plbart.py | 39 +------------------ .../models/prophetnet/modeling_prophetnet.py | 29 -------------- src/transformers/models/rag/modeling_rag.py | 2 + .../modeling_switch_transformers.py | 39 ------------------- src/transformers/models/t5/modeling_t5.py | 38 ------------------ src/transformers/models/umt5/modeling_umt5.py | 39 ------------------- .../models/zamba/modeling_zamba.py | 2 + tests/generation/test_utils.py | 32 +++++++++++++++ 25 files changed, 49 insertions(+), 725 deletions(-) diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py index 5da4878513eb22..2225b033aa0a9e 100644 --- a/src/transformers/generation/utils.py +++ b/src/transformers/generation/utils.py @@ -387,13 +387,14 @@ def prepare_inputs_for_generation( input_ids = input_ids[:, cache_position] # 3. 
Prepare base model inputs + input_ids_key = "decoder_input_ids" if self.config.is_encoder_decoder else "input_ids" # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and cache_position[0] == 0: - model_inputs["input_ids"] = None + if inputs_embeds is not None and not self.config.is_encoder_decoder and cache_position[0] == 0: + model_inputs[input_ids_key] = None model_inputs["inputs_embeds"] = inputs_embeds else: # `clone` calls in this function ensure a consistent stride. See #32227 - model_inputs["input_ids"] = input_ids.clone(memory_format=torch.contiguous_format) + model_inputs[input_ids_key] = input_ids.clone(memory_format=torch.contiguous_format) model_inputs["inputs_embeds"] = None # 4. Create missing `position_ids` on the fly @@ -421,8 +422,8 @@ def prepare_inputs_for_generation( batch_size, sequence_length, _ = model_inputs["inputs_embeds"].shape device = model_inputs["inputs_embeds"].device else: - batch_size, sequence_length = model_inputs["input_ids"].shape - device = model_inputs["input_ids"].device + batch_size, sequence_length = model_inputs[input_ids_key].shape + device = model_inputs[input_ids_key].device # Create the causal mask with fixed shape in advance, to reduce recompilations. If the function to create # the 4D causal mask exists, it should be present in the base model (XXXModel class). @@ -455,6 +456,8 @@ def prepare_inputs_for_generation( if key not in model_inputs: model_inputs[key] = value + # 8. Remove unexpected `generate` inputs (TODO @joao: fix trainer and examples) + model_inputs.pop("labels", None) return model_inputs def _prepare_model_inputs( diff --git a/src/transformers/models/bart/modeling_bart.py b/src/transformers/models/bart/modeling_bart.py index 822be354fb9da0..07c1fa622ea3b6 100755 --- a/src/transformers/models/bart/modeling_bart.py +++ b/src/transformers/models/bart/modeling_bart.py @@ -1682,45 +1682,6 @@ def forward( encoder_attentions=outputs.encoder_attentions, ) - def prepare_inputs_for_generation( - self, - decoder_input_ids, - past_key_values=None, - attention_mask=None, - decoder_attention_mask=None, - head_mask=None, - decoder_head_mask=None, - cross_attn_head_mask=None, - use_cache=None, - encoder_outputs=None, - **kwargs, - ): - # cut decoder_input_ids if past_key_values is used - if past_key_values is not None: - past_length = past_key_values[0][0].shape[2] - - # Some generation methods already pass only the last input ID - if decoder_input_ids.shape[1] > past_length: - remove_prefix_length = past_length - else: - # Default to old behavior: keep only final ID - remove_prefix_length = decoder_input_ids.shape[1] - 1 - - decoder_input_ids = decoder_input_ids[:, remove_prefix_length:] - - return { - "input_ids": None, # encoder_outputs is defined. 
input_ids not needed - "encoder_outputs": encoder_outputs, - "past_key_values": past_key_values, - "decoder_input_ids": decoder_input_ids, - "attention_mask": attention_mask, - "decoder_attention_mask": decoder_attention_mask, - "head_mask": head_mask, - "decoder_head_mask": decoder_head_mask, - "cross_attn_head_mask": cross_attn_head_mask, - "use_cache": use_cache, # change this to avoid caching (presumably for debugging) - } - def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor): return shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id) diff --git a/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py b/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py index e26dce1edfc20f..19540a7498f5bd 100755 --- a/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +++ b/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py @@ -2561,45 +2561,6 @@ def forward( encoder_attentions=outputs.encoder_attentions, ) - def prepare_inputs_for_generation( - self, - decoder_input_ids, - past_key_values=None, - attention_mask=None, - decoder_attention_mask=None, - head_mask=None, - decoder_head_mask=None, - cross_attn_head_mask=None, - use_cache=None, - encoder_outputs=None, - **kwargs, - ): - # cut decoder_input_ids if past_key_values is used - if past_key_values is not None: - past_length = past_key_values[0][0].shape[2] - - # Some generation methods already pass only the last input ID - if decoder_input_ids.shape[1] > past_length: - remove_prefix_length = past_length - else: - # Default to old behavior: keep only final ID - remove_prefix_length = decoder_input_ids.shape[1] - 1 - - decoder_input_ids = decoder_input_ids[:, remove_prefix_length:] - - return { - "input_ids": None, # encoder_outputs is defined. 
input_ids not needed - "encoder_outputs": encoder_outputs, - "past_key_values": past_key_values, - "decoder_input_ids": decoder_input_ids, - "attention_mask": attention_mask, - "decoder_attention_mask": decoder_attention_mask, - "head_mask": head_mask, - "decoder_head_mask": decoder_head_mask, - "cross_attn_head_mask": cross_attn_head_mask, - "use_cache": use_cache, # change this to avoid caching (presumably for debugging) - } - def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor): return shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id) diff --git a/src/transformers/models/blenderbot/modeling_blenderbot.py b/src/transformers/models/blenderbot/modeling_blenderbot.py index ae37f546e510db..5c4fdfb472c37e 100755 --- a/src/transformers/models/blenderbot/modeling_blenderbot.py +++ b/src/transformers/models/blenderbot/modeling_blenderbot.py @@ -1333,43 +1333,6 @@ def forward( encoder_attentions=outputs.encoder_attentions, ) - def prepare_inputs_for_generation( - self, - decoder_input_ids, - past_key_values=None, - attention_mask=None, - head_mask=None, - decoder_head_mask=None, - cross_attn_head_mask=None, - use_cache=None, - encoder_outputs=None, - **kwargs, - ): - # cut decoder_input_ids if past is used - if past_key_values is not None: - past_length = past_key_values[0][0].shape[2] - - # Some generation methods already pass only the last input ID - if decoder_input_ids.shape[1] > past_length: - remove_prefix_length = past_length - else: - # Default to old behavior: keep only final ID - remove_prefix_length = decoder_input_ids.shape[1] - 1 - - decoder_input_ids = decoder_input_ids[:, remove_prefix_length:] - - return { - "input_ids": None, # encoder_outputs is defined. input_ids not needed - "encoder_outputs": encoder_outputs, - "past_key_values": past_key_values, - "decoder_input_ids": decoder_input_ids, - "attention_mask": attention_mask, - "head_mask": head_mask, - "decoder_head_mask": decoder_head_mask, - "cross_attn_head_mask": cross_attn_head_mask, - "use_cache": use_cache, # change this to avoid caching (presumably for debugging) - } - @staticmethod def _reorder_cache(past_key_values, beam_idx): reordered_past = () diff --git a/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py b/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py index 93298c4e80e55b..6f79d2a7d005cc 100755 --- a/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py +++ b/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py @@ -1285,43 +1285,6 @@ def forward( encoder_attentions=outputs.encoder_attentions, ) - def prepare_inputs_for_generation( - self, - decoder_input_ids, - past_key_values=None, - attention_mask=None, - head_mask=None, - decoder_head_mask=None, - cross_attn_head_mask=None, - use_cache=None, - encoder_outputs=None, - **kwargs, - ): - # cut decoder_input_ids if past is used - if past_key_values is not None: - past_length = past_key_values[0][0].shape[2] - - # Some generation methods already pass only the last input ID - if decoder_input_ids.shape[1] > past_length: - remove_prefix_length = past_length - else: - # Default to old behavior: keep only final ID - remove_prefix_length = decoder_input_ids.shape[1] - 1 - - decoder_input_ids = decoder_input_ids[:, remove_prefix_length:] - - return { - "input_ids": None, # encoder_outputs is defined. 
input_ids not needed - "encoder_outputs": encoder_outputs, - "past_key_values": past_key_values, - "decoder_input_ids": decoder_input_ids, - "attention_mask": attention_mask, - "head_mask": head_mask, - "decoder_head_mask": decoder_head_mask, - "cross_attn_head_mask": cross_attn_head_mask, - "use_cache": use_cache, # change this to avoid caching (presumably for debugging) - } - @staticmethod def _reorder_cache(past_key_values, beam_idx): reordered_past = () diff --git a/src/transformers/models/blip/modeling_blip_text.py b/src/transformers/models/blip/modeling_blip_text.py index 78384e6ce2f74b..5ee7ae21f9d549 100644 --- a/src/transformers/models/blip/modeling_blip_text.py +++ b/src/transformers/models/blip/modeling_blip_text.py @@ -915,6 +915,8 @@ def forward( ) def prepare_inputs_for_generation(self, input_ids, past_key_values=None, attention_mask=None, **model_kwargs): + # Overwrite -- hardcoded key return (`is_decoder=True`) + input_shape = input_ids.shape # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly if attention_mask is None: diff --git a/src/transformers/models/fsmt/modeling_fsmt.py b/src/transformers/models/fsmt/modeling_fsmt.py index 4d50f9bb5925b4..3f865c037c01b6 100644 --- a/src/transformers/models/fsmt/modeling_fsmt.py +++ b/src/transformers/models/fsmt/modeling_fsmt.py @@ -1191,7 +1191,7 @@ def __init__(self, config: FSMTConfig): @add_end_docstrings(FSMT_GENERATION_EXAMPLE) def forward( self, - input_ids: torch.LongTensor, + input_ids: Optional[torch.LongTensor] = None, attention_mask: Optional[torch.Tensor] = None, decoder_input_ids: Optional[torch.LongTensor] = None, decoder_attention_mask: Optional[torch.BoolTensor] = None, @@ -1263,30 +1263,6 @@ def forward( encoder_attentions=outputs.encoder_attentions, ) - def prepare_inputs_for_generation( - self, - decoder_input_ids, - past_key_values=None, - attention_mask=None, - head_mask=None, - decoder_head_mask=None, - cross_attn_head_mask=None, - use_cache=None, - encoder_outputs=None, - **kwargs, - ): - return { - "input_ids": None, # encoder_outputs is defined. input_ids not needed - "encoder_outputs": encoder_outputs, - "past_key_values": past_key_values, - "decoder_input_ids": decoder_input_ids, - "attention_mask": attention_mask, - "head_mask": head_mask, - "decoder_head_mask": decoder_head_mask, - "cross_attn_head_mask": cross_attn_head_mask, - "use_cache": use_cache, # change this to avoid caching (presumably for debugging) - } - def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor): return shift_tokens_right(labels, self.config.pad_token_id) diff --git a/src/transformers/models/led/modeling_led.py b/src/transformers/models/led/modeling_led.py index f96bfd82b52638..ee1ad90bfceaa2 100755 --- a/src/transformers/models/led/modeling_led.py +++ b/src/transformers/models/led/modeling_led.py @@ -2437,36 +2437,6 @@ def forward( encoder_global_attentions=outputs.encoder_global_attentions, ) - def prepare_inputs_for_generation( - self, - decoder_input_ids, - past_key_values=None, - attention_mask=None, - global_attention_mask=None, - head_mask=None, - decoder_head_mask=None, - cross_attn_head_mask=None, - use_cache=None, - encoder_outputs=None, - **kwargs, - ): - # cut decoder_input_ids if past is used - if past_key_values is not None: - decoder_input_ids = decoder_input_ids[:, -1:] - - return { - "input_ids": None, # encoder_outputs is defined. 
input_ids not needed - "encoder_outputs": encoder_outputs, - "past_key_values": past_key_values, - "decoder_input_ids": decoder_input_ids, - "attention_mask": attention_mask, - "global_attention_mask": global_attention_mask, - "head_mask": head_mask, - "decoder_head_mask": decoder_head_mask, - "cross_attn_head_mask": cross_attn_head_mask, - "use_cache": use_cache, # change this to avoid caching (presumably for debugging) - } - def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor): return shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id) diff --git a/src/transformers/models/longt5/modeling_longt5.py b/src/transformers/models/longt5/modeling_longt5.py index 8f9385c0fe76ed..d351e798ac7f88 100644 --- a/src/transformers/models/longt5/modeling_longt5.py +++ b/src/transformers/models/longt5/modeling_longt5.py @@ -2085,42 +2085,6 @@ def forward( encoder_attentions=encoder_outputs.attentions, ) - def prepare_inputs_for_generation( - self, - input_ids, - past_key_values=None, - attention_mask=None, - head_mask=None, - decoder_head_mask=None, - cross_attn_head_mask=None, - use_cache=None, - encoder_outputs=None, - **kwargs, - ): - # cut decoder_input_ids if past_key_values is used - if past_key_values is not None: - past_length = past_key_values[0][0].shape[2] - - # Some generation methods already pass only the last input ID - if input_ids.shape[1] > past_length: - remove_prefix_length = past_length - else: - # Default to old behavior: keep only final ID - remove_prefix_length = input_ids.shape[1] - 1 - - input_ids = input_ids[:, remove_prefix_length:] - - return { - "decoder_input_ids": input_ids, - "past_key_values": past_key_values, - "encoder_outputs": encoder_outputs, - "attention_mask": attention_mask, - "head_mask": head_mask, - "decoder_head_mask": decoder_head_mask, - "cross_attn_head_mask": cross_attn_head_mask, - "use_cache": use_cache, - } - def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor): return self._shift_right(labels) diff --git a/src/transformers/models/m2m_100/modeling_m2m_100.py b/src/transformers/models/m2m_100/modeling_m2m_100.py index 1588aa28aa2dbf..cc35a3504255bf 100755 --- a/src/transformers/models/m2m_100/modeling_m2m_100.py +++ b/src/transformers/models/m2m_100/modeling_m2m_100.py @@ -1621,43 +1621,6 @@ def forward( encoder_attentions=outputs.encoder_attentions, ) - def prepare_inputs_for_generation( - self, - decoder_input_ids, - past_key_values=None, - attention_mask=None, - head_mask=None, - decoder_head_mask=None, - cross_attn_head_mask=None, - use_cache=None, - encoder_outputs=None, - **kwargs, - ): - # cut decoder_input_ids if past is used - if past_key_values is not None: - past_length = past_key_values[0][0].shape[2] - - # Some generation methods already pass only the last input ID - if decoder_input_ids.shape[1] > past_length: - remove_prefix_length = past_length - else: - # Default to old behavior: keep only final ID - remove_prefix_length = decoder_input_ids.shape[1] - 1 - - decoder_input_ids = decoder_input_ids[:, remove_prefix_length:] - - return { - "input_ids": None, # encoder_outputs is defined. 
input_ids not needed - "encoder_outputs": encoder_outputs, - "past_key_values": past_key_values, - "decoder_input_ids": decoder_input_ids, - "attention_mask": attention_mask, - "head_mask": head_mask, - "decoder_head_mask": decoder_head_mask, - "cross_attn_head_mask": cross_attn_head_mask, - "use_cache": use_cache, # change this to avoid caching (presumably for debugging) - } - @staticmethod def _reorder_cache(past_key_values, beam_idx): reordered_past = () diff --git a/src/transformers/models/marian/modeling_marian.py b/src/transformers/models/marian/modeling_marian.py index bbb3381bd97325..2d7c7d85daed64 100755 --- a/src/transformers/models/marian/modeling_marian.py +++ b/src/transformers/models/marian/modeling_marian.py @@ -16,7 +16,7 @@ import copy import math -from typing import Dict, List, Optional, Tuple, Union +from typing import List, Optional, Tuple, Union import numpy as np import torch @@ -1438,43 +1438,6 @@ def forward( encoder_attentions=outputs.encoder_attentions, ) - def prepare_inputs_for_generation( - self, - decoder_input_ids: torch.LongTensor, - past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, - attention_mask: Optional[torch.Tensor] = None, - head_mask: Optional[torch.Tensor] = None, - decoder_head_mask: Optional[torch.Tensor] = None, - cross_attn_head_mask: Optional[torch.Tensor] = None, - use_cache: Optional[bool] = None, - encoder_outputs: Optional[Union[Tuple[torch.Tensor], BaseModelOutput]] = None, - **kwargs, - ) -> Dict: - # cut decoder_input_ids if past is used - if past_key_values is not None: - past_length = past_key_values[0][0].shape[2] - - # Some generation methods already pass only the last input ID - if decoder_input_ids.shape[1] > past_length: - remove_prefix_length = past_length - else: - # Default to old behavior: keep only final ID - remove_prefix_length = decoder_input_ids.shape[1] - 1 - - decoder_input_ids = decoder_input_ids[:, remove_prefix_length:] - - return { - "input_ids": None, # encoder_outputs is defined. 
input_ids not needed - "encoder_outputs": encoder_outputs, - "past_key_values": past_key_values, - "decoder_input_ids": decoder_input_ids, - "attention_mask": attention_mask, - "head_mask": head_mask, - "decoder_head_mask": decoder_head_mask, - "cross_attn_head_mask": cross_attn_head_mask, - "use_cache": use_cache, # change this to avoid caching (presumably for debugging) - } - def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor): return shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id) diff --git a/src/transformers/models/mbart/modeling_mbart.py b/src/transformers/models/mbart/modeling_mbart.py index a10d62d6dcc338..95cd7c65ef32c2 100755 --- a/src/transformers/models/mbart/modeling_mbart.py +++ b/src/transformers/models/mbart/modeling_mbart.py @@ -1647,43 +1647,6 @@ def forward( encoder_attentions=outputs.encoder_attentions, ) - def prepare_inputs_for_generation( - self, - decoder_input_ids, - past_key_values=None, - attention_mask=None, - head_mask=None, - decoder_head_mask=None, - cross_attn_head_mask=None, - use_cache=None, - encoder_outputs=None, - **kwargs, - ): - # cut decoder_input_ids if past is used - if past_key_values is not None: - past_length = past_key_values[0][0].shape[2] - - # Some generation methods already pass only the last input ID - if decoder_input_ids.shape[1] > past_length: - remove_prefix_length = past_length - else: - # Default to old behavior: keep only final ID - remove_prefix_length = decoder_input_ids.shape[1] - 1 - - decoder_input_ids = decoder_input_ids[:, remove_prefix_length:] - - return { - "input_ids": None, # encoder_outputs is defined. input_ids not needed - "encoder_outputs": encoder_outputs, - "past_key_values": past_key_values, - "decoder_input_ids": decoder_input_ids, - "attention_mask": attention_mask, - "head_mask": head_mask, - "decoder_head_mask": decoder_head_mask, - "cross_attn_head_mask": cross_attn_head_mask, - "use_cache": use_cache, # change this to avoid caching (presumably for debugging) - } - def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor): return shift_tokens_right(labels, self.config.pad_token_id) diff --git a/src/transformers/models/mt5/modeling_mt5.py b/src/transformers/models/mt5/modeling_mt5.py index 6a7406f11b5b56..9051414d7414fa 100644 --- a/src/transformers/models/mt5/modeling_mt5.py +++ b/src/transformers/models/mt5/modeling_mt5.py @@ -1820,45 +1820,6 @@ def forward( encoder_attentions=encoder_outputs.attentions, ) - # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.prepare_inputs_for_generation - def prepare_inputs_for_generation( - self, - input_ids, - past_key_values=None, - attention_mask=None, - head_mask=None, - decoder_head_mask=None, - decoder_attention_mask=None, - cross_attn_head_mask=None, - use_cache=None, - encoder_outputs=None, - **kwargs, - ): - # cut decoder_input_ids if past_key_values is used - if past_key_values is not None: - past_length = past_key_values[0][0].shape[2] - - # Some generation methods already pass only the last input ID - if input_ids.shape[1] > past_length: - remove_prefix_length = past_length - else: - # Default to old behavior: keep only final ID - remove_prefix_length = input_ids.shape[1] - 1 - - input_ids = input_ids[:, remove_prefix_length:] - - return { - "decoder_input_ids": input_ids, - "past_key_values": past_key_values, - "encoder_outputs": encoder_outputs, - "attention_mask": attention_mask, - "head_mask": head_mask, - "decoder_head_mask": decoder_head_mask, - 
"decoder_attention_mask": decoder_attention_mask, - "cross_attn_head_mask": cross_attn_head_mask, - "use_cache": use_cache, - } - # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.prepare_decoder_input_ids_from_labels def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor): return self._shift_right(labels) diff --git a/src/transformers/models/mvp/modeling_mvp.py b/src/transformers/models/mvp/modeling_mvp.py index 5a466c0cec012d..f68a4bb76b3e71 100644 --- a/src/transformers/models/mvp/modeling_mvp.py +++ b/src/transformers/models/mvp/modeling_mvp.py @@ -1475,43 +1475,6 @@ def forward( encoder_attentions=outputs.encoder_attentions, ) - def prepare_inputs_for_generation( - self, - decoder_input_ids, - past_key_values=None, - attention_mask=None, - head_mask=None, - decoder_head_mask=None, - cross_attn_head_mask=None, - use_cache=None, - encoder_outputs=None, - **kwargs, - ): - # cut decoder_input_ids if past is used - if past_key_values is not None: - past_length = past_key_values[0][0].shape[2] - - # Some generation methods already pass only the last input ID - if decoder_input_ids.shape[1] > past_length: - remove_prefix_length = past_length - else: - # Default to old behavior: keep only final ID - remove_prefix_length = decoder_input_ids.shape[1] - 1 - - decoder_input_ids = decoder_input_ids[:, remove_prefix_length:] - - return { - "input_ids": None, # encoder_outputs is defined. input_ids not needed - "encoder_outputs": encoder_outputs, - "past_key_values": past_key_values, - "decoder_input_ids": decoder_input_ids, - "attention_mask": attention_mask, - "head_mask": head_mask, - "decoder_head_mask": decoder_head_mask, - "cross_attn_head_mask": cross_attn_head_mask, - "use_cache": use_cache, # change this to avoid caching (presumably for debugging) - } - def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor): return shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id) diff --git a/src/transformers/models/nllb_moe/modeling_nllb_moe.py b/src/transformers/models/nllb_moe/modeling_nllb_moe.py index cedefc4f4642f7..9c095be16506e8 100644 --- a/src/transformers/models/nllb_moe/modeling_nllb_moe.py +++ b/src/transformers/models/nllb_moe/modeling_nllb_moe.py @@ -1762,44 +1762,6 @@ def _unpack_router_logits(self, router_outputs): total_expert_indexes = torch.stack(total_expert_indexes, dim=1) if len(total_expert_indexes) > 0 else None return total_router_logits, total_expert_indexes - # Copied from transfomers.models.switch_transformers.SwitchTransformersForConditionalGeneration.prepare_inputs_for_generation - def prepare_inputs_for_generation( - self, - decoder_input_ids, - past_key_values=None, - attention_mask=None, - head_mask=None, - decoder_head_mask=None, - cross_attn_head_mask=None, - use_cache=None, - encoder_outputs=None, - **kwargs, - ): - # cut decoder_input_ids if past is used - if past_key_values is not None: - past_length = past_key_values[0][0].shape[2] - - # Some generation methods already pass only the last input ID - if decoder_input_ids.shape[1] > past_length: - remove_prefix_length = past_length - else: - # Default to old behavior: keep only final ID - remove_prefix_length = decoder_input_ids.shape[1] - 1 - - decoder_input_ids = decoder_input_ids[:, remove_prefix_length:] - - return { - "input_ids": None, # encoder_outputs is defined. 
input_ids not needed - "encoder_outputs": encoder_outputs, - "past_key_values": past_key_values, - "decoder_input_ids": decoder_input_ids, - "attention_mask": attention_mask, - "head_mask": head_mask, - "decoder_head_mask": decoder_head_mask, - "cross_attn_head_mask": cross_attn_head_mask, - "use_cache": use_cache, # change this to avoid caching (presumably for debugging) - } - @staticmethod def _reorder_cache(past_key_values, beam_idx): reordered_past = () diff --git a/src/transformers/models/pegasus/modeling_pegasus.py b/src/transformers/models/pegasus/modeling_pegasus.py index 35f91ca7356611..a737ef14d647cf 100755 --- a/src/transformers/models/pegasus/modeling_pegasus.py +++ b/src/transformers/models/pegasus/modeling_pegasus.py @@ -1390,43 +1390,6 @@ def forward( encoder_attentions=outputs.encoder_attentions, ) - def prepare_inputs_for_generation( - self, - decoder_input_ids, - past_key_values=None, - attention_mask=None, - head_mask=None, - decoder_head_mask=None, - cross_attn_head_mask=None, - use_cache=None, - encoder_outputs=None, - **kwargs, - ): - # cut decoder_input_ids if past is used - if past_key_values is not None: - past_length = past_key_values[0][0].shape[2] - - # Some generation methods already pass only the last input ID - if decoder_input_ids.shape[1] > past_length: - remove_prefix_length = past_length - else: - # Default to old behavior: keep only final ID - remove_prefix_length = decoder_input_ids.shape[1] - 1 - - decoder_input_ids = decoder_input_ids[:, remove_prefix_length:] - - return { - "input_ids": None, # encoder_outputs is defined. input_ids not needed - "encoder_outputs": encoder_outputs, - "past_key_values": past_key_values, - "decoder_input_ids": decoder_input_ids, - "attention_mask": attention_mask, - "head_mask": head_mask, - "decoder_head_mask": decoder_head_mask, - "cross_attn_head_mask": cross_attn_head_mask, - "use_cache": use_cache, # change this to avoid caching (presumably for debugging) - } - def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor): return shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id) diff --git a/src/transformers/models/pegasus_x/modeling_pegasus_x.py b/src/transformers/models/pegasus_x/modeling_pegasus_x.py index 77c0b32e6433c4..f90a8d2deb2651 100755 --- a/src/transformers/models/pegasus_x/modeling_pegasus_x.py +++ b/src/transformers/models/pegasus_x/modeling_pegasus_x.py @@ -1588,37 +1588,6 @@ def forward( encoder_attentions=outputs.encoder_attentions, ) - def prepare_inputs_for_generation( - self, - decoder_input_ids, - past_key_values=None, - attention_mask=None, - use_cache=None, - encoder_outputs=None, - **kwargs, - ): - # cut decoder_input_ids if past is used - if past_key_values is not None: - past_length = past_key_values[0][0].shape[2] - - # Some generation methods already pass only the last input ID - if decoder_input_ids.shape[1] > past_length: - remove_prefix_length = past_length - else: - # Default to old behavior: keep only final ID - remove_prefix_length = decoder_input_ids.shape[1] - 1 - - decoder_input_ids = decoder_input_ids[:, remove_prefix_length:] - - return { - "input_ids": None, # encoder_outputs is defined. 
input_ids not needed - "encoder_outputs": encoder_outputs, - "past_key_values": past_key_values, - "decoder_input_ids": decoder_input_ids, - "attention_mask": attention_mask, - "use_cache": use_cache, # change this to avoid caching (presumably for debugging) - } - def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor): return shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id) diff --git a/src/transformers/models/plbart/modeling_plbart.py b/src/transformers/models/plbart/modeling_plbart.py index 4f6984a7bef638..490fefc686a524 100644 --- a/src/transformers/models/plbart/modeling_plbart.py +++ b/src/transformers/models/plbart/modeling_plbart.py @@ -16,7 +16,7 @@ import copy import math -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import List, Optional, Tuple, Union import torch import torch.utils.checkpoint @@ -1372,43 +1372,6 @@ def forward( encoder_attentions=outputs.encoder_attentions, ) - def prepare_inputs_for_generation( - self, - decoder_input_ids: torch.LongTensor, - past_key_values: Optional[List[torch.FloatTensor]] = None, - attention_mask: Optional[torch.LongTensor] = None, - head_mask: Optional[torch.Tensor] = None, - decoder_head_mask: Optional[torch.Tensor] = None, - cross_attn_head_mask: Optional[torch.Tensor] = None, - use_cache: Optional[bool] = None, - encoder_outputs: Optional[List[torch.FloatTensor]] = None, - **kwargs, # TODO: Check if this is needed. It is unused? - ) -> Dict[str, Any]: - # cut decoder_input_ids if past is used - if past_key_values is not None: - past_length = past_key_values[0][0].shape[2] - - # Some generation methods already pass only the last input ID - if decoder_input_ids.shape[1] > past_length: - remove_prefix_length = past_length - else: - # Default to old behavior: keep only final ID - remove_prefix_length = decoder_input_ids.shape[1] - 1 - - decoder_input_ids = decoder_input_ids[:, remove_prefix_length:] - - return { - "input_ids": None, # encoder_outputs is defined. input_ids not needed - "encoder_outputs": encoder_outputs, - "past_key_values": past_key_values, - "decoder_input_ids": decoder_input_ids, - "attention_mask": attention_mask, - "head_mask": head_mask, - "decoder_head_mask": decoder_head_mask, - "cross_attn_head_mask": cross_attn_head_mask, - "use_cache": use_cache, # change this to avoid caching (presumably for debugging) - } - def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor): return shift_tokens_right(labels, self.config.pad_token_id) diff --git a/src/transformers/models/prophetnet/modeling_prophetnet.py b/src/transformers/models/prophetnet/modeling_prophetnet.py index 003e4f15d2d977..137bd5ad828df5 100644 --- a/src/transformers/models/prophetnet/modeling_prophetnet.py +++ b/src/transformers/models/prophetnet/modeling_prophetnet.py @@ -2018,35 +2018,6 @@ def _compute_loss(self, logits, labels, ignore_index=-100): return loss - def prepare_inputs_for_generation( - self, - decoder_input_ids, - past_key_values=None, - attention_mask=None, - head_mask=None, - decoder_head_mask=None, - cross_attn_head_mask=None, - use_cache=None, - encoder_outputs=None, - **kwargs, - ): - assert encoder_outputs is not None, "`encoder_outputs` have to be passed for generation." - - if past_key_values: - decoder_input_ids = decoder_input_ids[:, -1:] - # first step, decoder_cached_states are empty - return { - "input_ids": None, # encoder_outputs is defined. 
input_ids not needed - "encoder_outputs": encoder_outputs, - "past_key_values": past_key_values, - "decoder_input_ids": decoder_input_ids, - "attention_mask": attention_mask, - "head_mask": head_mask, - "decoder_head_mask": decoder_head_mask, - "cross_attn_head_mask": cross_attn_head_mask, - "use_cache": use_cache, - } - def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor): return self._shift_right(labels) diff --git a/src/transformers/models/rag/modeling_rag.py b/src/transformers/models/rag/modeling_rag.py index bc375b68e947ab..5e6f13ca478f32 100644 --- a/src/transformers/models/rag/modeling_rag.py +++ b/src/transformers/models/rag/modeling_rag.py @@ -1172,6 +1172,8 @@ def prepare_inputs_for_generation( n_docs=None, **kwargs, ): + # Overwritten -- `do_marginalize` is explicitly set in the output + if past_key_values is not None: # if past is defined use only last decoder_input_ids decoder_input_ids = decoder_input_ids[:, -1:] diff --git a/src/transformers/models/switch_transformers/modeling_switch_transformers.py b/src/transformers/models/switch_transformers/modeling_switch_transformers.py index f1495ddc8c0057..c39e85bacdd3d1 100644 --- a/src/transformers/models/switch_transformers/modeling_switch_transformers.py +++ b/src/transformers/models/switch_transformers/modeling_switch_transformers.py @@ -1702,45 +1702,6 @@ def _unpack_router_logits(self, router_outputs): total_expert_indexes.append(expert_indexes) return torch.cat(total_router_logits, dim=1), torch.cat(total_expert_indexes, dim=1) - def prepare_inputs_for_generation( - self, - input_ids, - past_key_values=None, - attention_mask=None, - head_mask=None, - decoder_head_mask=None, - cross_attn_head_mask=None, - use_cache=None, - encoder_outputs=None, - **kwargs, - ): - # cut decoder_input_ids if past_key_values is used - if past_key_values is not None: - past_length = past_key_values[0][0].shape[2] - - # Some generation methods already pass only the last input ID - if input_ids.shape[1] > past_length: - remove_prefix_length = past_length - else: - # Default to old behavior: keep only final ID - remove_prefix_length = input_ids.shape[1] - 1 - - input_ids = input_ids[:, remove_prefix_length:] - - output_router_logits = kwargs.get("output_router_logits", True) - - return { - "decoder_input_ids": input_ids, - "past_key_values": past_key_values, - "encoder_outputs": encoder_outputs, - "attention_mask": attention_mask, - "head_mask": head_mask, - "decoder_head_mask": decoder_head_mask, - "cross_attn_head_mask": cross_attn_head_mask, - "use_cache": use_cache, - "output_router_logits": output_router_logits, - } - def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor): return self._shift_right(labels) diff --git a/src/transformers/models/t5/modeling_t5.py b/src/transformers/models/t5/modeling_t5.py index 43e3f3afa4a837..91596f013ab4f5 100644 --- a/src/transformers/models/t5/modeling_t5.py +++ b/src/transformers/models/t5/modeling_t5.py @@ -1791,44 +1791,6 @@ def forward( encoder_attentions=encoder_outputs.attentions, ) - def prepare_inputs_for_generation( - self, - input_ids, - past_key_values=None, - attention_mask=None, - head_mask=None, - decoder_head_mask=None, - decoder_attention_mask=None, - cross_attn_head_mask=None, - use_cache=None, - encoder_outputs=None, - **kwargs, - ): - # cut decoder_input_ids if past_key_values is used - if past_key_values is not None: - past_length = past_key_values[0][0].shape[2] - - # Some generation methods already pass only the last input ID - if input_ids.shape[1] > 
past_length: - remove_prefix_length = past_length - else: - # Default to old behavior: keep only final ID - remove_prefix_length = input_ids.shape[1] - 1 - - input_ids = input_ids[:, remove_prefix_length:] - - return { - "decoder_input_ids": input_ids, - "past_key_values": past_key_values, - "encoder_outputs": encoder_outputs, - "attention_mask": attention_mask, - "head_mask": head_mask, - "decoder_head_mask": decoder_head_mask, - "decoder_attention_mask": decoder_attention_mask, - "cross_attn_head_mask": cross_attn_head_mask, - "use_cache": use_cache, - } - def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor): return self._shift_right(labels) diff --git a/src/transformers/models/umt5/modeling_umt5.py b/src/transformers/models/umt5/modeling_umt5.py index a7d1e5bacc65e5..bd621fc2fb3ac2 100644 --- a/src/transformers/models/umt5/modeling_umt5.py +++ b/src/transformers/models/umt5/modeling_umt5.py @@ -1302,45 +1302,6 @@ def forward( encoder_attentions=encoder_outputs.attentions, ) - # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.prepare_inputs_for_generation - def prepare_inputs_for_generation( - self, - input_ids, - past_key_values=None, - attention_mask=None, - head_mask=None, - decoder_head_mask=None, - decoder_attention_mask=None, - cross_attn_head_mask=None, - use_cache=None, - encoder_outputs=None, - **kwargs, - ): - # cut decoder_input_ids if past_key_values is used - if past_key_values is not None: - past_length = past_key_values[0][0].shape[2] - - # Some generation methods already pass only the last input ID - if input_ids.shape[1] > past_length: - remove_prefix_length = past_length - else: - # Default to old behavior: keep only final ID - remove_prefix_length = input_ids.shape[1] - 1 - - input_ids = input_ids[:, remove_prefix_length:] - - return { - "decoder_input_ids": input_ids, - "past_key_values": past_key_values, - "encoder_outputs": encoder_outputs, - "attention_mask": attention_mask, - "head_mask": head_mask, - "decoder_head_mask": decoder_head_mask, - "decoder_attention_mask": decoder_attention_mask, - "cross_attn_head_mask": cross_attn_head_mask, - "use_cache": use_cache, - } - # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.prepare_decoder_input_ids_from_labels def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor): return self._shift_right(labels) diff --git a/src/transformers/models/zamba/modeling_zamba.py b/src/transformers/models/zamba/modeling_zamba.py index 2363ed04959d00..81326c07d6cce9 100644 --- a/src/transformers/models/zamba/modeling_zamba.py +++ b/src/transformers/models/zamba/modeling_zamba.py @@ -1519,6 +1519,8 @@ def prepare_inputs_for_generation( use_cache=True, **kwargs, ): + # Overwitten -- has a unique cache type, `HybridMambaAttentionDynamicCache` + empty_past_kv = past_key_values is None # Omit tokens covered by past_key_values diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py index 1727aed1117bc6..02f4f1b6127ab5 100644 --- a/tests/generation/test_utils.py +++ b/tests/generation/test_utils.py @@ -3841,6 +3841,38 @@ def test_prepare_inputs_for_generation_decoder_llm(self): self.assertTrue(model_inputs["input_ids"] is not None) self.assertTrue(model_inputs["inputs_embeds"] is None) + def test_prepare_inputs_for_generation_encoder_decoder_llm(self): + """ + Same as `test_prepare_inputs_for_generation_decoder_llm` but for encoder-decoder models. Main difference: we + should look for `decoder_input_ids`, instead of `input_ids`. 
+ """ + model = AutoModelForSeq2SeqLM.from_pretrained("hf-internal-testing/tiny-random-t5") + model = model.to(torch_device) + + # 1. Sanity check: the model's `prepare_inputs_for_generation` comes from `GenerationMixin` + self.assertTrue("GenerationMixin" in str(model.prepare_inputs_for_generation)) + + # 2. If we pass input ids by themselves, we should get back the same input ids -- with the encoder-decoder key + decoder_input_ids = torch.tensor([[1, 2, 3], [4, 5, 6]]).to(torch_device) + model_inputs = model.prepare_inputs_for_generation(decoder_input_ids) + self.assertTrue(torch.all(model_inputs["decoder_input_ids"] == decoder_input_ids)) + + # 3. If we pass the attention mask too, we will get back the attention mask. Encoder-decoder models usually + # don't use `position_ids` + decoder_attention_mask = torch.tensor([[1, 1, 1], [1, 1, 1]]).to(torch_device) + model_inputs = model.prepare_inputs_for_generation( + decoder_input_ids, decoder_attention_mask=decoder_attention_mask + ) + self.assertTrue(torch.all(model_inputs["decoder_attention_mask"] == decoder_attention_mask)) + self.assertTrue("position_ids" not in model_inputs) + + # 4. `use_cache` (and other kwargs, like the encoder outputs) are forwarded + self.assertFalse("use_cache" in model_inputs) # From the previous input, there is no `use_cache` + model_inputs = model.prepare_inputs_for_generation(decoder_input_ids, use_cache=True, encoder_outputs="foo") + self.assertTrue(model_inputs["use_cache"] is True) + self.assertTrue(model_inputs["encoder_outputs"] == "foo") + # See the decoder-only test for more corner cases. The code is the same, so we don't repeat it here. + def test_generate_compile_fullgraph_tiny(self): """ Tests that we can call end-to-end generation with a tiny model (i.e. doesn't crash) From 80bee7b11444a698894b114a923710ab8a772d30 Mon Sep 17 00:00:00 2001 From: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Date: Fri, 11 Oct 2024 17:41:50 +0200 Subject: [PATCH 005/385] Avoid many test failures for `LlavaNextVideoForConditionalGeneration` (#34070) * skip * [run-slow] llava_next_video * skip * [run-slow] video_llava, llava_next_video * skip * [run-slow] llava_next_video --------- Co-authored-by: ydshieh --- tests/generation/test_utils.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py index 02f4f1b6127ab5..5d92e8ce216aa1 100644 --- a/tests/generation/test_utils.py +++ b/tests/generation/test_utils.py @@ -1230,6 +1230,9 @@ def test_dola_decoding_sample(self): if any(model_name in model_class.__name__.lower() for model_name in ["marian", "mbart", "pegasus"]): self.skipTest("DoLa is not supported for models that don't return layerwise hidden states") + if any(model_name == model_class.__name__ for model_name in ["LlavaNextVideoForConditionalGeneration"]): + self.skipTest(f"DoLa is failing for {model_class.__name__}") + # enable cache if the model is not openai-gpt, xlnet, cpm, or xlm config, inputs_dict = self.prepare_config_and_inputs_for_generate() main_input = inputs_dict[model_class.main_input_name] From 144852fb6bbe584e9ff7d13511180aec42e1b366 Mon Sep 17 00:00:00 2001 From: Luc Georges Date: Fri, 11 Oct 2024 18:03:29 +0200 Subject: [PATCH 006/385] refactor: benchmarks (#33896) * refactor: benchmarks Based on a discussion with @LysandreJik & @ArthurZucker, the goal of this PR is to improve transformers' benchmark system. This is a WIP, for the moment the infrastructure required to make things work is not ready. 
Will update the PR description when it is the case. * feat: add db init in benchmarks CI * fix: pg_config is missing in runner * fix: add psql to the runner * fix: connect info from env vars + PR comments * refactor: set database as env var * fix: invalid working directory * fix: `commit_msg` -> `commit_message` * fix: git marking checked out repo as unsafe * feat: add logging * fix: invalid device * feat: update grafana dashboard for prod grafana * feat: add `commit_id` to header table * feat: commit latest version of dashboard * feat: move measurements into json field * feat: remove drop table migration queries * fix: `torch.arrange` -> `torch.arange` * fix: add missing `s` to `cache_position` positional argument * fix: change model * revert: `cache_positions` -> `cache_position` * fix: set device for `StaticCache` * fix: set `StaticCache` dtype * feat: limit max cache len * fix script * raise error on failure! * not try catch * try to skip generate compilation * update * update docker image! * update * update again!@ * update * updates * ??? * ?? * use `torch.cuda.synchronize()` * fix json * nits * fix * fixed! * f**k * feat: add TTNT panels * feat: add try except --------- Co-authored-by: Arthur Zucker --- .github/workflows/benchmark.yml | 73 +- benchmark/grafana_dashboard.json | 2211 ++++++++++++++++++++++++++++++ benchmark/init_db.sql | 26 + benchmark/llama.py | 404 ++++++ benchmark/requirements.txt | 5 + 5 files changed, 2697 insertions(+), 22 deletions(-) create mode 100644 benchmark/grafana_dashboard.json create mode 100644 benchmark/init_db.sql create mode 100644 benchmark/llama.py create mode 100644 benchmark/requirements.txt diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 75a837d693e7c6..c264dfe462aae7 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -1,43 +1,72 @@ name: Self-hosted runner (benchmark) on: - schedule: - - cron: "17 2 * * *" - workflow_call: + push: + branches: [main] + pull_request: + types: [ opened, labeled, reopened, synchronize ] + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true env: HF_HOME: /mnt/cache - TF_FORCE_GPU_ALLOW_GROWTH: true - jobs: benchmark: name: Benchmark - runs-on: + runs-on: group: aws-g5-4xlarge-cache container: - image: huggingface/transformers-all-latest-gpu - options: --gpus all --privileged --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + image: huggingface/transformers-pytorch-gpu + options: --gpus all --privileged --ipc host steps: - - name: Update clone - working-directory: /transformers + - name: Get repo + if: github.event_name == 'pull_request' + uses: actions/checkout@v4 + with: + ref: ${{ github.event.pull_request.head.sha }} + + - name: Get repo + if: github.event_name == 'push' + uses: actions/checkout@v4 + with: + ref: ${{ github.sha }} + + - name: Install libpq-dev & psql run: | - git fetch && git checkout ${{ github.sha }} + apt update + apt install -y libpq-dev postgresql-client + + - name: Install benchmark script dependencies + run: python3 -m pip install -r benchmark/requirements.txt - name: Reinstall transformers in edit mode (remove the one installed during docker image build) working-directory: /transformers - run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . 
+ run: python3 -m pip uninstall -y transformers && python3 -m pip install -e ".[torch]" - - name: Benchmark (daily) - if: github.event_name == 'schedule' - working-directory: /transformers + - name: Run database init script run: | - python3 -m pip install optimum-benchmark>=0.3.0 - HF_TOKEN=${{ secrets.TRANSFORMERS_BENCHMARK_TOKEN }} python3 benchmark/benchmark.py --repo_id hf-internal-testing/benchmark_results --path_in_repo $(date +'%Y-%m-%d') --config-dir benchmark/config --config-name generation --commit=${{ github.sha }} backend.model=google/gemma-2b backend.cache_implementation=null,static backend.torch_compile=false,true --multirun + psql -f benchmark/init_db.sql + env: + PGDATABASE: metrics + PGHOST: ${{ secrets.TRANSFORMERS_BENCHMARKS_PGHOST }} + PGUSER: transformers_benchmarks + PGPASSWORD: ${{ secrets.TRANSFORMERS_BENCHMARKS_PGPASSWORD }} - - name: Benchmark (merged to main event) - if: github.event_name == 'push' && github.ref_name == 'main' - working-directory: /transformers + - name: Run benchmark run: | - python3 -m pip install optimum-benchmark>=0.3.0 - HF_TOKEN=${{ secrets.TRANSFORMERS_BENCHMARK_TOKEN }} python3 benchmark/benchmark.py --repo_id hf-internal-testing/benchmark_results_merge_event --path_in_repo $(date +'%Y-%m-%d') --config-dir benchmark/config --config-name generation --commit=${{ github.sha }} backend.model=google/gemma-2b backend.cache_implementation=null,static backend.torch_compile=false,true --multirun + git config --global --add safe.directory /__w/transformers/transformers + if [ "$GITHUB_EVENT_NAME" = "pull_request" ]; then + commit_id=$(echo "${{ github.event.pull_request.head.sha }}") + elif [ "$GITHUB_EVENT_NAME" = "push" ]; then + commit_id=$GITHUB_SHA + fi + commit_msg=$(git show -s --format=%s | cut -c1-70) + python3 benchmark/llama.py "${{ github.head_ref || github.ref_name }}" "$commit_id" "$commit_msg" + env: + HF_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }} + PGHOST: ${{ secrets.TRANSFORMERS_BENCHMARKS_PGHOST }} + PGUSER: transformers_benchmarks + PGPASSWORD: ${{ secrets.TRANSFORMERS_BENCHMARKS_PGPASSWORD }} diff --git a/benchmark/grafana_dashboard.json b/benchmark/grafana_dashboard.json new file mode 100644 index 00000000000000..be471a6314ecc7 --- /dev/null +++ b/benchmark/grafana_dashboard.json @@ -0,0 +1,2211 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 1, + "links": [ + { + "asDropdown": false, + "icon": "external link", + "includeVars": false, + "keepTime": false, + "tags": [], + "targetBlank": false, + "title": "Go to data", + "tooltip": "Go to data", + "type": "link", + "url": "http://transformers-benchmarks.huggingface.co/d/fdz33iyzln9c0a/transformers-benchmarks?orgId=1&from=${StartTime}&to=${EndTime}" + } + ], + "liveNow": true, + "panels": [ + { + "datasource": { + "default": true, + "type": "grafana-postgresql-datasource", + "uid": "de0dbhs18ho1sc" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "left", + "cellOptions": { + "type": "auto" + }, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": 
"byName", + "options": "gpu_name" + }, + "properties": [ + { + "id": "custom.width", + "value": 364 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "left" + }, + "properties": [ + { + "id": "custom.width", + "value": 407 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "commit_message" + }, + "properties": [ + { + "id": "custom.width", + "value": 708 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "commit_id" + }, + "properties": [ + { + "id": "custom.width", + "value": 388 + } + ] + } + ] + }, + "gridPos": { + "h": 6, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 5, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": true, + "sortBy": [] + }, + "pluginVersion": "11.2.2", + "targets": [ + { + "datasource": { + "type": "grafana-postgresql-datasource" + }, + "editorMode": "code", + "format": "table", + "rawQuery": true, + "rawSql": "SELECT commit_id as commit_id, commit_message, gpu_name FROM benchmarks WHERE branch = ${branch};", + "refId": "A", + "sql": { + "columns": [ + { + "parameters": [ + { + "name": "commit_id", + "type": "functionParameter" + } + ], + "type": "function" + }, + { + "parameters": [ + { + "name": "gpu_name", + "type": "functionParameter" + } + ], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ], + "limit": 50, + "whereJsonTree": { + "children1": [ + { + "id": "baaa8aaa-89ab-4cde-b012-31922f96de3f", + "properties": { + "field": "commit_id", + "fieldSrc": "field", + "operator": "equal", + "value": [ + "${commit}" + ], + "valueError": [ + null + ], + "valueSrc": [ + "value" + ], + "valueType": [ + "text" + ] + }, + "type": "rule" + } + ], + "id": "bab88a98-0123-4456-b89a-b1922f7d4f11", + "type": "group" + }, + "whereString": "commit_id = '${commit}'" + }, + "table": "benchmarks" + } + ], + "transparent": true, + "type": "table" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 6 + }, + "id": 13, + "panels": [], + "title": "Eager Forward Pass", + "type": "row" + }, + { + "datasource": { + "default": true, + "type": "grafana-postgresql-datasource", + "uid": "de0dbhs18ho1sc" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-YlBl" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "fillOpacity": 80, + "gradientMode": "scheme", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineWidth": 0, + "scaleDistribution": { + "type": "linear" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 11, + "w": 12, + "x": 0, + "y": 7 + }, + "id": 7, + "options": { + "barRadius": 0.05, + "barWidth": 0.3, + "fullHighlight": false, + "groupWidth": 0.7, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "orientation": "auto", + "showValue": "auto", + "stacking": "none", + "tooltip": { + "mode": "single", + "sort": "none" + }, + "xTickLabelRotation": 0, + "xTickLabelSpacing": 0 + }, + "pluginVersion": "11.2.2", + "targets": [ + { + "datasource": { + "type": "grafana-postgresql-datasource", + "uid": "bdz2yss7sxo1sc" + }, + "editorMode": "code", + "format": "table", + 
"rawQuery": true, + "rawSql": "SELECT CAST(m.measurements->'first_eager_forward_pass_time_secs' AS double precision) AS first_eager_forward_pass_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = ${branch} AND gpu_name = '${gpu_name}'", + "refId": "A", + "sql": { + "columns": [ + { + "parameters": [], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ], + "limit": 50 + } + } + ], + "title": "First eager forward pass", + "transparent": true, + "type": "barchart" + }, + { + "datasource": { + "default": true, + "type": "grafana-postgresql-datasource", + "uid": "de0dbhs18ho1sc" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-YlBl" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "fillOpacity": 80, + "gradientMode": "scheme", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineWidth": 0, + "scaleDistribution": { + "type": "linear" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 11, + "w": 12, + "x": 12, + "y": 7 + }, + "id": 9, + "options": { + "barRadius": 0.05, + "barWidth": 0.8, + "fullHighlight": false, + "groupWidth": 0.7, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "orientation": "auto", + "showValue": "auto", + "stacking": "none", + "tooltip": { + "mode": "single", + "sort": "none" + }, + "xTickLabelRotation": 0, + "xTickLabelSpacing": 0 + }, + "targets": [ + { + "datasource": { + "type": "grafana-postgresql-datasource", + "uid": "bdz2yss7sxo1sc" + }, + "editorMode": "code", + "format": "table", + "rawQuery": true, + "rawSql": "SELECT CAST(m.measurements->'second_eager_forward_pass_time_secs' AS double precision) AS second_eager_forward_pass_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = ${branch} AND gpu_name = '${gpu_name}'", + "refId": "A", + "sql": { + "columns": [ + { + "parameters": [], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ], + "limit": 50 + } + } + ], + "title": "Second eager forward pass", + "transparent": true, + "type": "barchart" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 18 + }, + "id": 16, + "panels": [], + "title": "Time to next token", + "type": "row" + }, + { + "datasource": { + "default": true, + "type": "grafana-postgresql-datasource", + "uid": "de0dbhs18ho1sc" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-YlBl" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "fillOpacity": 80, + "gradientMode": "scheme", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineWidth": 0, + "scaleDistribution": { + "type": "linear" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "s" + }, 
+ "overrides": [] + }, + "gridPos": { + "h": 11, + "w": 12, + "x": 0, + "y": 19 + }, + "id": 17, + "options": { + "barRadius": 0.05, + "barWidth": 0.8, + "fullHighlight": false, + "groupWidth": 0.7, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "orientation": "auto", + "showValue": "always", + "stacking": "none", + "tooltip": { + "mode": "single", + "sort": "none" + }, + "xTickLabelRotation": 0, + "xTickLabelSpacing": 0 + }, + "targets": [ + { + "datasource": { + "type": "grafana-postgresql-datasource", + "uid": "bdz2yss7sxo1sc" + }, + "editorMode": "code", + "format": "table", + "rawQuery": true, + "rawSql": "SELECT CAST(m.measurements->'time_to_first_token_secs' AS double precision) AS time_to_first_token_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = ${branch} AND gpu_name = '${gpu_name}'", + "refId": "A", + "sql": { + "columns": [ + { + "parameters": [], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ], + "limit": 50 + } + } + ], + "title": "Time to first token", + "transparent": true, + "type": "barchart" + }, + { + "datasource": { + "default": true, + "type": "grafana-postgresql-datasource", + "uid": "de0dbhs18ho1sc" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-YlBl" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "fillOpacity": 80, + "gradientMode": "scheme", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineWidth": 0, + "scaleDistribution": { + "type": "linear" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 11, + "w": 12, + "x": 12, + "y": 19 + }, + "id": 18, + "options": { + "barRadius": 0.05, + "barWidth": 0.8, + "fullHighlight": false, + "groupWidth": 0.7, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "orientation": "auto", + "showValue": "always", + "stacking": "none", + "tooltip": { + "mode": "single", + "sort": "none" + }, + "xTickLabelRotation": 0, + "xTickLabelSpacing": 0 + }, + "targets": [ + { + "datasource": { + "type": "grafana-postgresql-datasource", + "uid": "bdz2yss7sxo1sc" + }, + "editorMode": "code", + "format": "table", + "rawQuery": true, + "rawSql": "SELECT CAST(m.measurements->'time_to_second_token_secs' AS double precision) AS time_to_second_token_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = ${branch} AND gpu_name = '${gpu_name}'", + "refId": "A", + "sql": { + "columns": [ + { + "parameters": [], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ], + "limit": 50 + } + } + ], + "title": "Time to second token", + "transparent": true, + "type": "barchart" + }, + { + "datasource": { + "default": true, + "type": "grafana-postgresql-datasource", + "uid": "de0dbhs18ho1sc" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-YlBl" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + 
"axisPlacement": "auto", + "fillOpacity": 80, + "gradientMode": "scheme", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineWidth": 0, + "scaleDistribution": { + "type": "linear" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 11, + "w": 12, + "x": 0, + "y": 30 + }, + "id": 19, + "options": { + "barRadius": 0.05, + "barWidth": 0.8, + "fullHighlight": false, + "groupWidth": 0.7, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "orientation": "auto", + "showValue": "always", + "stacking": "none", + "tooltip": { + "mode": "single", + "sort": "none" + }, + "xTickLabelRotation": 0, + "xTickLabelSpacing": 0 + }, + "targets": [ + { + "datasource": { + "type": "grafana-postgresql-datasource", + "uid": "bdz2yss7sxo1sc" + }, + "editorMode": "code", + "format": "table", + "rawQuery": true, + "rawSql": "SELECT CAST(m.measurements->'time_to_third_token_secs' AS double precision) AS time_to_third_token_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = ${branch} AND gpu_name = '${gpu_name}'", + "refId": "A", + "sql": { + "columns": [ + { + "parameters": [], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ], + "limit": 50 + } + } + ], + "title": "Time to third token", + "transparent": true, + "type": "barchart" + }, + { + "datasource": { + "default": true, + "type": "grafana-postgresql-datasource", + "uid": "de0dbhs18ho1sc" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-YlBl" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "fillOpacity": 80, + "gradientMode": "scheme", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineWidth": 0, + "scaleDistribution": { + "type": "linear" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 11, + "w": 12, + "x": 12, + "y": 30 + }, + "id": 20, + "options": { + "barRadius": 0.05, + "barWidth": 0.8, + "fullHighlight": false, + "groupWidth": 0.7, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "orientation": "auto", + "showValue": "always", + "stacking": "none", + "tooltip": { + "mode": "single", + "sort": "none" + }, + "xTickLabelRotation": 0, + "xTickLabelSpacing": 0 + }, + "targets": [ + { + "datasource": { + "type": "grafana-postgresql-datasource", + "uid": "bdz2yss7sxo1sc" + }, + "editorMode": "code", + "format": "table", + "rawQuery": true, + "rawSql": "SELECT CAST(m.measurements->'time_to_next_token_mean_secs' AS double precision) AS time_to_next_token_mean_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = ${branch} AND gpu_name = '${gpu_name}'", + "refId": "A", + "sql": { + "columns": [ + { + "parameters": [], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ], + "limit": 50 + } + } + 
], + "title": "Time to subsequent next tokens mean", + "transparent": true, + "type": "barchart" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 41 + }, + "id": 14, + "panels": [], + "title": "Compiled Generate", + "type": "row" + }, + { + "datasource": { + "default": true, + "type": "grafana-postgresql-datasource", + "uid": "de0dbhs18ho1sc" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-YlBl" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "fillOpacity": 80, + "gradientMode": "scheme", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineWidth": 0, + "scaleDistribution": { + "type": "linear" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 11, + "w": 12, + "x": 0, + "y": 42 + }, + "id": 8, + "options": { + "barRadius": 0.05, + "barWidth": 0.8, + "fullHighlight": false, + "groupWidth": 0.7, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "orientation": "auto", + "showValue": "always", + "stacking": "none", + "tooltip": { + "mode": "single", + "sort": "none" + }, + "xTickLabelRotation": 0, + "xTickLabelSpacing": 0 + }, + "targets": [ + { + "datasource": { + "type": "grafana-postgresql-datasource", + "uid": "bdz2yss7sxo1sc" + }, + "editorMode": "code", + "format": "table", + "rawQuery": true, + "rawSql": "SELECT CAST(m.measurements->'first_compile_generate_time_secs' AS double precision) AS first_compile_generate_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = ${branch} AND gpu_name = '${gpu_name}'", + "refId": "A", + "sql": { + "columns": [ + { + "parameters": [], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ], + "limit": 50 + } + } + ], + "title": "First compile generate", + "transparent": true, + "type": "barchart" + }, + { + "datasource": { + "default": true, + "type": "grafana-postgresql-datasource", + "uid": "de0dbhs18ho1sc" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-YlBl" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "fillOpacity": 80, + "gradientMode": "scheme", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineWidth": 0, + "scaleDistribution": { + "type": "linear" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 11, + "w": 12, + "x": 12, + "y": 42 + }, + "id": 10, + "options": { + "barRadius": 0.05, + "barWidth": 0.8, + "fullHighlight": false, + "groupWidth": 0.7, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "orientation": "auto", + "showValue": "auto", + "stacking": "none", + "tooltip": { + "mode": "single", + "sort": "none" + }, + "xTickLabelRotation": 0, + "xTickLabelSpacing": 0 + }, + "targets": [ + { + "datasource": { + "type": "grafana-postgresql-datasource", + 
"uid": "bdz2yss7sxo1sc" + }, + "editorMode": "code", + "format": "table", + "rawQuery": true, + "rawSql": "SELECT CAST(m.measurements->'second_compile_generate_time_secs' AS double precision) AS second_compile_generate_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = ${branch} AND gpu_name = '${gpu_name}';", + "refId": "A", + "sql": { + "columns": [ + { + "parameters": [], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ], + "limit": 50 + } + } + ], + "title": "Second compile generate", + "transparent": true, + "type": "barchart" + }, + { + "datasource": { + "default": true, + "type": "grafana-postgresql-datasource", + "uid": "de0dbhs18ho1sc" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-YlBl" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "fillOpacity": 80, + "gradientMode": "scheme", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineWidth": 0, + "scaleDistribution": { + "type": "linear" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 11, + "w": 12, + "x": 0, + "y": 53 + }, + "id": 11, + "options": { + "barRadius": 0.05, + "barWidth": 0.8, + "fullHighlight": false, + "groupWidth": 0.7, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "orientation": "auto", + "showValue": "auto", + "stacking": "none", + "tooltip": { + "mode": "single", + "sort": "none" + }, + "xTickLabelRotation": 0, + "xTickLabelSpacing": 0 + }, + "targets": [ + { + "datasource": { + "type": "grafana-postgresql-datasource", + "uid": "bdz2yss7sxo1sc" + }, + "editorMode": "code", + "format": "table", + "rawQuery": true, + "rawSql": "SELECT CAST(m.measurements->'third_compile_generate_time_secs' AS double precision) AS third_compile_generate_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = ${branch} AND gpu_name = '${gpu_name}';", + "refId": "A", + "sql": { + "columns": [ + { + "parameters": [], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ], + "limit": 50 + } + } + ], + "title": "Third compile generate", + "transparent": true, + "type": "barchart" + }, + { + "datasource": { + "default": true, + "type": "grafana-postgresql-datasource", + "uid": "de0dbhs18ho1sc" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-YlBl" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "fillOpacity": 80, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineWidth": 0, + "scaleDistribution": { + "type": "linear" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 11, + "w": 12, + "x": 12, + "y": 53 + }, + "id": 12, + "options": { + "barRadius": 0.05, + 
"barWidth": 0.8, + "fullHighlight": false, + "groupWidth": 0.7, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "orientation": "auto", + "showValue": "auto", + "stacking": "none", + "tooltip": { + "mode": "single", + "sort": "none" + }, + "xTickLabelRotation": 0, + "xTickLabelSpacing": 0 + }, + "targets": [ + { + "datasource": { + "type": "grafana-postgresql-datasource", + "uid": "bdz2yss7sxo1sc" + }, + "editorMode": "code", + "format": "table", + "rawQuery": true, + "rawSql": "SELECT CAST(m.measurements->'fourth_compile_generate_time_secs' AS double precision) AS fourth_compile_generate_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = ${branch} AND gpu_name = '${gpu_name}';", + "refId": "A", + "sql": { + "columns": [ + { + "parameters": [], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ], + "limit": 50 + } + } + ], + "title": "Fourth compile generate", + "transparent": true, + "type": "barchart" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 64 + }, + "id": 15, + "panels": [], + "title": "Usage metrics", + "type": "row" + }, + { + "datasource": { + "default": true, + "type": "grafana-postgresql-datasource", + "uid": "de0dbhs18ho1sc" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": 60000, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 65 + }, + "id": 1, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "grafana-postgresql-datasource", + "uid": "bdz2yss7sxo1sc" + }, + "editorMode": "code", + "format": "table", + "rawQuery": true, + "rawSql": "SELECT\n d.cpu_util,\n d.time\nFROM\n benchmarks AS b\n JOIN device_measurements AS d ON b.benchmark_id = d.benchmark_id\nWHERE\n branch = ${branch}", + "refId": "A", + "sql": { + "columns": [ + { + "parameters": [ + { + "name": "cpu_util", + "type": "functionParameter" + } + ], + "type": "function" + }, + { + "parameters": [ + { + "name": "mem_megabytes", + "type": "functionParameter" + } + ], + "type": "function" + }, + { + "parameters": [ + { + "name": "gpu_util", + "type": "functionParameter" + } + ], + "type": "function" + }, + { + "parameters": [ + { + "name": "gpu_mem_megabytes", + "type": "functionParameter" + } + ], + "type": "function" + }, + { + "parameters": [ + { + "name": "\"time\"", + "type": "functionParameter" + } + ], + "type": "function" + } + ], + 
"groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ], + "limit": 50, + "whereJsonTree": { + "children1": [ + { + "id": "baa888b8-89ab-4cde-b012-31922f8671e9", + "properties": { + "field": "commit_id", + "fieldSrc": "field", + "operator": "equal", + "value": [ + "${commit}" + ], + "valueError": [ + null + ], + "valueSrc": [ + "value" + ], + "valueType": [ + "text" + ] + }, + "type": "rule" + } + ], + "id": "bab88a98-0123-4456-b89a-b1922f7d4f11", + "type": "group" + }, + "whereString": "commit_id = '${commit}'" + }, + "table": "measurements" + } + ], + "title": "CPU Utilization", + "transparent": true, + "type": "timeseries" + }, + { + "datasource": { + "default": true, + "type": "grafana-postgresql-datasource", + "uid": "de0dbhs18ho1sc" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": 60000, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 65 + }, + "id": 4, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "grafana-postgresql-datasource", + "uid": "bdz2yss7sxo1sc" + }, + "editorMode": "code", + "format": "table", + "rawQuery": true, + "rawSql": "SELECT\n b.commit_id,\n d.gpu_util,\n d.time\nFROM\n benchmarks AS b\n JOIN device_measurements AS d ON b.benchmark_id = d.benchmark_id\nWHERE\n branch = ${branch}", + "refId": "A", + "sql": { + "columns": [ + { + "parameters": [ + { + "name": "cpu_util", + "type": "functionParameter" + } + ], + "type": "function" + }, + { + "parameters": [ + { + "name": "mem_megabytes", + "type": "functionParameter" + } + ], + "type": "function" + }, + { + "parameters": [ + { + "name": "gpu_util", + "type": "functionParameter" + } + ], + "type": "function" + }, + { + "parameters": [ + { + "name": "gpu_mem_megabytes", + "type": "functionParameter" + } + ], + "type": "function" + }, + { + "parameters": [ + { + "name": "\"time\"", + "type": "functionParameter" + } + ], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ], + "limit": 50, + "whereJsonTree": { + "children1": [ + { + "id": "baa888b8-89ab-4cde-b012-31922f8671e9", + "properties": { + "field": "commit_id", + "fieldSrc": "field", + "operator": "equal", + "value": [ + "${commit}" + ], + "valueError": [ + null + ], + "valueSrc": [ + "value" + ], + "valueType": [ + "text" + ] + }, + "type": "rule" + } + ], + "id": "bab88a98-0123-4456-b89a-b1922f7d4f11", + "type": "group" + }, + "whereString": "commit_id = '${commit}'" + }, + "table": "measurements" + } + ], + "title": "GPU Utilization", + 
"transparent": true, + "type": "timeseries" + }, + { + "datasource": { + "default": true, + "type": "grafana-postgresql-datasource", + "uid": "de0dbhs18ho1sc" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": 60000, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "decmbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 74 + }, + "id": 2, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "grafana-postgresql-datasource", + "uid": "bdz2yss7sxo1sc" + }, + "editorMode": "code", + "format": "table", + "rawQuery": true, + "rawSql": "SELECT d.mem_megabytes, d.time FROM benchmarks AS b JOIN device_measurements AS d ON b.benchmark_id = d.benchmark_id WHERE branch = ${branch}", + "refId": "A", + "sql": { + "columns": [ + { + "parameters": [ + { + "name": "cpu_util", + "type": "functionParameter" + } + ], + "type": "function" + }, + { + "parameters": [ + { + "name": "mem_megabytes", + "type": "functionParameter" + } + ], + "type": "function" + }, + { + "parameters": [ + { + "name": "gpu_util", + "type": "functionParameter" + } + ], + "type": "function" + }, + { + "parameters": [ + { + "name": "gpu_mem_megabytes", + "type": "functionParameter" + } + ], + "type": "function" + }, + { + "parameters": [ + { + "name": "\"time\"", + "type": "functionParameter" + } + ], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ], + "limit": 50, + "whereJsonTree": { + "children1": [ + { + "id": "baa888b8-89ab-4cde-b012-31922f8671e9", + "properties": { + "field": "commit_id", + "fieldSrc": "field", + "operator": "equal", + "value": [ + "${commit}" + ], + "valueError": [ + null + ], + "valueSrc": [ + "value" + ], + "valueType": [ + "text" + ] + }, + "type": "rule" + } + ], + "id": "bab88a98-0123-4456-b89a-b1922f7d4f11", + "type": "group" + }, + "whereString": "commit_id = '${commit}'" + }, + "table": "measurements" + } + ], + "title": "Memory usage", + "transparent": true, + "type": "timeseries" + }, + { + "datasource": { + "default": true, + "type": "grafana-postgresql-datasource", + "uid": "de0dbhs18ho1sc" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": 60000, + "lineInterpolation": "linear", + 
"lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "decmbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 74 + }, + "id": 3, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "grafana-postgresql-datasource", + "uid": "bdz2yss7sxo1sc" + }, + "editorMode": "code", + "format": "table", + "rawQuery": true, + "rawSql": "SELECT\n d.gpu_mem_megabytes,\n d.time\nFROM\n benchmarks AS b\n JOIN device_measurements AS d ON b.benchmark_id = d.benchmark_id\nWHERE\n branch = ${branch}", + "refId": "A", + "sql": { + "columns": [ + { + "parameters": [ + { + "name": "cpu_util", + "type": "functionParameter" + } + ], + "type": "function" + }, + { + "parameters": [ + { + "name": "mem_megabytes", + "type": "functionParameter" + } + ], + "type": "function" + }, + { + "parameters": [ + { + "name": "gpu_util", + "type": "functionParameter" + } + ], + "type": "function" + }, + { + "parameters": [ + { + "name": "gpu_mem_megabytes", + "type": "functionParameter" + } + ], + "type": "function" + }, + { + "parameters": [ + { + "name": "\"time\"", + "type": "functionParameter" + } + ], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ], + "limit": 50, + "whereJsonTree": { + "children1": [ + { + "id": "baa888b8-89ab-4cde-b012-31922f8671e9", + "properties": { + "field": "commit_id", + "fieldSrc": "field", + "operator": "equal", + "value": [ + "${commit}" + ], + "valueError": [ + null + ], + "valueSrc": [ + "value" + ], + "valueType": [ + "text" + ] + }, + "type": "rule" + } + ], + "id": "bab88a98-0123-4456-b89a-b1922f7d4f11", + "type": "group" + }, + "whereString": "commit_id = '${commit}'" + }, + "table": "measurements" + } + ], + "title": "GPU memory usage", + "transparent": true, + "type": "timeseries" + } + ], + "schemaVersion": 39, + "tags": [], + "templating": { + "list": [ + { + "current": { + "selected": true, + "text": [ + "refactor/benchmarks" + ], + "value": [ + "refactor/benchmarks" + ] + }, + "datasource": { + "type": "grafana-postgresql-datasource", + "uid": "de0dbhs18ho1sc" + }, + "definition": "SELECT DISTINCT branch FROM benchmarks;", + "description": "", + "hide": 0, + "includeAll": false, + "label": "branch", + "multi": false, + "name": "branch", + "options": [], + "query": "SELECT DISTINCT branch FROM benchmarks;", + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + }, + { + "current": { + "selected": false, + "text": "1728570853117", + "value": "1728570853117" + }, + "datasource": { + "type": "grafana-postgresql-datasource", + "uid": "de0dbhs18ho1sc" + }, + "definition": "SELECT created_at - INTERVAL '5 secs' FROM benchmarks WHERE branch = ${branch} ORDER BY benchmark_id ASC LIMIT 1;", + "description": "", + "hide": 2, + "includeAll": false, + "multi": false, + "name": "StartTime", + "options": [], + "query": "SELECT created_at - INTERVAL '5 secs' FROM benchmarks WHERE branch = ${branch} ORDER BY benchmark_id ASC LIMIT 1;", + "refresh": 2, 
+ "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + }, + { + "current": { + "selected": false, + "text": "1728657828802", + "value": "1728657828802" + }, + "datasource": { + "type": "grafana-postgresql-datasource", + "uid": "de0dbhs18ho1sc" + }, + "definition": "SELECT time + INTERVAL '5 secs' FROM benchmarks AS b JOIN device_measurements AS d ON b.benchmark_id = d.benchmark_id WHERE branch = ${branch} ORDER BY b.benchmark_id DESC, d.measurement_id DESC LIMIT 1;", + "description": "", + "hide": 2, + "includeAll": false, + "multi": false, + "name": "EndTime", + "options": [], + "query": "SELECT time + INTERVAL '5 secs' FROM benchmarks AS b JOIN device_measurements AS d ON b.benchmark_id = d.benchmark_id WHERE branch = ${branch} ORDER BY b.benchmark_id DESC, d.measurement_id DESC LIMIT 1;", + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + }, + { + "current": { + "selected": false, + "text": "NVIDIA A10G", + "value": "NVIDIA A10G" + }, + "definition": "SELECT DISTINCT gpu_name FROM benchmarks;", + "hide": 0, + "includeAll": false, + "label": "GPU", + "multi": false, + "name": "gpu_name", + "options": [], + "query": "SELECT DISTINCT gpu_name FROM benchmarks;", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + } + ] + }, + "time": { + "from": "2024-10-11T13:10:01.641Z", + "to": "2024-10-11T13:25:21.783Z" + }, + "timepicker": { + "hidden": false + }, + "timezone": "browser", + "title": "Transformers benchmarks", + "uid": "fdz33iyzln9c0a", + "version": 9, + "weekStart": "" +} diff --git a/benchmark/init_db.sql b/benchmark/init_db.sql new file mode 100644 index 00000000000000..4381b99cea660e --- /dev/null +++ b/benchmark/init_db.sql @@ -0,0 +1,26 @@ +CREATE TABLE IF NOT EXISTS benchmarks ( + benchmark_id SERIAL PRIMARY KEY, + branch VARCHAR(255), + commit_id VARCHAR(72), + commit_message VARCHAR(70), + gpu_name VARCHAR(255), + created_at timestamp without time zone NOT NULL DEFAULT (current_timestamp AT TIME ZONE 'UTC') +); + +CREATE TABLE IF NOT EXISTS device_measurements ( + measurement_id SERIAL PRIMARY KEY, + benchmark_id int REFERENCES benchmarks (benchmark_id), + cpu_util double precision, + mem_megabytes double precision, + gpu_util double precision, + gpu_mem_megabytes double precision, + time timestamp without time zone NOT NULL DEFAULT (current_timestamp AT TIME ZONE 'UTC') +); + +CREATE TABLE IF NOT EXISTS model_measurements ( + measurement_id SERIAL PRIMARY KEY, + benchmark_id int REFERENCES benchmarks (benchmark_id), + measurements jsonb, + time timestamp without time zone NOT NULL DEFAULT (current_timestamp AT TIME ZONE 'UTC') +); + diff --git a/benchmark/llama.py b/benchmark/llama.py new file mode 100644 index 00000000000000..a926f903486607 --- /dev/null +++ b/benchmark/llama.py @@ -0,0 +1,404 @@ +import argparse +import json +import logging +import os +import sys +from statistics import mean +from threading import Event, Thread +from time import perf_counter, sleep +from typing import Optional +import gpustat +import psutil +import psycopg2 +import torch + +from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig, StaticCache +from psycopg2.extras import Json +from psycopg2.extensions import register_adapter + + +os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1" + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + +handler = logging.StreamHandler(sys.stdout) +handler.setLevel(logging.INFO) +formatter = logging.Formatter("[%(levelname)s - 
%(asctime)s] %(message)s") +handler.setFormatter(formatter) +logger.addHandler(handler) + +os.environ["TOKENIZERS_PARALLELISM"] = "1" +torch.set_float32_matmul_precision("high") +register_adapter(dict, Json) + + +def parse_arguments(): + """ + Parse command line arguments for the benchmarking CLI. + """ + parser = argparse.ArgumentParser(description="CLI for benchmarking the huggingface/transformers.") + + parser.add_argument( + "branch", + type=str, + help="The branch name on which the benchmarking is performed.", + ) + + parser.add_argument( + "commit_id", + type=str, + help="The commit hash on which the benchmarking is performed.", + ) + + parser.add_argument( + "commit_msg", + type=str, + help="The commit message associated with the commit, truncated to 70 characters.", + ) + + args = parser.parse_args() + + return args.branch, args.commit_id, args.commit_msg + + +def collect_metrics(benchmark_id, continue_metric_collection): + p = psutil.Process(os.getpid()) + conn = psycopg2.connect("dbname=metrics") + cur = conn.cursor() + while not continue_metric_collection.is_set(): + with p.oneshot(): + cpu_util = p.cpu_percent() + mem_megabytes = p.memory_info().rss / (1024 * 1024) + gpu_stats = gpustat.GPUStatCollection.new_query() + gpu_util = gpu_stats[0]["utilization.gpu"] + gpu_mem_megabytes = gpu_stats[0]["memory.used"] + cur.execute( + "INSERT INTO device_measurements (benchmark_id, cpu_util, mem_megabytes, gpu_util, gpu_mem_megabytes) VALUES (%s, %s, %s, %s, %s)", + (benchmark_id, cpu_util, mem_megabytes, gpu_util, gpu_mem_megabytes), + ) + sleep(0.01) + conn.commit() + conn.close() + + +def run_benchmark(branch: str, commit_id: str, commit_msg: str, num_tokens_to_generate=100): + continue_metric_collection = Event() + metrics_thread = None + try: + gpu_stats = gpustat.GPUStatCollection.new_query() + gpu_name = gpu_stats[0]["name"] + conn = psycopg2.connect("dbname=metrics") + cur = conn.cursor() + cur.execute( + "INSERT INTO benchmarks (branch, commit_id, commit_message, gpu_name) VALUES (%s, %s, %s, %s) RETURNING benchmark_id", + (branch, commit_id, commit_msg, gpu_name), + ) + conn.commit() + benchmark_id = cur.fetchone()[0] + metrics_thread = Thread(target=collect_metrics, args=[benchmark_id, continue_metric_collection]) + metrics_thread.start() + + os.environ["TOKENIZERS_PARALLELISM"] = "false" # silence warnings when compiling + + device = "cuda" + ckpt = "meta-llama/Llama-2-7b-hf" + + # This is to avoid counting download in model load time measurement + model = AutoModelForCausalLM.from_pretrained(ckpt, torch_dtype=torch.float16) + gen_config = GenerationConfig(do_sample=False, top_p=1, temperature=1) + start = perf_counter() + model = AutoModelForCausalLM.from_pretrained( + ckpt, torch_dtype=torch.float16, generation_config=gen_config + ).eval() + model.to(device) + torch.cuda.synchronize() + end = perf_counter() + model_load_time = end - start + logger.info(f"loaded model in: {model_load_time}s") + + tokenizer = AutoTokenizer.from_pretrained(ckpt) + + prompt = "Why dogs are so cute?" + inputs = tokenizer(prompt, return_tensors="pt").to(device) + + # Specify the max length (including both the prompt and the response) + # When calling `generate` with `cache_implementation="static" later, this is also used to create a `StaticCache` object + # with sequence length = `max_length`. 
The longer the more you will re-use it + seq_length = inputs["input_ids"].shape[1] + model.generation_config.max_length = seq_length + num_tokens_to_generate + batch_size = inputs["input_ids"].shape[0] + + # Copied from the gpt-fast repo + def multinomial_sample_one_no_sync(probs_sort): # Does multinomial sampling without a cuda synchronization + q = torch.empty_like(probs_sort).exponential_(1) + return torch.argmax(probs_sort / q, dim=-1, keepdim=True).to(dtype=torch.int) + + def logits_to_probs(logits, temperature: float = 1.0, top_k: Optional[int] = None): + logits = logits / max(temperature, 1e-5) + + if top_k is not None: + v, _ = torch.topk(logits, min(top_k, logits.size(-1))) + pivot = v.select(-1, -1).unsqueeze(-1) + logits = torch.where(logits < pivot, -float("Inf"), logits) + probs = torch.nn.functional.softmax(logits, dim=-1) + return probs + + def sample(logits, temperature: float = 1.0, top_k: Optional[int] = None): + probs = logits_to_probs(logits[:, -1], temperature, top_k) + idx_next = multinomial_sample_one_no_sync(probs) + return idx_next, probs + + def decode_one_token(model, cur_token, cache_position, past_key_values): + logits = model( + cur_token, + cache_position=cache_position, + past_key_values=past_key_values, + return_dict=False, + use_cache=True, + )[0] + new_token = sample(logits, temperature=0.6, top_k=5)[0] + return new_token + + ######### + # Eager # + ######### + with torch.no_grad(): + past_key_values = StaticCache( + model.config, + batch_size=batch_size, + device=device, + dtype=torch.float16, + max_cache_len=seq_length + num_tokens_to_generate, + ) + cache_position = torch.arange(seq_length, device=device) + start = perf_counter() + model( + **inputs, + cache_position=cache_position, + past_key_values=past_key_values, + return_dict=False, + use_cache=True, + ) + end = perf_counter() + first_eager_fwd_pass_time = end - start + logger.info(f"completed first eager fwd pass in: {first_eager_fwd_pass_time}s") + start = perf_counter() + output = model.generate(**inputs, do_sample=False) + end = perf_counter() + first_eager_generate_time = end - start + logger.info(f"completed first eager generation in: {first_eager_generate_time}s") + logger.info(f"generated: {tokenizer.batch_decode(output.cpu().tolist())}") + + past_key_values = StaticCache( + model.config, + batch_size=batch_size, + device=device, + dtype=torch.float16, + max_cache_len=seq_length + num_tokens_to_generate, + ) + cache_position = torch.arange(seq_length, device=device) + start = perf_counter() + model( + **inputs, + cache_position=cache_position, + past_key_values=past_key_values, + return_dict=False, + use_cache=True, + ) + end = perf_counter() + second_eager_fwd_pass_time = end - start + logger.info(f"completed second eager fwd pass in: {second_eager_fwd_pass_time}s") + start = perf_counter() + model.generate(**inputs, do_sample=False) + end = perf_counter() + second_eager_generate_time = end - start + logger.info(f"completed second eager generation in: {second_eager_generate_time}s") + logger.info(f"generated: {tokenizer.batch_decode(output.cpu().tolist())}") + + torch.compiler.reset() + + ################ + # Forward pass # + ################ + + # `torch.compile(model, ...)` is not recommended as you compile callbacks + # and full generate. We recommend compiling only the forward for now. + # "reduce-overhead" will use cudagraphs. 
+ generated_ids = torch.zeros( + (batch_size, num_tokens_to_generate + seq_length), dtype=torch.int, device=device + ) + + generated_ids[:, :seq_length] = inputs["input_ids"] + decode_one_token = torch.compile(decode_one_token, mode="reduce-overhead", fullgraph=True) + # model.forward = torch.compile(model.forward, mode="reduce-overhead", fullgraph=True) + # TODO use decode_one_token(model, input_id.clone(), cache_position) for verification + past_key_values = StaticCache( + model.config, + batch_size=batch_size, + device=device, + dtype=torch.float16, + max_cache_len=seq_length + num_tokens_to_generate + 10, + ) + cache_position = torch.arange(seq_length, device=device) + all_generated_tokens = [] + ### First compile, prefill + start = perf_counter() + next_token = decode_one_token( + model, inputs["input_ids"], cache_position=cache_position, past_key_values=past_key_values + ) + torch.cuda.synchronize() + end = perf_counter() + time_to_first_token = end - start + logger.info(f"completed first compile generation in: {time_to_first_token}s") + cache_position += 1 + all_generated_tokens += next_token.clone().detach().cpu().tolist() + + cache_position = torch.tensor([seq_length], device=device) + ### First compile, decoding + start = perf_counter() + next_token = decode_one_token( + model, next_token.clone(), cache_position=cache_position, past_key_values=past_key_values + ) + torch.cuda.synchronize() + end = perf_counter() + time_to_second_token = end - start + logger.info(f"completed second compile generation in: {time_to_second_token}s") + cache_position += 1 + all_generated_tokens += next_token.clone().detach().cpu().tolist() + + ### Second compile, decoding + start = perf_counter() + next_token = decode_one_token( + model, next_token.clone(), cache_position=cache_position, past_key_values=past_key_values + ) + torch.cuda.synchronize() + end = perf_counter() + time_to_third_token = end - start + logger.info(f"completed third compile forward in: {time_to_third_token}s") + cache_position += 1 + all_generated_tokens += next_token.clone().detach().cpu().tolist() + + ### Using cuda graphs decoding + + start = perf_counter() + for _ in range(1, num_tokens_to_generate): + all_generated_tokens += next_token.clone().detach().cpu().tolist() + next_token = decode_one_token( + model, next_token.clone(), cache_position=cache_position, past_key_values=past_key_values + ) + cache_position += 1 + torch.cuda.synchronize() + end = perf_counter() + mean_time_to_next_token = (end - start) / num_tokens_to_generate + logger.info(f"completed next compile generation in: {mean_time_to_next_token}s") + logger.info(f"generated: {tokenizer.batch_decode(all_generated_tokens)}") + + #################### + # Generate compile # + #################### + torch.compiler.reset() + # we will not compile full generate as it's too intensive, though we do measure the full forward!
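+ # A fresh StaticCache is allocated before each of the four timed `generate` calls below, so no cache state carries over between measurements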
+ + past_key_values = StaticCache( + model.config, + batch_size=batch_size, + device=device, + dtype=torch.float16, + max_cache_len=seq_length + 128, + ) + + # 1st call + start = perf_counter() + output = model.generate(**inputs, past_key_values=past_key_values) + torch.cuda.synchronize() + end = perf_counter() + first_compile_generate_time = end - start + logger.info(f"completed first compile generation in: {first_compile_generate_time}s") + logger.info(f"generated: {tokenizer.batch_decode(output.cpu().tolist())}") + + past_key_values = StaticCache( + model.config, + batch_size=batch_size, + device=device, + dtype=torch.float16, + max_cache_len=seq_length + 128, + ) + # 2nd call + start = perf_counter() + output = model.generate(**inputs, past_key_values=past_key_values) + torch.cuda.synchronize() + end = perf_counter() + second_compile_generate_time = end - start + logger.info(f"completed second compile generation in: {second_compile_generate_time}s") + logger.info(f"generated: {tokenizer.batch_decode(output.cpu().tolist())}") + + past_key_values = StaticCache( + model.config, + batch_size=batch_size, + device=device, + dtype=torch.float16, + max_cache_len=seq_length + 128, + ) + + # 3rd call + start = perf_counter() + output = model.generate(**inputs, past_key_values=past_key_values) + end = perf_counter() + third_compile_generate_time = end - start + logger.info(f"completed third compile generation in: {third_compile_generate_time}s") + logger.info(f"generated: {tokenizer.batch_decode(output.cpu().tolist())}") + + past_key_values = StaticCache( + model.config, + batch_size=batch_size, + device=device, + dtype=torch.float16, + max_cache_len=seq_length + 128, + ) + # 4th call + start = perf_counter() + output = model.generate(**inputs, past_key_values=past_key_values) + end = perf_counter() + fourth_compile_generate_time = end - start + logger.info(f"completed fourth compile generation in: {fourth_compile_generate_time}s") + logger.info(f"generated: {tokenizer.batch_decode(output.cpu().tolist())}") + + cur.execute( + """ + INSERT INTO model_measurements ( + benchmark_id, + measurements + ) VALUES (%s, %s) + """, + ( + benchmark_id, + { + "model_load_time": model_load_time, + "first_eager_forward_pass_time_secs": first_eager_fwd_pass_time, + "second_eager_forward_pass_time_secs": second_eager_fwd_pass_time, + "first_eager_generate_time_secs": first_eager_generate_time, + "second_eager_generate_time_secs": second_eager_generate_time, + "time_to_first_token_secs": time_to_first_token, + "time_to_second_token_secs": time_to_second_token, + "time_to_third_token_secs": time_to_third_token, + "time_to_next_token_mean_secs": mean_time_to_next_token, + "first_compile_generate_time_secs": first_compile_generate_time, + "second_compile_generate_time_secs": second_compile_generate_time, + "third_compile_generate_time_secs": third_compile_generate_time, + "fourth_compile_generate_time_secs": fourth_compile_generate_time, + }, + ), + ) + conn.commit() + conn.close() + except Exception as e: + logger.error(f"Caught exception: {e}") + continue_metric_collection.set() + if metrics_thread is not None: + metrics_thread.join() + + +if __name__ == "__main__": + branch, commit_id, commit_msg = parse_arguments() + run_benchmark(branch, commit_id, commit_msg, num_tokens_to_generate=20) diff --git a/benchmark/requirements.txt b/benchmark/requirements.txt new file mode 100644 index 00000000000000..50e9dfaddfa4ca --- /dev/null +++ b/benchmark/requirements.txt @@ -0,0 +1,5 @@ +gpustat==1.1.1 +psutil==6.0.0
+psycopg2==2.9.9 +torch>=2.4.0 +hf_transfer \ No newline at end of file From 617b21273a349bd3a94e2b3bfb83f8089f45749b Mon Sep 17 00:00:00 2001 From: Luc Georges Date: Fri, 11 Oct 2024 19:52:06 +0200 Subject: [PATCH 007/385] fix(ci): benchmarks dashboard was failing due to missing quotations (#34100) --- benchmark/grafana_dashboard.json | 68 ++++++++++++++++---------------- 1 file changed, 34 insertions(+), 34 deletions(-) diff --git a/benchmark/grafana_dashboard.json b/benchmark/grafana_dashboard.json index be471a6314ecc7..2375663ffbc6db 100644 --- a/benchmark/grafana_dashboard.json +++ b/benchmark/grafana_dashboard.json @@ -148,7 +148,7 @@ "editorMode": "code", "format": "table", "rawQuery": true, - "rawSql": "SELECT commit_id as commit_id, commit_message, gpu_name FROM benchmarks WHERE branch = ${branch};", + "rawSql": "SELECT commit_id as commit_id, commit_message, gpu_name FROM benchmarks WHERE branch = '${branch}';", "refId": "A", "sql": { "columns": [ @@ -283,7 +283,7 @@ "id": 7, "options": { "barRadius": 0.05, - "barWidth": 0.3, + "barWidth": 0.8, "fullHighlight": false, "groupWidth": 0.7, "legend": { @@ -312,7 +312,7 @@ "editorMode": "code", "format": "table", "rawQuery": true, - "rawSql": "SELECT CAST(m.measurements->'first_eager_forward_pass_time_secs' AS double precision) AS first_eager_forward_pass_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = ${branch} AND gpu_name = '${gpu_name}'", + "rawSql": "SELECT CAST(m.measurements->'first_eager_forward_pass_time_secs' AS double precision) AS first_eager_forward_pass_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}'", "refId": "A", "sql": { "columns": [ @@ -424,7 +424,7 @@ "editorMode": "code", "format": "table", "rawQuery": true, - "rawSql": "SELECT CAST(m.measurements->'second_eager_forward_pass_time_secs' AS double precision) AS second_eager_forward_pass_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = ${branch} AND gpu_name = '${gpu_name}'", + "rawSql": "SELECT CAST(m.measurements->'second_eager_forward_pass_time_secs' AS double precision) AS second_eager_forward_pass_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}'", "refId": "A", "sql": { "columns": [ @@ -545,7 +545,7 @@ "editorMode": "code", "format": "table", "rawQuery": true, - "rawSql": "SELECT CAST(m.measurements->'time_to_first_token_secs' AS double precision) AS time_to_first_token_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = ${branch} AND gpu_name = '${gpu_name}'", + "rawSql": "SELECT CAST(m.measurements->'time_to_first_token_secs' AS double precision) AS time_to_first_token_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}'", "refId": "A", "sql": { "columns": [ @@ -653,7 +653,7 @@ "editorMode": "code", "format": "table", "rawQuery": true, - "rawSql": "SELECT CAST(m.measurements->'time_to_second_token_secs' AS double precision) AS time_to_second_token_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN 
model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = ${branch} AND gpu_name = '${gpu_name}'", + "rawSql": "SELECT CAST(m.measurements->'time_to_second_token_secs' AS double precision) AS time_to_second_token_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}'", "refId": "A", "sql": { "columns": [ @@ -761,7 +761,7 @@ "editorMode": "code", "format": "table", "rawQuery": true, - "rawSql": "SELECT CAST(m.measurements->'time_to_third_token_secs' AS double precision) AS time_to_third_token_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = ${branch} AND gpu_name = '${gpu_name}'", + "rawSql": "SELECT CAST(m.measurements->'time_to_third_token_secs' AS double precision) AS time_to_third_token_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}'", "refId": "A", "sql": { "columns": [ @@ -869,7 +869,7 @@ "editorMode": "code", "format": "table", "rawQuery": true, - "rawSql": "SELECT CAST(m.measurements->'time_to_next_token_mean_secs' AS double precision) AS time_to_next_token_mean_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = ${branch} AND gpu_name = '${gpu_name}'", + "rawSql": "SELECT CAST(m.measurements->'time_to_next_token_mean_secs' AS double precision) AS time_to_next_token_mean_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}'", "refId": "A", "sql": { "columns": [ @@ -990,7 +990,7 @@ "editorMode": "code", "format": "table", "rawQuery": true, - "rawSql": "SELECT CAST(m.measurements->'first_compile_generate_time_secs' AS double precision) AS first_compile_generate_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = ${branch} AND gpu_name = '${gpu_name}'", + "rawSql": "SELECT CAST(m.measurements->'first_compile_generate_time_secs' AS double precision) AS first_compile_generate_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}'", "refId": "A", "sql": { "columns": [ @@ -1098,7 +1098,7 @@ "editorMode": "code", "format": "table", "rawQuery": true, - "rawSql": "SELECT CAST(m.measurements->'second_compile_generate_time_secs' AS double precision) AS second_compile_generate_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = ${branch} AND gpu_name = '${gpu_name}';", + "rawSql": "SELECT CAST(m.measurements->'second_compile_generate_time_secs' AS double precision) AS second_compile_generate_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}';", "refId": "A", "sql": { "columns": [ @@ -1206,7 +1206,7 @@ "editorMode": "code", "format": "table", "rawQuery": true, - "rawSql": "SELECT CAST(m.measurements->'third_compile_generate_time_secs' AS double precision) AS third_compile_generate_time_secs, 
left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = ${branch} AND gpu_name = '${gpu_name}';", + "rawSql": "SELECT CAST(m.measurements->'third_compile_generate_time_secs' AS double precision) AS third_compile_generate_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}';", "refId": "A", "sql": { "columns": [ @@ -1314,7 +1314,7 @@ "editorMode": "code", "format": "table", "rawQuery": true, - "rawSql": "SELECT CAST(m.measurements->'fourth_compile_generate_time_secs' AS double precision) AS fourth_compile_generate_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = ${branch} AND gpu_name = '${gpu_name}';", + "rawSql": "SELECT CAST(m.measurements->'fourth_compile_generate_time_secs' AS double precision) AS fourth_compile_generate_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}';", "refId": "A", "sql": { "columns": [ @@ -1442,7 +1442,7 @@ "editorMode": "code", "format": "table", "rawQuery": true, - "rawSql": "SELECT\n d.cpu_util,\n d.time\nFROM\n benchmarks AS b\n JOIN device_measurements AS d ON b.benchmark_id = d.benchmark_id\nWHERE\n branch = ${branch}", + "rawSql": "SELECT\n d.cpu_util,\n d.time\nFROM\n benchmarks AS b\n JOIN device_measurements AS d ON b.benchmark_id = d.benchmark_id\nWHERE\n branch = '${branch}'", "refId": "A", "sql": { "columns": [ @@ -1627,7 +1627,7 @@ "editorMode": "code", "format": "table", "rawQuery": true, - "rawSql": "SELECT\n b.commit_id,\n d.gpu_util,\n d.time\nFROM\n benchmarks AS b\n JOIN device_measurements AS d ON b.benchmark_id = d.benchmark_id\nWHERE\n branch = ${branch}", + "rawSql": "SELECT\n b.commit_id,\n d.gpu_util,\n d.time\nFROM\n benchmarks AS b\n JOIN device_measurements AS d ON b.benchmark_id = d.benchmark_id\nWHERE\n branch = '${branch}'", "refId": "A", "sql": { "columns": [ @@ -1812,7 +1812,7 @@ "editorMode": "code", "format": "table", "rawQuery": true, - "rawSql": "SELECT d.mem_megabytes, d.time FROM benchmarks AS b JOIN device_measurements AS d ON b.benchmark_id = d.benchmark_id WHERE branch = ${branch}", + "rawSql": "SELECT d.mem_megabytes, d.time FROM benchmarks AS b JOIN device_measurements AS d ON b.benchmark_id = d.benchmark_id WHERE branch = '${branch}'", "refId": "A", "sql": { "columns": [ @@ -1997,7 +1997,7 @@ "editorMode": "code", "format": "table", "rawQuery": true, - "rawSql": "SELECT\n d.gpu_mem_megabytes,\n d.time\nFROM\n benchmarks AS b\n JOIN device_measurements AS d ON b.benchmark_id = d.benchmark_id\nWHERE\n branch = ${branch}", + "rawSql": "SELECT\n d.gpu_mem_megabytes,\n d.time\nFROM\n benchmarks AS b\n JOIN device_measurements AS d ON b.benchmark_id = d.benchmark_id\nWHERE\n branch = '${branch}'", "refId": "A", "sql": { "columns": [ @@ -2099,13 +2099,9 @@ "list": [ { "current": { - "selected": true, - "text": [ - "refactor/benchmarks" - ], - "value": [ - "refactor/benchmarks" - ] + "selected": false, + "text": "main", + "value": "main" }, "datasource": { "type": "grafana-postgresql-datasource", @@ -2129,21 +2125,21 @@ { "current": { "selected": false, - "text": "1728570853117", - "value": "1728570853117" + "text": "1728662868776", + "value": "1728662868776" }, "datasource": { "type": 
"grafana-postgresql-datasource", "uid": "de0dbhs18ho1sc" }, - "definition": "SELECT created_at - INTERVAL '5 secs' FROM benchmarks WHERE branch = ${branch} ORDER BY benchmark_id ASC LIMIT 1;", + "definition": "SELECT created_at - INTERVAL '5 secs' FROM benchmarks WHERE branch = '${branch}' ORDER BY benchmark_id ASC LIMIT 1;", "description": "", "hide": 2, "includeAll": false, "multi": false, "name": "StartTime", "options": [], - "query": "SELECT created_at - INTERVAL '5 secs' FROM benchmarks WHERE branch = ${branch} ORDER BY benchmark_id ASC LIMIT 1;", + "query": "SELECT created_at - INTERVAL '5 secs' FROM benchmarks WHERE branch = '${branch}' ORDER BY benchmark_id ASC LIMIT 1;", "refresh": 2, "regex": "", "skipUrlSync": false, @@ -2153,21 +2149,21 @@ { "current": { "selected": false, - "text": "1728657828802", - "value": "1728657828802" + "text": "1728663254125", + "value": "1728663254125" }, "datasource": { "type": "grafana-postgresql-datasource", "uid": "de0dbhs18ho1sc" }, - "definition": "SELECT time + INTERVAL '5 secs' FROM benchmarks AS b JOIN device_measurements AS d ON b.benchmark_id = d.benchmark_id WHERE branch = ${branch} ORDER BY b.benchmark_id DESC, d.measurement_id DESC LIMIT 1;", + "definition": "SELECT time + INTERVAL '5 secs' FROM benchmarks AS b JOIN device_measurements AS d ON b.benchmark_id = d.benchmark_id WHERE branch = '${branch}' ORDER BY b.benchmark_id DESC, d.measurement_id DESC LIMIT 1;", "description": "", "hide": 2, "includeAll": false, "multi": false, "name": "EndTime", "options": [], - "query": "SELECT time + INTERVAL '5 secs' FROM benchmarks AS b JOIN device_measurements AS d ON b.benchmark_id = d.benchmark_id WHERE branch = ${branch} ORDER BY b.benchmark_id DESC, d.measurement_id DESC LIMIT 1;", + "query": "SELECT time + INTERVAL '5 secs' FROM benchmarks AS b JOIN device_measurements AS d ON b.benchmark_id = d.benchmark_id WHERE branch = '${branch}' ORDER BY b.benchmark_id DESC, d.measurement_id DESC LIMIT 1;", "refresh": 2, "regex": "", "skipUrlSync": false, @@ -2180,6 +2176,10 @@ "text": "NVIDIA A10G", "value": "NVIDIA A10G" }, + "datasource": { + "type": "grafana-postgresql-datasource", + "uid": "de0dbhs18ho1sc" + }, "definition": "SELECT DISTINCT gpu_name FROM benchmarks;", "hide": 0, "includeAll": false, @@ -2197,8 +2197,8 @@ ] }, "time": { - "from": "2024-10-11T13:10:01.641Z", - "to": "2024-10-11T13:25:21.783Z" + "from": "now-1h", + "to": "now" }, "timepicker": { "hidden": false @@ -2206,6 +2206,6 @@ "timezone": "browser", "title": "Transformers benchmarks", "uid": "fdz33iyzln9c0a", - "version": 9, + "version": 11, "weekStart": "" } From 37ea04013b34b39c01b51aeaacd8d56f2c62a7eb Mon Sep 17 00:00:00 2001 From: Joao Gante Date: Sat, 12 Oct 2024 16:45:52 +0100 Subject: [PATCH 008/385] Generate: Fix modern llm `generate` calls with `synced_gpus` (#34095) --- src/transformers/generation/utils.py | 134 +++++++++++++-------------- 1 file changed, 63 insertions(+), 71 deletions(-) diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py index 2225b033aa0a9e..68b8b598ec0978 100644 --- a/src/transformers/generation/utils.py +++ b/src/transformers/generation/utils.py @@ -379,9 +379,10 @@ def prepare_inputs_for_generation( # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens # Exception 1: when passing input_embeds, input_ids may be missing entries # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here + # Exception 3: with synced 
GPUs cache_position may go out of bounds, but we only want dummy token in that case if past_key_values is not None: model_inputs["past_key_values"] = past_key_values - if inputs_embeds is not None: # Exception 1 + if inputs_embeds is not None or cache_position[-1] >= input_ids.shape[1]: # Exception 1 or Exception 3 input_ids = input_ids[:, -cache_position.shape[0] :] elif input_ids.shape[1] != cache_position.shape[0]: # Default case (the "else", a no op, is Exception 2) input_ids = input_ids[:, cache_position] @@ -2609,8 +2610,14 @@ def _dola_decoding( outputs.hidden_states[candidate_premature_layer][:, -1, :] ).to(final_logits.device) + # synced_gpus: don't waste resources running the code we don't need; kwargs must be updated before skipping + model_kwargs = self._update_model_kwargs_for_generation( + outputs, + model_kwargs, + is_encoder_decoder=self.config.is_encoder_decoder, + ) if synced_gpus and this_peer_finished: - continue # don't waste resources running the code we don't need + continue next_token_logits = _dola_select_contrast( candidate_premature_layers, candidate_premature_logits, final_logits @@ -2652,11 +2659,6 @@ def _dola_decoding( input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1) if streamer is not None: streamer.put(next_tokens.cpu()) - model_kwargs = self._update_model_kwargs_for_generation( - outputs, - model_kwargs, - is_encoder_decoder=self.config.is_encoder_decoder, - ) # stop when each sentence is finished unfinished_sequences = unfinished_sequences & ~stopping_criteria(input_ids, scores) @@ -3016,8 +3018,14 @@ def _contrastive_search( ) # contrastive_search main logic end + # synced_gpus: don't waste resources running the code we don't need; kwargs must be updated before skipping + model_kwargs = self._update_model_kwargs_for_generation( + outputs, + model_kwargs, + is_encoder_decoder=self.config.is_encoder_decoder, + ) if synced_gpus and this_peer_finished: - continue # don't waste resources running the code we don't need + continue # finished sentences should have their next token be a padding token if has_eos_stopping_criteria: @@ -3027,11 +3035,6 @@ def _contrastive_search( input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1) if streamer is not None: streamer.put(next_tokens.cpu()) - model_kwargs = self._update_model_kwargs_for_generation( - outputs, - model_kwargs, - is_encoder_decoder=self.config.is_encoder_decoder, - ) # stop when each sentence is finished unfinished_sequences = unfinished_sequences & ~stopping_criteria(input_ids, scores) @@ -3168,8 +3171,14 @@ def _sample( # forward pass to get next token outputs = self(**model_inputs, return_dict=True) + # synced_gpus: don't waste resources running the code we don't need; kwargs must be updated before skipping + model_kwargs = self._update_model_kwargs_for_generation( + outputs, + model_kwargs, + is_encoder_decoder=self.config.is_encoder_decoder, + ) if synced_gpus and this_peer_finished: - continue # don't waste resources running the code we don't need + continue # Clone is needed to avoid keeping a hanging ref to outputs.logits which may be very large for first iteration # (the clone itself is always small) @@ -3214,11 +3223,6 @@ def _sample( input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1) if streamer is not None: streamer.put(next_tokens.cpu()) - model_kwargs = self._update_model_kwargs_for_generation( - outputs, - model_kwargs, - is_encoder_decoder=self.config.is_encoder_decoder, - ) unfinished_sequences = unfinished_sequences & 
~stopping_criteria(input_ids, scores) this_peer_finished = unfinished_sequences.max() == 0 @@ -3415,9 +3419,15 @@ def _beam_search( else: # Unchanged original behavior outputs = self(**model_inputs, return_dict=True) + # synced_gpus: don't waste resources running the code we don't need; kwargs must be updated before skipping + model_kwargs = self._update_model_kwargs_for_generation( + outputs, + model_kwargs, + is_encoder_decoder=self.config.is_encoder_decoder, + ) if synced_gpus and this_peer_finished: cur_len = cur_len + 1 - continue # don't waste resources running the code we don't need + continue # Clone is needed to avoid keeping a hanging ref to outputs.logits which may be very large for first iteration # (the clone itself is always small) @@ -3491,12 +3501,6 @@ def _beam_search( input_ids = torch.cat([input_ids[beam_idx, :], beam_next_tokens.unsqueeze(-1)], dim=-1) - model_kwargs = self._update_model_kwargs_for_generation( - outputs, - model_kwargs, - is_encoder_decoder=self.config.is_encoder_decoder, - ) - # This is needed to properly delete outputs.logits which may be very large for first iteration # Otherwise a reference to outputs is kept which keeps the logits alive in the next iteration # IMPORTANT: Note that this should appear BEFORE the call to _reorder_cache() to save the maximum memory @@ -3670,9 +3674,15 @@ def _group_beam_search( outputs = self(**model_inputs, return_dict=True) + # synced_gpus: don't waste resources running the code we don't need; kwargs must be updated before skipping + model_kwargs = self._update_model_kwargs_for_generation( + outputs, + model_kwargs, + is_encoder_decoder=self.config.is_encoder_decoder, + ) if synced_gpus and this_peer_finished: cur_len = cur_len + 1 - continue # don't waste resources running the code we don't need + continue if output_scores: processed_score = torch.zeros_like(outputs.logits[:, -1, :]) @@ -3782,12 +3792,6 @@ def _group_beam_search( input_ids = torch.cat([input_ids, current_tokens.unsqueeze(-1)], dim=-1) - model_kwargs = self._update_model_kwargs_for_generation( - outputs, - model_kwargs, - is_encoder_decoder=self.config.is_encoder_decoder, - ) - # This is needed to properly delete outputs.logits which may be very large for first iteration # Otherwise a reference to outputs is kept which keeps the logits alive in the next iteration # IMPORTANT: Note that this should appear BEFORE the call to _reorder_cache() to save the maximum memory @@ -3948,9 +3952,15 @@ def _constrained_beam_search( outputs = self(**model_inputs, return_dict=True) + # synced_gpus: don't waste resources running the code we don't need; kwargs must be updated before skipping + model_kwargs = self._update_model_kwargs_for_generation( + outputs, + model_kwargs, + is_encoder_decoder=self.config.is_encoder_decoder, + ) if synced_gpus and this_peer_finished: cur_len = cur_len + 1 - continue # don't waste resources running the code we don't need + continue # Clone is needed to avoid keeping a hanging ref to outputs.logits which may be very large for first iteration # (the clone itself is always small) @@ -4018,11 +4028,6 @@ def _constrained_beam_search( beam_idx = beam_outputs["next_beam_indices"] input_ids = torch.cat([input_ids[beam_idx, :], beam_next_tokens.unsqueeze(-1)], dim=-1) - model_kwargs = self._update_model_kwargs_for_generation( - outputs, - model_kwargs, - is_encoder_decoder=self.config.is_encoder_decoder, - ) # This is needed to properly delete outputs.logits which may be very large for first iteration # Otherwise a reference to outputs is 
kept which keeps the logits alive in the next iteration @@ -4162,17 +4167,8 @@ def _assisted_decoding( unfinished_sequences = torch.ones(batch_size, dtype=torch.long, device=input_ids.device) model_kwargs = self._get_initial_cache_position(input_ids, model_kwargs) - # This is needed if return_dict_in_generate is True - start_from_empty_dynamic_cache = False - past_key_values = model_kwargs.get("past_key_values", None) - if isinstance(past_key_values, DynamicCache) or ( - isinstance(past_key_values, EncoderDecoderCache) - and isinstance(past_key_values.self_attention_cache, DynamicCache) - ): - if past_key_values.get_seq_length() == 0: - start_from_empty_dynamic_cache = True - this_peer_finished = False + is_first_iteration = True # to preserve the same API in the output as other generation methods while self._has_unfinished_sequences(this_peer_finished, synced_gpus, device=input_ids.device): cur_len = input_ids.shape[-1] @@ -4271,34 +4267,36 @@ def _assisted_decoding( # 5. Update the candidate generation strategy if needed candidate_generator.update_candidate_strategy(input_ids, new_logits, n_matches) + # synced_gpus: don't waste resources running the code we don't need; kwargs must be updated before skipping + model_kwargs = self._update_model_kwargs_for_generation( + outputs, + model_kwargs, + is_encoder_decoder=self.config.is_encoder_decoder, + num_new_tokens=n_matches + 1, + ) if synced_gpus and this_peer_finished: - continue # don't waste resources running the code we don't need + continue # Store scores, attentions and hidden_states when required # Assistant: modified to append one tuple element per token, as in the other generation methods. if return_dict_in_generate: + newly_added_length = n_matches + 1 if output_scores: - scores += tuple(new_logits[:, i, :] for i in range(n_matches + 1)) + scores += tuple(new_logits[:, i, :] for i in range(newly_added_length)) if output_logits: - raw_logits += (next_token_logits,) - - if "past_key_values" not in model_kwargs or start_from_empty_dynamic_cache: - added_len = new_cur_len - # set it to false for other iterations - start_from_empty_dynamic_cache = False - else: - added_len = n_matches + 1 + raw_logits += tuple(next_token_logits[:, i, :] for i in range(newly_added_length)) + newly_added_length = new_cur_len if is_first_iteration else newly_added_length if output_attentions: if self.config.is_encoder_decoder: cross_attentions = _split_model_outputs( - cross_attentions, outputs.cross_attentions, cur_len, added_len + cross_attentions, outputs.cross_attentions, cur_len, newly_added_length ) decoder_attentions = _split_model_outputs( decoder_attentions, outputs.decoder_attentions, cur_len, - added_len, + newly_added_length, is_decoder_attention=True, ) else: @@ -4306,28 +4304,22 @@ def _assisted_decoding( decoder_attentions, outputs.attentions, cur_len, - added_len, + newly_added_length, is_decoder_attention=True, ) if output_hidden_states: if self.config.is_encoder_decoder: decoder_hidden_states = _split_model_outputs( - decoder_hidden_states, outputs.decoder_hidden_states, cur_len, added_len + decoder_hidden_states, outputs.decoder_hidden_states, cur_len, newly_added_length ) else: decoder_hidden_states = _split_model_outputs( - decoder_hidden_states, outputs.hidden_states, cur_len, added_len + decoder_hidden_states, outputs.hidden_states, cur_len, newly_added_length ) - model_kwargs = self._update_model_kwargs_for_generation( - outputs, - model_kwargs, - is_encoder_decoder=self.config.is_encoder_decoder, - num_new_tokens=n_matches + 1, - 
) - unfinished_sequences = unfinished_sequences & ~stopping_criteria(input_ids, scores) this_peer_finished = unfinished_sequences.max() == 0 + is_first_iteration = False if streamer is not None: streamer.end() From 7434c0ed21a154136b0145b0245ae9058005abac Mon Sep 17 00:00:00 2001 From: Anton Vlasjuk <73884904+vasqu@users.noreply.github.com> Date: Mon, 14 Oct 2024 08:53:32 +0200 Subject: [PATCH 009/385] Mistral-related models for QnA (#34045) * mistral qna start * mixtral qna * oops * qwen2 qna * qwen2moe qna * add missing input embed methods * add copied to all methods, can't directly from llama due to the prefix * make top level copied from --- docs/source/en/model_doc/mistral.md | 5 + docs/source/en/model_doc/mixtral.md | 4 + docs/source/en/model_doc/qwen2.md | 5 + docs/source/en/model_doc/qwen2_moe.md | 5 + src/transformers/__init__.py | 8 ++ src/transformers/models/auto/modeling_auto.py | 4 + src/transformers/models/mistral/__init__.py | 2 + .../models/mistral/modeling_mistral.py | 101 ++++++++++++++++++ src/transformers/models/mixtral/__init__.py | 2 + .../models/mixtral/modeling_mixtral.py | 101 ++++++++++++++++++ src/transformers/models/qwen2/__init__.py | 2 + .../models/qwen2/modeling_qwen2.py | 101 ++++++++++++++++++ src/transformers/models/qwen2_moe/__init__.py | 2 + .../models/qwen2_moe/modeling_qwen2_moe.py | 101 ++++++++++++++++++ src/transformers/utils/dummy_pt_objects.py | 28 +++++ tests/models/mistral/test_modeling_mistral.py | 10 +- tests/models/mixtral/test_modeling_mixtral.py | 10 +- tests/models/qwen2/test_modeling_qwen2.py | 10 +- .../qwen2_moe/test_modeling_qwen2_moe.py | 10 +- 19 files changed, 507 insertions(+), 4 deletions(-) diff --git a/docs/source/en/model_doc/mistral.md b/docs/source/en/model_doc/mistral.md index 17ce15b2b8c9b9..2be657109a8d46 100644 --- a/docs/source/en/model_doc/mistral.md +++ b/docs/source/en/model_doc/mistral.md @@ -208,6 +208,11 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h [[autodoc]] MistralForTokenClassification - forward +## MistralForQuestionAnswering + +[[autodoc]] MistralForQuestionAnswering +- forward + ## FlaxMistralModel [[autodoc]] FlaxMistralModel diff --git a/docs/source/en/model_doc/mixtral.md b/docs/source/en/model_doc/mixtral.md index 71c7d7921ef005..7afcaa798ecac4 100644 --- a/docs/source/en/model_doc/mixtral.md +++ b/docs/source/en/model_doc/mixtral.md @@ -209,3 +209,7 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h [[autodoc]] MixtralForTokenClassification - forward + +## MixtralForQuestionAnswering +[[autodoc]] MixtralForQuestionAnswering + - forward diff --git a/docs/source/en/model_doc/qwen2.md b/docs/source/en/model_doc/qwen2.md index 16815f2fc1f3cd..78138413c7fb3a 100644 --- a/docs/source/en/model_doc/qwen2.md +++ b/docs/source/en/model_doc/qwen2.md @@ -85,3 +85,8 @@ In the following, we demonstrate how to use `Qwen2-7B-Instruct` for the inferenc [[autodoc]] Qwen2ForTokenClassification - forward + +## Qwen2ForQuestionAnswering + +[[autodoc]] Qwen2ForQuestionAnswering + - forward diff --git a/docs/source/en/model_doc/qwen2_moe.md b/docs/source/en/model_doc/qwen2_moe.md index 9c6dc80beb61e5..3a7391ca194fc5 100644 --- a/docs/source/en/model_doc/qwen2_moe.md +++ b/docs/source/en/model_doc/qwen2_moe.md @@ -80,3 +80,8 @@ In the following, we demonstrate how to use `Qwen1.5-MoE-A2.7B-Chat` for the inf [[autodoc]] Qwen2MoeForTokenClassification - forward + +## Qwen2MoeForQuestionAnswering + +[[autodoc]] Qwen2MoeForQuestionAnswering + - forward diff --git 
a/src/transformers/__init__.py b/src/transformers/__init__.py index ab829c6894c0f9..daffe11987ef5d 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -2709,6 +2709,7 @@ _import_structure["models.mistral"].extend( [ "MistralForCausalLM", + "MistralForQuestionAnswering", "MistralForSequenceClassification", "MistralForTokenClassification", "MistralModel", @@ -2718,6 +2719,7 @@ _import_structure["models.mixtral"].extend( [ "MixtralForCausalLM", + "MixtralForQuestionAnswering", "MixtralForSequenceClassification", "MixtralForTokenClassification", "MixtralModel", @@ -3094,6 +3096,7 @@ _import_structure["models.qwen2"].extend( [ "Qwen2ForCausalLM", + "Qwen2ForQuestionAnswering", "Qwen2ForSequenceClassification", "Qwen2ForTokenClassification", "Qwen2Model", @@ -3110,6 +3113,7 @@ _import_structure["models.qwen2_moe"].extend( [ "Qwen2MoeForCausalLM", + "Qwen2MoeForQuestionAnswering", "Qwen2MoeForSequenceClassification", "Qwen2MoeForTokenClassification", "Qwen2MoeModel", @@ -7323,6 +7327,7 @@ ) from .models.mistral import ( MistralForCausalLM, + MistralForQuestionAnswering, MistralForSequenceClassification, MistralForTokenClassification, MistralModel, @@ -7330,6 +7335,7 @@ ) from .models.mixtral import ( MixtralForCausalLM, + MixtralForQuestionAnswering, MixtralForSequenceClassification, MixtralForTokenClassification, MixtralModel, @@ -7625,6 +7631,7 @@ ) from .models.qwen2 import ( Qwen2ForCausalLM, + Qwen2ForQuestionAnswering, Qwen2ForSequenceClassification, Qwen2ForTokenClassification, Qwen2Model, @@ -7637,6 +7644,7 @@ ) from .models.qwen2_moe import ( Qwen2MoeForCausalLM, + Qwen2MoeForQuestionAnswering, Qwen2MoeForSequenceClassification, Qwen2MoeForTokenClassification, Qwen2MoeModel, diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index aa0d59de52ff4c..dbfcccaa4684dc 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -1046,6 +1046,8 @@ ("mbart", "MBartForQuestionAnswering"), ("mega", "MegaForQuestionAnswering"), ("megatron-bert", "MegatronBertForQuestionAnswering"), + ("mistral", "MistralForQuestionAnswering"), + ("mixtral", "MixtralForQuestionAnswering"), ("mobilebert", "MobileBertForQuestionAnswering"), ("mpnet", "MPNetForQuestionAnswering"), ("mpt", "MptForQuestionAnswering"), @@ -1057,6 +1059,8 @@ ("nystromformer", "NystromformerForQuestionAnswering"), ("opt", "OPTForQuestionAnswering"), ("qdqbert", "QDQBertForQuestionAnswering"), + ("qwen2", "Qwen2ForQuestionAnswering"), + ("qwen2_moe", "Qwen2MoeForQuestionAnswering"), ("reformer", "ReformerForQuestionAnswering"), ("rembert", "RemBertForQuestionAnswering"), ("roberta", "RobertaForQuestionAnswering"), diff --git a/src/transformers/models/mistral/__init__.py b/src/transformers/models/mistral/__init__.py index 93e551e193057d..31441efe6527d2 100644 --- a/src/transformers/models/mistral/__init__.py +++ b/src/transformers/models/mistral/__init__.py @@ -35,6 +35,7 @@ else: _import_structure["modeling_mistral"] = [ "MistralForCausalLM", + "MistralForQuestionAnswering", "MistralModel", "MistralPreTrainedModel", "MistralForSequenceClassification", @@ -78,6 +79,7 @@ else: from .modeling_mistral import ( MistralForCausalLM, + MistralForQuestionAnswering, MistralForSequenceClassification, MistralForTokenClassification, MistralModel, diff --git a/src/transformers/models/mistral/modeling_mistral.py b/src/transformers/models/mistral/modeling_mistral.py index b0ffe3e56e5972..1bb7a3f109ef9b 100644 --- 
a/src/transformers/models/mistral/modeling_mistral.py +++ b/src/transformers/models/mistral/modeling_mistral.py @@ -34,6 +34,7 @@ from ...modeling_outputs import ( BaseModelOutputWithPast, CausalLMOutputWithPast, + QuestionAnsweringModelOutput, SequenceClassifierOutputWithPast, TokenClassifierOutput, ) @@ -1408,3 +1409,103 @@ def forward( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + + +@add_start_docstrings( + """ +The Mistral Model transformer with a span classification head on top for extractive question-answering tasks like +SQuAD (a linear layer on top of the hidden-states output to compute `span start logits` and `span end logits`). + """, + MISTRAL_START_DOCSTRING, +) +# Copied from transformers.models.llama.modeling_llama.LlamaForQuestionAnswering with Llama->Mistral,LLAMA->MISTRAL,transformer->model +class MistralForQuestionAnswering(MistralPreTrainedModel): + base_model_prefix = "model" + + # Copied from models.models.bloom.modeling_bloom.BloomForQuestionAnswering.__init__ with Bloom->Mistral + def __init__(self, config): + super().__init__(config) + self.model = MistralModel(config) + self.qa_outputs = nn.Linear(config.hidden_size, 2) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + start_positions: Optional[torch.LongTensor] = None, + end_positions: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, QuestionAnsweringModelOutput]: + r""" + start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. 
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.model( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1).contiguous() + end_logits = end_logits.squeeze(-1).contiguous() + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1).to(start_logits.device) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1).to(end_logits.device) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions = start_positions.clamp(0, ignored_index) + end_positions = end_positions.clamp(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if not return_dict: + output = (start_logits, end_logits) + outputs[2:] + return ((total_loss,) + output) if total_loss is not None else output + + return QuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/src/transformers/models/mixtral/__init__.py b/src/transformers/models/mixtral/__init__.py index b124d41dfbec10..4ee4834dd24984 100644 --- a/src/transformers/models/mixtral/__init__.py +++ b/src/transformers/models/mixtral/__init__.py @@ -33,6 +33,7 @@ else: _import_structure["modeling_mixtral"] = [ "MixtralForCausalLM", + "MixtralForQuestionAnswering", "MixtralModel", "MixtralPreTrainedModel", "MixtralForSequenceClassification", @@ -51,6 +52,7 @@ else: from .modeling_mixtral import ( MixtralForCausalLM, + MixtralForQuestionAnswering, MixtralForSequenceClassification, MixtralForTokenClassification, MixtralModel, diff --git a/src/transformers/models/mixtral/modeling_mixtral.py b/src/transformers/models/mixtral/modeling_mixtral.py index 9c7fadbb8f885c..9bb0654f030ba7 100644 --- a/src/transformers/models/mixtral/modeling_mixtral.py +++ b/src/transformers/models/mixtral/modeling_mixtral.py @@ -35,6 +35,7 @@ from ...modeling_outputs import ( MoeCausalLMOutputWithPast, MoeModelOutputWithPast, + QuestionAnsweringModelOutput, SequenceClassifierOutputWithPast, TokenClassifierOutput, ) @@ -1644,3 +1645,103 @@ def forward( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + + +@add_start_docstrings( + """ +The Mixtral Model transformer with a span classification head on top for extractive question-answering tasks like +SQuAD (a linear layer on top of the hidden-states output to compute `span start logits` and `span end logits`). 
+ """, + MIXTRAL_START_DOCSTRING, +) +# Copied from transformers.models.mistral.modeling_mistral.MistralForQuestionAnswering with Mistral->Mixtral, MISTRAL->MIXTRAL +class MixtralForQuestionAnswering(MixtralPreTrainedModel): + base_model_prefix = "model" + + # Copied from models.models.bloom.modeling_bloom.BloomForQuestionAnswering.__init__ with Bloom->Mixtral + def __init__(self, config): + super().__init__(config) + self.model = MixtralModel(config) + self.qa_outputs = nn.Linear(config.hidden_size, 2) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + @add_start_docstrings_to_model_forward(MIXTRAL_INPUTS_DOCSTRING) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + start_positions: Optional[torch.LongTensor] = None, + end_positions: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, QuestionAnsweringModelOutput]: + r""" + start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. 
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.model( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1).contiguous() + end_logits = end_logits.squeeze(-1).contiguous() + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1).to(start_logits.device) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1).to(end_logits.device) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions = start_positions.clamp(0, ignored_index) + end_positions = end_positions.clamp(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if not return_dict: + output = (start_logits, end_logits) + outputs[2:] + return ((total_loss,) + output) if total_loss is not None else output + + return QuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/src/transformers/models/qwen2/__init__.py b/src/transformers/models/qwen2/__init__.py index 35df37e91a98c4..301531655a1db5 100644 --- a/src/transformers/models/qwen2/__init__.py +++ b/src/transformers/models/qwen2/__init__.py @@ -42,6 +42,7 @@ else: _import_structure["modeling_qwen2"] = [ "Qwen2ForCausalLM", + "Qwen2ForQuestionAnswering", "Qwen2Model", "Qwen2PreTrainedModel", "Qwen2ForSequenceClassification", @@ -69,6 +70,7 @@ else: from .modeling_qwen2 import ( Qwen2ForCausalLM, + Qwen2ForQuestionAnswering, Qwen2ForSequenceClassification, Qwen2ForTokenClassification, Qwen2Model, diff --git a/src/transformers/models/qwen2/modeling_qwen2.py b/src/transformers/models/qwen2/modeling_qwen2.py index 50f273ba766ca9..2585352fc9594d 100644 --- a/src/transformers/models/qwen2/modeling_qwen2.py +++ b/src/transformers/models/qwen2/modeling_qwen2.py @@ -34,6 +34,7 @@ from ...modeling_outputs import ( BaseModelOutputWithPast, CausalLMOutputWithPast, + QuestionAnsweringModelOutput, SequenceClassifierOutputWithPast, TokenClassifierOutput, ) @@ -1508,3 +1509,103 @@ def forward( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + + +@add_start_docstrings( + """ +The Qwen2 Model transformer with a span classification head on top for extractive question-answering tasks like +SQuAD (a linear layer on top of the hidden-states output to compute `span start logits` and `span end logits`). 
+ """, + QWEN2_START_DOCSTRING, +) +# Copied from transformers.models.mistral.modeling_mistral.MistralForQuestionAnswering with Mistral->Qwen2, MISTRAL->QWEN2 +class Qwen2ForQuestionAnswering(Qwen2PreTrainedModel): + base_model_prefix = "model" + + # Copied from models.models.bloom.modeling_bloom.BloomForQuestionAnswering.__init__ with Bloom->Qwen2 + def __init__(self, config): + super().__init__(config) + self.model = Qwen2Model(config) + self.qa_outputs = nn.Linear(config.hidden_size, 2) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + start_positions: Optional[torch.LongTensor] = None, + end_positions: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, QuestionAnsweringModelOutput]: + r""" + start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. 
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.model( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1).contiguous() + end_logits = end_logits.squeeze(-1).contiguous() + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1).to(start_logits.device) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1).to(end_logits.device) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions = start_positions.clamp(0, ignored_index) + end_positions = end_positions.clamp(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if not return_dict: + output = (start_logits, end_logits) + outputs[2:] + return ((total_loss,) + output) if total_loss is not None else output + + return QuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/src/transformers/models/qwen2_moe/__init__.py b/src/transformers/models/qwen2_moe/__init__.py index e2b73ba2d1f9c4..9520141ea831fc 100644 --- a/src/transformers/models/qwen2_moe/__init__.py +++ b/src/transformers/models/qwen2_moe/__init__.py @@ -33,6 +33,7 @@ else: _import_structure["modeling_qwen2_moe"] = [ "Qwen2MoeForCausalLM", + "Qwen2MoeForQuestionAnswering", "Qwen2MoeModel", "Qwen2MoePreTrainedModel", "Qwen2MoeForSequenceClassification", @@ -51,6 +52,7 @@ else: from .modeling_qwen2_moe import ( Qwen2MoeForCausalLM, + Qwen2MoeForQuestionAnswering, Qwen2MoeForSequenceClassification, Qwen2MoeForTokenClassification, Qwen2MoeModel, diff --git a/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py b/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py index 2ab13b7227ada6..1a5f6e2ff2fbdc 100644 --- a/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py +++ b/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py @@ -35,6 +35,7 @@ from ...modeling_outputs import ( MoeCausalLMOutputWithPast, MoeModelOutputWithPast, + QuestionAnsweringModelOutput, SequenceClassifierOutputWithPast, TokenClassifierOutput, ) @@ -1713,3 +1714,103 @@ def forward( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + + +@add_start_docstrings( + """ +The Qwen2MoE Model transformer with a span classification head on top for extractive question-answering tasks like +SQuAD (a linear layer on top of the hidden-states output to compute `span start logits` and `span end logits`). 
+ """, + QWEN2MOE_START_DOCSTRING, +) +# Copied from transformers.models.mistral.modeling_mistral.MistralForQuestionAnswering with Mistral->Qwen2Moe, MISTRAL->QWEN2MOE +class Qwen2MoeForQuestionAnswering(Qwen2MoePreTrainedModel): + base_model_prefix = "model" + + # Copied from models.models.bloom.modeling_bloom.BloomForQuestionAnswering.__init__ with Bloom->Qwen2Moe + def __init__(self, config): + super().__init__(config) + self.model = Qwen2MoeModel(config) + self.qa_outputs = nn.Linear(config.hidden_size, 2) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + @add_start_docstrings_to_model_forward(QWEN2MOE_INPUTS_DOCSTRING) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + start_positions: Optional[torch.LongTensor] = None, + end_positions: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, QuestionAnsweringModelOutput]: + r""" + start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. 
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.model( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1).contiguous() + end_logits = end_logits.squeeze(-1).contiguous() + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1).to(start_logits.device) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1).to(end_logits.device) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions = start_positions.clamp(0, ignored_index) + end_positions = end_positions.clamp(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if not return_dict: + output = (start_logits, end_logits) + outputs[2:] + return ((total_loss,) + output) if total_loss is not None else output + + return QuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 048de1cc8ae77a..4ca25bc7914a1c 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -5920,6 +5920,13 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +class MistralForQuestionAnswering(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + class MistralForSequenceClassification(metaclass=DummyObject): _backends = ["torch"] @@ -5955,6 +5962,13 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +class MixtralForQuestionAnswering(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + class MixtralForSequenceClassification(metaclass=DummyObject): _backends = ["torch"] @@ -7406,6 +7420,13 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +class Qwen2ForQuestionAnswering(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + class Qwen2ForSequenceClassification(metaclass=DummyObject): _backends = ["torch"] @@ -7462,6 +7483,13 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +class Qwen2MoeForQuestionAnswering(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + class Qwen2MoeForSequenceClassification(metaclass=DummyObject): _backends = ["torch"] diff --git a/tests/models/mistral/test_modeling_mistral.py b/tests/models/mistral/test_modeling_mistral.py index c24436d4b863f7..ff7f1e87bc1972 100644 --- 
a/tests/models/mistral/test_modeling_mistral.py +++ b/tests/models/mistral/test_modeling_mistral.py @@ -47,6 +47,7 @@ from transformers import ( MistralForCausalLM, + MistralForQuestionAnswering, MistralForSequenceClassification, MistralForTokenClassification, MistralModel, @@ -291,7 +292,13 @@ def prepare_config_and_inputs_for_common(self): @require_torch class MistralModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase): all_model_classes = ( - (MistralModel, MistralForCausalLM, MistralForSequenceClassification, MistralForTokenClassification) + ( + MistralModel, + MistralForCausalLM, + MistralForSequenceClassification, + MistralForTokenClassification, + MistralForQuestionAnswering, + ) if is_torch_available() else () ) @@ -303,6 +310,7 @@ class MistralModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi "token-classification": MistralForTokenClassification, "text-generation": MistralForCausalLM, "zero-shot": MistralForSequenceClassification, + "question-answering": MistralForQuestionAnswering, } if is_torch_available() else {} diff --git a/tests/models/mixtral/test_modeling_mixtral.py b/tests/models/mixtral/test_modeling_mixtral.py index a2655bb773dcdd..0e6b2a999e89a9 100644 --- a/tests/models/mixtral/test_modeling_mixtral.py +++ b/tests/models/mixtral/test_modeling_mixtral.py @@ -41,6 +41,7 @@ from transformers import ( MixtralForCausalLM, + MixtralForQuestionAnswering, MixtralForSequenceClassification, MixtralForTokenClassification, MixtralModel, @@ -291,7 +292,13 @@ def prepare_config_and_inputs_for_common(self): # Copied from tests.models.mistral.test_modeling_mistral.MistralModelTest with Mistral->Mixtral class MixtralModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase): all_model_classes = ( - (MixtralModel, MixtralForCausalLM, MixtralForSequenceClassification, MixtralForTokenClassification) + ( + MixtralModel, + MixtralForCausalLM, + MixtralForSequenceClassification, + MixtralForTokenClassification, + MixtralForQuestionAnswering, + ) if is_torch_available() else () ) @@ -303,6 +310,7 @@ class MixtralModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi "token-classification": MixtralForTokenClassification, "text-generation": MixtralForCausalLM, "zero-shot": MixtralForSequenceClassification, + "question-answering": MixtralForQuestionAnswering, } if is_torch_available() else {} diff --git a/tests/models/qwen2/test_modeling_qwen2.py b/tests/models/qwen2/test_modeling_qwen2.py index debcf42ab38fad..5e5c42d4c56630 100644 --- a/tests/models/qwen2/test_modeling_qwen2.py +++ b/tests/models/qwen2/test_modeling_qwen2.py @@ -43,6 +43,7 @@ from transformers import ( Qwen2ForCausalLM, + Qwen2ForQuestionAnswering, Qwen2ForSequenceClassification, Qwen2ForTokenClassification, Qwen2Model, @@ -300,7 +301,13 @@ def prepare_config_and_inputs_for_common(self): # Copied from tests.models.mistral.test_modeling_mistral.MistralModelTest with Mistral->Qwen2 class Qwen2ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase): all_model_classes = ( - (Qwen2Model, Qwen2ForCausalLM, Qwen2ForSequenceClassification, Qwen2ForTokenClassification) + ( + Qwen2Model, + Qwen2ForCausalLM, + Qwen2ForSequenceClassification, + Qwen2ForTokenClassification, + Qwen2ForQuestionAnswering, + ) if is_torch_available() else () ) @@ -312,6 +319,7 @@ class Qwen2ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixi "token-classification": Qwen2ForTokenClassification, 
"text-generation": Qwen2ForCausalLM, "zero-shot": Qwen2ForSequenceClassification, + "question-answering": Qwen2ForQuestionAnswering, } if is_torch_available() else {} diff --git a/tests/models/qwen2_moe/test_modeling_qwen2_moe.py b/tests/models/qwen2_moe/test_modeling_qwen2_moe.py index 60df825c9b8a8d..d7b17b740f9e85 100644 --- a/tests/models/qwen2_moe/test_modeling_qwen2_moe.py +++ b/tests/models/qwen2_moe/test_modeling_qwen2_moe.py @@ -43,6 +43,7 @@ from transformers import ( Qwen2MoeForCausalLM, + Qwen2MoeForQuestionAnswering, Qwen2MoeForSequenceClassification, Qwen2MoeForTokenClassification, Qwen2MoeModel, @@ -327,7 +328,13 @@ def prepare_config_and_inputs_for_common(self): # Copied from tests.models.mistral.test_modeling_mistral.MistralModelTest with Mistral->Qwen2Moe class Qwen2MoeModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase): all_model_classes = ( - (Qwen2MoeModel, Qwen2MoeForCausalLM, Qwen2MoeForSequenceClassification, Qwen2MoeForTokenClassification) + ( + Qwen2MoeModel, + Qwen2MoeForCausalLM, + Qwen2MoeForSequenceClassification, + Qwen2MoeForTokenClassification, + Qwen2MoeForQuestionAnswering, + ) if is_torch_available() else () ) @@ -339,6 +346,7 @@ class Qwen2MoeModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterM "token-classification": Qwen2MoeForTokenClassification, "text-generation": Qwen2MoeForCausalLM, "zero-shot": Qwen2MoeForSequenceClassification, + "question-answering": Qwen2MoeForQuestionAnswering, } if is_torch_available() else {} From 4c439173df979b78d9675332a7abad4336ccf844 Mon Sep 17 00:00:00 2001 From: PengWeixuan <145038191+PengWeixuan@users.noreply.github.com> Date: Mon, 14 Oct 2024 16:15:25 +0800 Subject: [PATCH 010/385] Fix a typo (#34148) Correct a typo "If you want you tokenizer..."->"If you want your tokenizer...." --- src/transformers/models/llama/convert_llama_weights_to_hf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/llama/convert_llama_weights_to_hf.py b/src/transformers/models/llama/convert_llama_weights_to_hf.py index 99aa198bf62c94..2edf41ade3f91e 100644 --- a/src/transformers/models/llama/convert_llama_weights_to_hf.py +++ b/src/transformers/models/llama/convert_llama_weights_to_hf.py @@ -54,7 +54,7 @@ Important note: you need to be able to host the whole model in RAM to execute this script (even if the biggest versions come in several checkpoints they each contain a part of each weight of the model, so we need to load them all in RAM). 
-If you want you tokenizer to add a bos automatically you should update the tokenizer._tokenizers.post_processor: +If you want your tokenizer to add a bos automatically you should update the tokenizer._tokenizers.post_processor: ```py from tokenizers import processors From cb5ca3265fc7aa4d003c160fe1e344a401066b8b Mon Sep 17 00:00:00 2001 From: Vladislav Bronzov <58587565+VladOS95-cyber@users.noreply.github.com> Date: Mon, 14 Oct 2024 10:22:49 +0200 Subject: [PATCH 011/385] Add GGUF for starcoder2 (#34094) * add starcoder2 arch support for gguf * fix q6 test --- docs/source/en/gguf.md | 1 + src/transformers/integrations/ggml.py | 24 +++++++++++++++ tests/quantization/ggml/test_ggml.py | 44 +++++++++++++++++++++++++++ 3 files changed, 69 insertions(+) diff --git a/docs/source/en/gguf.md b/docs/source/en/gguf.md index 7418bbc497e660..01583cedbf4110 100644 --- a/docs/source/en/gguf.md +++ b/docs/source/en/gguf.md @@ -84,6 +84,7 @@ For now the supported model architectures are the architectures that have been v - Falcon - StableLM - GPT2 +- Starcoder2 ## Example usage diff --git a/src/transformers/integrations/ggml.py b/src/transformers/integrations/ggml.py index cc317b18b052ed..7b5828176ffcf4 100644 --- a/src/transformers/integrations/ggml.py +++ b/src/transformers/integrations/ggml.py @@ -176,6 +176,20 @@ "ffn_up": "mlp.c_fc", "ffn_down": "mlp.c_proj", }, + "starcoder2": { + "token_embd": "model.embed_tokens", + "blk": "model.layers", + "ffn_up": "mlp.c_fc", + "ffn_down": "mlp.c_proj", + "ffn_norm": "post_attention_layernorm", + "attn_norm": "input_layernorm", + "attn_q": "self_attn.q_proj", + "attn_v": "self_attn.v_proj", + "attn_k": "self_attn.k_proj", + "attn_output": "self_attn.o_proj", + "output.weight": "lm_head.weight", + "output_norm": "model.norm", + }, } @@ -292,6 +306,15 @@ "attention.head_count": "n_head", "attention.layer_norm_epsilon": "layer_norm_epsilon", }, + "starcoder2": { + "block_count": "num_hidden_layers", + "context_length": "max_position_embeddings", + "embedding_length": "hidden_size", + "feed_forward_length": "intermediate_size", + "attention.head_count": "num_attention_heads", + "attention.head_count_kv": "num_key_value_heads", + "attention.layer_norm_epsilon": "norm_epsilon", + }, } GGUF_TOKENIZER_MAPPING = { @@ -622,6 +645,7 @@ def converted(self) -> Tokenizer: "falcon": GGUFGPTConverter, "stablelm": GGUFGPTConverter, "gpt2": GGUFGPTConverter, + "starcoder2": GGUFGPTConverter, } diff --git a/tests/quantization/ggml/test_ggml.py b/tests/quantization/ggml/test_ggml.py index 3074a19828d25b..6e47d46f07c47e 100644 --- a/tests/quantization/ggml/test_ggml.py +++ b/tests/quantization/ggml/test_ggml.py @@ -54,6 +54,9 @@ class GgufIntegrationTests(unittest.TestCase): gpt2_model_id = "mradermacher/gpt2-GGUF" gpt2_original_model_id = "openai-community/gpt2" gpt2_xl_model_id = "RichardErkhov/openai-community_-_gpt2-xl-gguf" + starcoder2_model_id = "QuantFactory/starcoder2-3b-GGUF" + starcoder2_fp16_model_id = "brittlewis12/starcoder2-3b-GGUF" + starcoder2_original_model_id = "bigcode/starcoder2-3b" # standard quants q4_0_gguf_model_id = "tinyllama-1.1b-chat-v1.0.Q4_0.gguf" @@ -93,6 +96,8 @@ class GgufIntegrationTests(unittest.TestCase): fp16_gpt2_model_id = "gpt2.f16.gguf" q8_gpt2_model_id = "gpt2.Q8_0.gguf" q6_k_gpt2_xl_model_id = "gpt2-xl.Q6_K.gguf" + q6_k_starcoder2_model_id = "starcoder2-3b.Q6_K.gguf" + fp16_starcoder2_gguf_model_id = "starcoder2-3b.fp16.gguf" example_text = "Hello" @@ -650,6 +655,45 @@ def test_stablelm_weights_conversion_fp16(self): 
self.assertTrue(original_params.shape == converted_state_dict[layer_name].shape) torch.testing.assert_close(original_params, converted_state_dict[layer_name]) + def test_starcoder2_weights_conversion_fp16(self): + original_model = AutoModelForCausalLM.from_pretrained( + self.starcoder2_original_model_id, + device_map="auto", + torch_dtype=torch.float16, + ) + + converted_model = AutoModelForCausalLM.from_pretrained( + self.starcoder2_fp16_model_id, + gguf_file=self.fp16_starcoder2_gguf_model_id, + device_map="auto", + torch_dtype=torch.float16, + ) + + converted_state_dict = converted_model.state_dict() + original_state_dict = original_model.state_dict() + + for layer_name, original_params in original_state_dict.items(): + if layer_name in converted_state_dict and layer_name != "lm_head.weight": + # quantized models do not contain "lm_head.weight" layer + self.assertTrue(original_params.shape == converted_state_dict[layer_name].shape) + torch.testing.assert_close(original_params, converted_state_dict[layer_name]) + + def test_starcoder2_q6_k(self): + example_function_text = "def print_hello_world():" + model = AutoModelForCausalLM.from_pretrained( + self.starcoder2_model_id, + gguf_file=self.q6_k_starcoder2_model_id, + device_map="auto", + torch_dtype=torch.float16, + ) + + tokenizer = AutoTokenizer.from_pretrained(self.starcoder2_model_id, gguf_file=self.q6_k_starcoder2_model_id) + text = tokenizer(example_function_text, return_tensors="pt").to(torch_device) + out = model.generate(**text, max_new_tokens=10) + + EXPECTED_TEXT = 'def print_hello_world():\n print("Hello World")\n\ndef print' + self.assertEqual(tokenizer.decode(out[0], skip_special_tokens=True), EXPECTED_TEXT) + def test_tokenization_xnli(self): import tqdm from datasets import load_dataset From 013d3ac2b5924058c78f4df943339b403714ef9a Mon Sep 17 00:00:00 2001 From: Diogo Miguel Silva <43959937+dmgcsilva@users.noreply.github.com> Date: Mon, 14 Oct 2024 09:30:35 +0100 Subject: [PATCH 012/385] Fixed error message in mllama (#34106) --- src/transformers/models/mllama/processing_mllama.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/mllama/processing_mllama.py b/src/transformers/models/mllama/processing_mllama.py index eb092f021f6368..eea98f5bd66ac2 100644 --- a/src/transformers/models/mllama/processing_mllama.py +++ b/src/transformers/models/mllama/processing_mllama.py @@ -302,7 +302,7 @@ def __call__( raise ValueError("No image were provided, but there are image tokens in the prompt") else: raise ValueError( - f"The number of image token ({sum(n_images_in_images)}) should be the same as in the number of provided images ({sum(n_images_in_images)})" + f"The number of image token ({sum(n_images_in_text)}) should be the same as in the number of provided images ({sum(n_images_in_images)})" ) if images is not None: From 5114c9b9e9c1df889f72cb1b7ddd023760bf9233 Mon Sep 17 00:00:00 2001 From: Lysandre Debut Date: Mon, 14 Oct 2024 11:40:39 +0200 Subject: [PATCH 013/385] Specify that users should be careful with their own files (#34153) * Informative * style --- .../models/marian/convert_marian_to_pytorch.py | 7 ++++++- .../models/mobilevitv2/convert_mlcvnets_to_pytorch.py | 7 ++++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/marian/convert_marian_to_pytorch.py b/src/transformers/models/marian/convert_marian_to_pytorch.py index f086e480dfffdc..3938bc794571b4 100644 --- a/src/transformers/models/marian/convert_marian_to_pytorch.py +++ 
b/src/transformers/models/marian/convert_marian_to_pytorch.py @@ -701,7 +701,12 @@ def unzip(zip_path: str, dest_dir: str) -> None: """ parser = argparse.ArgumentParser() # Required parameters - parser.add_argument("--src", type=str, help="path to marian model sub dir", default="en-de") + parser.add_argument( + "--src", + type=str, + help="path to marian model sub dir. yaml.load will be used to load the configuration file, please be wary of which file you're loading.", + default="en-de", + ) parser.add_argument("--dest", type=str, default=None, help="Path to the output PyTorch model.") args = parser.parse_args() diff --git a/src/transformers/models/mobilevitv2/convert_mlcvnets_to_pytorch.py index 518dc949a47b96..e0aed91cd28977 100644 --- a/src/transformers/models/mobilevitv2/convert_mlcvnets_to_pytorch.py +++ b/src/transformers/models/mobilevitv2/convert_mlcvnets_to_pytorch.py @@ -314,7 +314,12 @@ def convert_mobilevitv2_checkpoint(task_name, checkpoint_path, orig_config_path, parser.add_argument( "--orig_checkpoint_path", required=True, type=str, help="Path to the original state dict (.pt file)." ) - parser.add_argument("--orig_config_path", required=True, type=str, help="Path to the original config file.") + parser.add_argument( + "--orig_config_path", + required=True, + type=str, + help="Path to the original config file. yaml.load will be used to load the file, please be wary of which file you're loading.", + ) parser.add_argument( + "--pytorch_dump_folder_path", required=True, type=str, help="Path to the output PyTorch model directory." ) From fa3f2db5c7405a742fcb8f686d3754f70db00977 Mon Sep 17 00:00:00 2001 From: Arthur <48595927+ArthurZucker@users.noreply.github.com> Date: Mon, 14 Oct 2024 11:58:45 +0200 Subject: [PATCH 014/385] Add documentation for docker (#33156) * initial commit * nit --- docker/README.md | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 docker/README.md diff --git a/docker/README.md b/docker/README.md new file mode 100644 index 00000000000000..2a71ab6fb6ec8f --- /dev/null +++ b/docker/README.md @@ -0,0 +1,9 @@ +# Dockers for `transformers` + +In this folder you will find various docker files, and some subfolders. +- dockerfiles (ex: `consistency.dockerfile`) present under `~/docker` are used for our "fast" CIs. You should be able to use them for tasks that only need CPU. For example `torch-light` is a very lightweight container (703MiB). +- subfolders contain dockerfiles used for our `slow` CIs, which *can* be used for GPU tasks, but they are **BIG** as they were not specifically designed for a single model / single task. Thus the `~/docker/transformers-pytorch-gpu` includes additional dependencies to allow us to run ALL model tests (say `librosa` or `tesseract`, which you do not need to run LLMs) + +Note that in both cases, you need to run `uv pip install -e .`, which should take around 5 seconds. We do it outside the dockerfile for the needs of our CI: we checkout a new branch each time, and the `transformers` code is thus updated. + +We are open to contribution, and invite the community to create dockerfiles with potential arguments that properly choose extras depending on the model's dependencies! 
:hugs: \ No newline at end of file From dd4216b766470b90c9492eec8a3af1a203a12c67 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Victor=20Mu=C5=A1tar?= Date: Tue, 15 Oct 2024 10:45:22 +0200 Subject: [PATCH 015/385] Update README.md with Enterprise Hub (#34150) --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index a2325ae037624e..68e2a215d4cdd6 100644 --- a/README.md +++ b/README.md @@ -128,10 +128,10 @@ incredible projects built in the vicinity of transformers. If you own or use a project that you believe should be part of the list, please open a PR to add it! -## If you are looking for custom support from the Hugging Face team +## Serious about AI in your organisation? Build faster with the Hugging Face Enterprise Hub. - - HuggingFace Expert Acceleration Program + + Hugging Face Enterprise Hub
## Quick tour From 23874f59486ccd79bf224ab7b42bc9052c63f1df Mon Sep 17 00:00:00 2001 From: Raushan Turganbay Date: Tue, 15 Oct 2024 11:17:14 +0200 Subject: [PATCH 016/385] Idefics: enable generation tests (#34062) * add idefics * conflicts after merging main * enable tests but need to fix some * fix tests * no print * fix/skip some slow tests * continue not skip * rebasing broken smth, this is the fix --- .../generation/candidate_generator.py | 11 +- src/transformers/generation/utils.py | 4 +- .../models/idefics/modeling_idefics.py | 37 ++-- .../models/idefics2/modeling_idefics2.py | 48 ++--- .../models/idefics3/modeling_idefics3.py | 55 ++--- tests/generation/test_utils.py | 17 +- tests/models/idefics/test_modeling_idefics.py | 188 +++++++++++++++++- .../models/idefics2/test_modeling_idefics2.py | 70 ++++++- .../models/idefics3/test_modeling_idefics3.py | 68 +++++++ tests/test_modeling_common.py | 4 +- 10 files changed, 406 insertions(+), 96 deletions(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index a4c8f79ae925d1..1e4d7a4702453a 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -726,14 +726,23 @@ def _prepare_attention_mask(model_kwargs: Dict[str, Any], new_length: int, is_en elif mask_length_diff > 0: model_kwargs[mask_key] = torch.cat([mask, mask.new_ones((mask.shape[0], mask_length_diff))], dim=-1) + # Handle cross attention models if "cross_attention_mask" in model_kwargs: - # Mllama case is special and has another mask for cross attention model + # Mllama case cross_mask = model_kwargs["cross_attention_mask"] if mask_length_diff < 0: model_kwargs["cross_attention_mask"] = cross_mask[:, :mask_length_diff] elif mask_length_diff > 0: new_mask = cross_mask[:, -1:, :, :].repeat(1, mask_length_diff, 1, 1) model_kwargs["cross_attention_mask"] = torch.cat([cross_mask, new_mask], dim=1) + elif "image_attention_mask" in model_kwargs: + # IDEFICS case + cross_mask = model_kwargs["image_attention_mask"] + if mask_length_diff < 0: + model_kwargs["image_attention_mask"] = cross_mask[:, :mask_length_diff] + elif mask_length_diff > 0: + new_mask = cross_mask[:, -1:, :].repeat(1, mask_length_diff, 1) + model_kwargs["image_attention_mask"] = torch.cat([cross_mask, new_mask], dim=1) return model_kwargs diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py index 68b8b598ec0978..09be2f6bc224ee 100644 --- a/src/transformers/generation/utils.py +++ b/src/transformers/generation/utils.py @@ -2005,6 +2005,7 @@ def generate( # generating the first new token or not, and we only want to use the embeddings for the first new token) if not self.config.is_encoder_decoder and model_input_name == "inputs_embeds": model_kwargs["use_cache"] = True + generation_config.use_cache = True else: model_kwargs["use_cache"] = generation_config.use_cache @@ -4299,7 +4300,8 @@ def _assisted_decoding( newly_added_length, is_decoder_attention=True, ) - else: + # some (V)LLMs have hard requirement on SDPA and thus never return attn + elif outputs.attentions[0] is not None: decoder_attentions = _split_model_outputs( decoder_attentions, outputs.attentions, diff --git a/src/transformers/models/idefics/modeling_idefics.py b/src/transformers/models/idefics/modeling_idefics.py index 02de8d61ae204c..81159ee1c0cd30 100644 --- a/src/transformers/models/idefics/modeling_idefics.py +++ b/src/transformers/models/idefics/modeling_idefics.py @@ -28,12 +28,12 @@ 
from torch import nn from torch.nn import CrossEntropyLoss -from ... import PreTrainedModel from ...activations import ACT2FN from ...cache_utils import Cache, DynamicCache, StaticCache +from ...generation import GenerationMixin from ...modeling_attn_mask_utils import AttentionMaskConverter from ...modeling_outputs import ModelOutput -from ...modeling_utils import PretrainedConfig +from ...modeling_utils import PretrainedConfig, PreTrainedModel from ...pytorch_utils import ALL_LAYERNORM_LAYERS from ...utils import ( add_start_docstrings, @@ -622,11 +622,9 @@ def forward( query_states = self.q_layer_norm(query_states) key_states = self.k_layer_norm(key_states) + causal_mask = attention_mask if attention_mask is not None: - if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): - raise ValueError( - f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" - ) + causal_mask = causal_mask[:, :, :, : key_states.shape[-2]] # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, # Reference: https://github.com/pytorch/pytorch/issues/112577. @@ -638,13 +636,13 @@ def forward( # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling. # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1. - is_causal = True if self.is_causal and attention_mask is None and q_len > 1 else False + is_causal = True if self.is_causal and causal_mask is None and q_len > 1 else False attn_output = torch.nn.functional.scaled_dot_product_attention( query_states, key_states, value_states, - attn_mask=attention_mask, + attn_mask=causal_mask, dropout_p=self.dropout if self.training else 0.0, is_causal=is_causal, ) @@ -1490,7 +1488,7 @@ def _prepare_4d_causal_attention_mask_with_cache_position( return causal_mask -class IdeficsForVisionText2Text(IdeficsPreTrainedModel): +class IdeficsForVisionText2Text(IdeficsPreTrainedModel, GenerationMixin): _keys_to_ignore_on_load_missing = [r"lm_head.weight"] _tied_weights_keys = ["model.embed_tokens.weight", "lm_head.weight"] @@ -1670,6 +1668,7 @@ def prepare_inputs_for_generation( position_ids=None, pixel_values=None, image_hidden_states=None, + image_attention_mask=None, use_cache=None, cache_position=None, **kwargs, @@ -1678,6 +1677,8 @@ def prepare_inputs_for_generation( if past_key_values is not None: if input_ids.shape[1] != cache_position.shape[0]: input_ids = input_ids[:, cache_position] + if image_attention_mask is not None: + image_attention_mask = image_attention_mask[:, -input_ids.shape[1] :] if attention_mask is not None and position_ids is None: # create position_ids on the fly for batch generation @@ -1696,7 +1697,8 @@ def prepare_inputs_for_generation( model_inputs["perceiver_embeddings"] = image_hidden_states else: model_inputs["image_encoder_embeddings"] = image_hidden_states - pixel_values = None + else: + model_inputs["pixel_values"] = pixel_values model_inputs.update( { @@ -1706,21 +1708,13 @@ def prepare_inputs_for_generation( "cache_position": cache_position, "position_ids": position_ids, "attention_mask": attention_mask, - "pixel_values": pixel_values, - "image_attention_mask": kwargs.get("image_attention_mask", None), + "image_attention_mask": 
image_attention_mask, "interpolate_pos_encoding": kwargs.get("interpolate_pos_encoding", False), } ) return model_inputs - @staticmethod - def _expand_inputs_for_generation( - *args, - **model_kwargs, - ): - return expand_inputs_for_generation(*args, **model_kwargs) - def _update_model_kwargs_for_generation( self, outputs: ModelOutput, @@ -1738,7 +1732,10 @@ def _update_model_kwargs_for_generation( if "image_attention_mask" in model_kwargs: image_attention_mask = model_kwargs["image_attention_mask"] last_mask = image_attention_mask[:, -1, :].unsqueeze(1) - model_kwargs["image_attention_mask"] = last_mask + if model_kwargs.get("use_cache", True): + model_kwargs["image_attention_mask"] = last_mask + else: + model_kwargs["image_attention_mask"] = torch.cat([image_attention_mask, last_mask], dim=1) # Get the precomputed image_hidden_states model_kwargs["image_hidden_states"] = outputs.image_hidden_states diff --git a/src/transformers/models/idefics2/modeling_idefics2.py b/src/transformers/models/idefics2/modeling_idefics2.py index b53d0722587d5a..d34e0acde4c814 100644 --- a/src/transformers/models/idefics2/modeling_idefics2.py +++ b/src/transformers/models/idefics2/modeling_idefics2.py @@ -1427,6 +1427,7 @@ def forward( attention_mask=attention_mask, position_ids=position_ids, past_key_values=past_key_values, + use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, @@ -1657,35 +1658,19 @@ def prepare_inputs_for_generation( past_key_values=None, attention_mask=None, inputs_embeds=None, + cache_position=None, + pixel_values=None, + pixel_attention_mask=None, + image_hidden_states=None, num_logits_to_keep=None, **kwargs, ): - past_length = 0 - # Omit tokens covered by past_key_values + # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens if past_key_values is not None: - # Past key values are always initialized with a `Cache` object -> no need for if-else anymore - past_length = past_key_values.get_seq_length() - max_cache_length = past_key_values.get_max_cache_shape() - - # Keep only the unprocessed tokens: - # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where - # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as - # input) - if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: - input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] - # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard - # input_ids based on the past_length. - elif past_length < input_ids.shape[1]: - input_ids = input_ids[:, past_length:] - # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. - - # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. 
- if ( - max_cache_length is not None - and attention_mask is not None - and past_length + input_ids.shape[1] > max_cache_length - ): - attention_mask = attention_mask[:, -max_cache_length:] + if inputs_embeds is not None: # Exception 1 + input_ids = input_ids[:, -cache_position.shape[0] :] + elif input_ids.shape[1] != cache_position.shape[0]: + input_ids = input_ids[:, cache_position] position_ids = kwargs.get("position_ids", None) if attention_mask is not None and position_ids is None: @@ -1696,21 +1681,22 @@ def prepare_inputs_for_generation( position_ids = position_ids[:, -input_ids.shape[1] :] # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and past_length == 0: - model_inputs = {"inputs_embeds": inputs_embeds} + # but IDEFICS requires noth ids and embeds to be present + if inputs_embeds is not None and cache_position[0] == 0: + model_inputs = {"inputs_embeds": inputs_embeds, "input_ids": input_ids} else: - model_inputs = {"input_ids": input_ids} + # The clone here is for the same reason as for `position_ids`. + model_inputs = {"input_ids": input_ids.clone(memory_format=torch.contiguous_format), "inputs_embeds": None} if num_logits_to_keep is not None: model_inputs["num_logits_to_keep"] = num_logits_to_keep - image_hidden_states = kwargs.get("image_hidden_states", None) if image_hidden_states is not None: pixel_values = None pixel_attention_mask = None else: - pixel_values = kwargs.get("pixel_values", None) - pixel_attention_mask = kwargs.get("pixel_attention_mask", None) + pixel_values = pixel_values + pixel_attention_mask = pixel_attention_mask model_inputs.update( { "position_ids": position_ids, diff --git a/src/transformers/models/idefics3/modeling_idefics3.py b/src/transformers/models/idefics3/modeling_idefics3.py index 757391175ea671..e653fd3d2a6ba2 100644 --- a/src/transformers/models/idefics3/modeling_idefics3.py +++ b/src/transformers/models/idefics3/modeling_idefics3.py @@ -24,7 +24,8 @@ from ... import PreTrainedModel from ...activations import ACT2FN -from ...cache_utils import Cache +from ...cache_utils import Cache, DynamicCache +from ...generation import GenerationMixin from ...modeling_attn_mask_utils import _prepare_4d_attention_mask from ...modeling_outputs import BaseModelOutput, ModelOutput from ...utils import ( @@ -953,6 +954,8 @@ def forward( past_seen_tokens = 0 if use_cache: + if past_key_values is None: + past_key_values = DynamicCache() past_seen_tokens = past_key_values.get_seq_length() if inputs_embeds is not None and input_ids is None and past_seen_tokens == 0: @@ -1019,6 +1022,7 @@ def forward( attention_mask=attention_mask, position_ids=position_ids, past_key_values=past_key_values, + use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, @@ -1040,7 +1044,7 @@ def forward( """The Idefics3 Model with a language modeling head. It is made up a SigLIP vision encoder, with a language modeling head on top. 
""", IDEFICS3_START_DOCSTRING, ) -class Idefics3ForConditionalGeneration(Idefics3PreTrainedModel): +class Idefics3ForConditionalGeneration(Idefics3PreTrainedModel, GenerationMixin): _tied_weights_keys = ["lm_head.weight"] # Copied from transformers.models.idefics2.modeling_idefics2.Idefics2ForConditionalGeneration.__init__ with Idefics2->Idefics3 @@ -1245,35 +1249,19 @@ def prepare_inputs_for_generation( past_key_values=None, attention_mask=None, inputs_embeds=None, + cache_position=None, + pixel_values=None, + pixel_attention_mask=None, + image_hidden_states=None, num_logits_to_keep=None, **kwargs, ): - past_length = 0 - # Omit tokens covered by past_key_values + # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens if past_key_values is not None: - # Past key values are always initialized with a `Cache` object -> no need for if-else anymore - past_length = past_key_values.get_seq_length() - max_cache_length = past_key_values.get_max_cache_shape() - - # Keep only the unprocessed tokens: - # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where - # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as - # input) - if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: - input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] - # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard - # input_ids based on the past_length. - elif past_length < input_ids.shape[1]: - input_ids = input_ids[:, past_length:] - # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. - - # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. - if ( - max_cache_length is not None - and attention_mask is not None - and past_length + input_ids.shape[1] > max_cache_length - ): - attention_mask = attention_mask[:, -max_cache_length:] + if inputs_embeds is not None: # Exception 1 + input_ids = input_ids[:, -cache_position.shape[0] :] + elif input_ids.shape[1] != cache_position.shape[0]: + input_ids = input_ids[:, cache_position] position_ids = kwargs.get("position_ids", None) if attention_mask is not None and position_ids is None: @@ -1284,21 +1272,22 @@ def prepare_inputs_for_generation( position_ids = position_ids[:, -input_ids.shape[1] :] # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and past_length == 0: - model_inputs = {"inputs_embeds": inputs_embeds} + # but IDEFICS requires noth ids and embeds to be present + if inputs_embeds is not None and cache_position[0] == 0: + model_inputs = {"inputs_embeds": inputs_embeds, "input_ids": input_ids} else: - model_inputs = {"input_ids": input_ids} + # The clone here is for the same reason as for `position_ids`. 
+ model_inputs = {"input_ids": input_ids.clone(memory_format=torch.contiguous_format), "inputs_embeds": None} if num_logits_to_keep is not None: model_inputs["num_logits_to_keep"] = num_logits_to_keep - image_hidden_states = kwargs.get("image_hidden_states", None) if image_hidden_states is not None: pixel_values = None pixel_attention_mask = None else: - pixel_values = kwargs.get("pixel_values", None) - pixel_attention_mask = kwargs.get("pixel_attention_mask", None) + pixel_values = pixel_values + pixel_attention_mask = pixel_attention_mask model_inputs.update( { "position_ids": position_ids, diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py index 5d92e8ce216aa1..a1bc526566726f 100644 --- a/tests/generation/test_utils.py +++ b/tests/generation/test_utils.py @@ -153,7 +153,11 @@ def _get_logits_processor_kwargs(self, do_sample=False, config=None): # This is a band-aid for VLM models, to ensure they don't generate image/video tokens which would cause them # to crash. On pretrained models this isn't a risk, as they are trained to not generate these tokens. if config is not None: - image_token_index = config.image_token_index if hasattr(config, "image_token_index") else None + image_token_index = ( + config.image_token_index + if getattr(config, "image_token_index", None) is not None + else getattr(config, "image_token_id", None) + ) video_token_index = config.video_token_index if hasattr(config, "video_token_index") else None if image_token_index is not None and image_token_index < config.get_text_config().vocab_size: logits_processor_kwargs["bad_words_ids"].append([image_token_index]) @@ -1496,13 +1500,14 @@ def test_past_key_values_format(self): if "past_key_values" not in outputs: self.skipTest(reason="This model doesn't return `past_key_values`") + text_config = config.get_text_config() num_hidden_layers = ( - getattr(config, "decoder_layers", None) - or getattr(config, "num_decoder_layers", None) - or config.num_hidden_layers + getattr(text_config, "decoder_layers", None) + or getattr(text_config, "num_decoder_layers", None) + or text_config.num_hidden_layers ) - num_attention_heads = getattr(config, "decoder_attention_heads", config.num_attention_heads) - embed_dim = getattr(config, "d_model", config.hidden_size) + num_attention_heads = getattr(text_config, "decoder_attention_heads", text_config.num_attention_heads) + embed_dim = getattr(text_config, "d_model", text_config.hidden_size) per_head_embed_dim = embed_dim // num_attention_heads past_kv = outputs["past_key_values"] diff --git a/tests/models/idefics/test_modeling_idefics.py b/tests/models/idefics/test_modeling_idefics.py index a49bce8d878fb4..62b6ca22293b6f 100644 --- a/tests/models/idefics/test_modeling_idefics.py +++ b/tests/models/idefics/test_modeling_idefics.py @@ -14,8 +14,10 @@ # limitations under the License. 
"""Testing suite for the PyTorch Idefics model.""" +import inspect import unittest +import pytest from parameterized import parameterized from transformers import BitsAndBytesConfig, IdeficsConfig, is_torch_available, is_vision_available @@ -31,6 +33,7 @@ ) from transformers.utils import cached_property +from ...generation.test_utils import GenerationTesterMixin from ...test_configuration_common import ConfigTester from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask from ...test_pipeline_mixin import PipelineTesterMixin @@ -318,6 +321,12 @@ def prepare_pixel_values(self): def test_eager_matches_sdpa_inference(self, torch_dtype: str): self.skipTest(reason="Idefics has a hard requirement on SDPA, skipping this test") + @require_torch_sdpa + @slow + @parameterized.expand([("float16",), ("bfloat16",), ("float32",)]) + def test_eager_matches_sdpa_generate(self): + self.skipTest(reason="Idefics has a hard requirement on SDPA, skipping this test") + @unittest.skipIf(not is_torch_greater_or_equal_than_2_0, reason="pytorch 2.0 or higher is required") @require_torch @@ -580,8 +589,9 @@ def test_eager_matches_sdpa_inference(self, torch_dtype: str): @unittest.skipIf(not is_torch_greater_or_equal_than_2_0, reason="pytorch 2.0 or higher is required") @require_torch -class IdeficsForVisionText2TextTest(IdeficsModelTest, unittest.TestCase): +class IdeficsForVisionText2TextTest(IdeficsModelTest, GenerationTesterMixin, unittest.TestCase): all_model_classes = (IdeficsForVisionText2Text,) if is_torch_available() else () + all_generative_model_classes = (IdeficsForVisionText2Text,) if is_torch_available() else () def setUp(self): self.model_tester = IdeficsModelTester( @@ -590,6 +600,182 @@ def setUp(self): ) self.config_tester = ConfigTester(self, config_class=IdeficsConfig, hidden_size=37) + @pytest.mark.generate + def test_left_padding_compatibility(self): + """Overwrite because IDEFICS needs image attention mask to be also padded""" + # NOTE: left-padding results in small numerical differences. This is expected. 
+ # See https://github.com/huggingface/transformers/issues/25420#issuecomment-1775317535 + + def _prepare_model_kwargs(input_ids, attention_mask, image_attention_mask, signature): + model_kwargs = { + "input_ids": input_ids, + "attention_mask": attention_mask, + "image_attention_mask": image_attention_mask, + } + if "position_ids" in signature: + position_ids = torch.cumsum(attention_mask, dim=-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + model_kwargs["position_ids"] = position_ids + if "cache_position" in signature: + cache_position = torch.arange(input_ids.shape[-1], device=torch_device) + model_kwargs["cache_position"] = cache_position + return model_kwargs + + for model_class in self.all_generative_model_classes: + config, inputs_dict = self.prepare_config_and_inputs_for_generate() + input_ids = inputs_dict.pop("input_ids") + attention_mask = inputs_dict.pop("attention_mask") + if attention_mask is None: + attention_mask = torch.ones_like(input_ids) + image_attention_mask = inputs_dict.pop("image_attention_mask", None) + + model = model_class(config).to(torch_device).eval() + signature = inspect.signature(model.forward).parameters.keys() + + # no cache as some models require special cache classes to be init outside forward + model.generation_config.use_cache = False + + # Without padding + model_kwargs = _prepare_model_kwargs(input_ids, attention_mask, image_attention_mask, signature) + next_logits_wo_padding = model(**model_kwargs, **inputs_dict).logits[:, -1, :] + + # With left-padding (length 32) + # can hardcode pad_token to be 0 as we'll do attn masking anyway + pad_token_id = ( + config.get_text_config().pad_token_id if config.get_text_config().pad_token_id is not None else 0 + ) + pad_size = (input_ids.shape[0], 32) + padding = torch.ones(pad_size, dtype=input_ids.dtype, device=torch_device) * pad_token_id + padded_input_ids = torch.cat((padding, input_ids), dim=1) + padded_attention_mask = torch.cat((torch.zeros_like(padding), attention_mask), dim=1) + + pad_size_img = (input_ids.shape[0], 32, image_attention_mask.shape[-1]) + extra_img_mask = torch.zeros(pad_size_img, dtype=image_attention_mask.dtype, device=torch_device) + padded_image_attention_mask = torch.cat([extra_img_mask, image_attention_mask], dim=1) + model_kwargs = _prepare_model_kwargs( + padded_input_ids, padded_attention_mask, padded_image_attention_mask, signature + ) + next_logits_with_padding = model(**model_kwargs, **inputs_dict).logits[:, -1, :] + + # They should result in very similar logits + self.assertTrue(torch.allclose(next_logits_wo_padding, next_logits_with_padding, atol=1e-5)) + + @pytest.mark.generate + def test_generate_continue_from_past_key_values(self): + """Overwrite because IDEFICS needs image attention mask to be also processed""" + + # Tests that we can continue generating from past key values, returned from a previous `generate` call + for model_class in self.all_generative_model_classes: + config, inputs = self.model_tester.prepare_config_and_inputs_for_common() + + # Let's make it always: + # 1. use cache (for obvious reasons) + # 2. generate to max length (which can be achieved by setting the eos token to an invalid value), which + # would make the test flaky (e.g. EOS is generated on iteration 1 on both generations, but the + # continuation would force it to generate beyond an EOS token) + # 3. ignore `token_type_ids` for simplicity + # 4. 
ignore `forced_eos_token_id`, which requires further manipulation of the continuation inputs and is + # active by default on some models + # 5. ignore `encoder_no_repeat_ngram_size`, which is set by default in some encoder-decoder models. When + # we use their decoder as a stand-alone model, `encoder_no_repeat_ngram_size` actually prevents + # repetition exclusively from the prompt. This test relies on comparing one call vs 2 calls + # with cache, what is considered a prompt is different in the two cases. + + model = model_class(config).to(torch_device) + model.eval() + model.generation_config.pad_token_id = model.generation_config.eos_token_id = -1 + model.generation_config.forced_eos_token_id = None + model.generation_config.encoder_no_repeat_ngram_size = 0 + model.generation_config.use_cache = True + + # Traditional way of generating text, with `return_dict_in_generate` to return the past key values + outputs = model.generate(**inputs, do_sample=False, max_new_tokens=4, return_dict_in_generate=True) + + # Let's generate again, but passing the past key values in between (3 + 1 = 4 tokens). Note that the + # inputs may need to be tweaked across `generate` calls (like the attention mask). + outputs_cached = model.generate(**inputs, do_sample=False, max_new_tokens=3, return_dict_in_generate=True) + + # Continue from the tokens generated above, preparing the inputs accordingly + inputs["past_key_values"] = outputs_cached.past_key_values + new_attention_len = outputs_cached.sequences.shape[-1] + inputs["input_ids"] = outputs_cached.sequences + if "attention_mask" in inputs: + inputs["attention_mask"] = torch.nn.functional.pad( + inputs["attention_mask"], + (0, new_attention_len - inputs["attention_mask"].shape[1]), + mode="constant", + value=1, + ) + if "image_attention_mask" in inputs: + inputs["image_attention_mask"] = inputs["image_attention_mask"][:, -1:, :] + + outputs_cached = model.generate(**inputs, do_sample=False, max_new_tokens=1, return_dict_in_generate=True) + + # The two sets of generated text and past kv should be equal to each other + self.assertListEqual(outputs.sequences.tolist(), outputs_cached.sequences.tolist()) + for layer_idx in range(len(outputs_cached.past_key_values)): + for kv_idx in range(len(outputs_cached.past_key_values[layer_idx])): + self.assertTrue( + torch.allclose( + outputs.past_key_values[layer_idx][kv_idx], + outputs_cached.past_key_values[layer_idx][kv_idx], + ) + ) + + @pytest.mark.generate + def test_generate_without_input_ids(self): + """Overwrite because IDEFICS needs image attention mask to be also processed and requires image at input always.""" + + config, input_dict = self.prepare_config_and_inputs_for_generate() + pixel_values = input_dict["pixel_values"] + image_attention_mask = input_dict["image_attention_mask"][:, -1:, :] + + # hack in case they are equal, otherwise the attn mask will be [0] + if config.bos_token_id == config.pad_token_id: + config.pad_token_id = None + + for model_class in self.all_generative_model_classes: + model = model_class(config).to(torch_device) + model.eval() + + output_ids_generate = model.generate( + pixel_values=pixel_values, + image_attention_mask=image_attention_mask, + do_sample=False, + max_new_tokens=self.max_new_tokens, + remove_invalid_values=True, + ) + self.assertIsNotNone(output_ids_generate) + + def _check_attentions_for_generate( + self, batch_size, attentions, min_length, max_length, config, use_cache=False, num_beam_groups=1 + ): + """ + Overwrite from generation tests because Idefics has only SDPA 
layers. + Do not skip because we still want generation tests to run. Rather we can remove checks for shape. + """ + pass + + @unittest.skip(reason="Contrastive search is not implemented for VLMs that do cross-attn") + def test_contrastive_generate(self): + pass + + @unittest.skip(reason="Contrastive search is not implemented for VLMs that do cross-attn") + def test_contrastive_generate_dict_outputs_use_cache(self): + pass + + @unittest.skip(reason="Contrastive search is not implemented for VLMs that do cross-attn") + def test_contrastive_generate_low_memory(self): + pass + + @unittest.skip(reason="We only test the model that takes in multiple images") + def test_custom_4d_attention_mask(self): + pass + + @unittest.skip(reason="IDEFICS cannot compile due to dynamic control flow when checking inputs") + def test_generate_compile_fullgraph(self): + pass + @unittest.skip(reason="We only test the model that takes in multiple images") def test_model(self): pass diff --git a/tests/models/idefics2/test_modeling_idefics2.py b/tests/models/idefics2/test_modeling_idefics2.py index e02c5b4c9f09c6..f87e87607c2a17 100644 --- a/tests/models/idefics2/test_modeling_idefics2.py +++ b/tests/models/idefics2/test_modeling_idefics2.py @@ -19,6 +19,7 @@ import unittest from io import BytesIO +import pytest import requests from transformers import ( @@ -96,7 +97,7 @@ def __init__( "pad_token_id": 0, # None in the original configuration_mistral, we set it to the unk_token_id "bos_token_id": 1, "eos_token_id": 2, - "image_token_id": 32_001, + "image_token_id": 99, "tie_word_embeddings": False, "rope_theta": 10000.0, "sliding_window": 32, @@ -334,6 +335,7 @@ class Idefics2ForConditionalGenerationModelTest(GenerationTesterMixin, ModelTest """ all_model_classes = (Idefics2ForConditionalGeneration,) if is_torch_available() else () + all_generative_model_classes = (Idefics2ForConditionalGeneration,) if is_torch_available() else () fx_compatible = False test_pruning = False test_resize_embeddings = True @@ -356,6 +358,72 @@ def test_flash_attn_2_generate_padding_right(self): def test_flash_attn_2_inference_padding_right(self): pass + @unittest.skip(reason="Contrastive search is not implemented for VLMs that do cross-attn") + def test_contrastive_generate(self): + pass + + @unittest.skip(reason="Contrastive search is not implemented for VLMs that do cross-attn") + def test_contrastive_generate_dict_outputs_use_cache(self): + pass + + @unittest.skip(reason="Contrastive search is not implemented for VLMs that do cross-attn") + def test_contrastive_generate_low_memory(self): + pass + + @unittest.skip( + reason="Prompt lookup decoding needs a way to indicate `bad_word_ids` that should not be suggested as candidates" + ) + def test_prompt_lookup_decoding_matches_greedy_search(self): + pass + + @unittest.skip(reason=" FlashAttention only support fp16 and bf16 data type") + def test_flash_attn_2_fp32_ln(self): + pass + + @pytest.mark.generate + def test_generate_from_inputs_embeds_decoder_only(self): + # overwrite because IDEFICS needs ids and embeds at the input to be not None + for model_class in self.all_generative_model_classes: + config, inputs_dict = self.prepare_config_and_inputs_for_generate() + + # Ignore: + # a) eos (to always output 20 tokens) and pad (so we don't try to infer the attn mask from the input_ids, + # which would cause a mismatch), + config.pad_token_id = config.eos_token_id = -1 + config.is_decoder = True + model = model_class(config).to(torch_device).eval() + input_ids = inputs_dict.pop("input_ids") + + 
# Traditional way of generating text + outputs_from_ids = model.generate( + input_ids, max_new_tokens=5, return_dict_in_generate=True, output_scores=True + ) + self.assertEqual(outputs_from_ids.sequences.shape, (input_ids.shape[0], input_ids.shape[1] + 5)) + + # Same thing, but from input embeddings (`input_ids` is passed so the prompt is present in the output) + inputs_embeds = model.get_input_embeddings()(input_ids) + outputs_from_embeds = model.generate( + input_ids, + inputs_embeds=inputs_embeds, + max_new_tokens=5, + return_dict_in_generate=True, + output_scores=True, + ) + self.assertListEqual(outputs_from_ids.sequences.tolist(), outputs_from_embeds.sequences.tolist()) + + # But if we pass different inputs_embeds, we should get different outputs (the output text may be the + # same, but the logits will almost surely be different) + random_embeds = torch.rand_like(inputs_embeds) + outputs_from_rand_embeds = model.generate( + input_ids, + inputs_embeds=random_embeds, + max_new_tokens=5, + return_dict_in_generate=True, + output_scores=True, + ) + for i in range(len(outputs_from_rand_embeds.scores)): + self.assertFalse(torch.allclose(outputs_from_embeds.scores[i], outputs_from_rand_embeds.scores[i])) + # We need to override as we need to prepare such that the image token is the last token def test_resize_tokens_embeddings(self): (original_config, inputs_dict) = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/tests/models/idefics3/test_modeling_idefics3.py b/tests/models/idefics3/test_modeling_idefics3.py index 550bb2785e0057..44e06b07c54752 100644 --- a/tests/models/idefics3/test_modeling_idefics3.py +++ b/tests/models/idefics3/test_modeling_idefics3.py @@ -19,6 +19,7 @@ import unittest from io import BytesIO +import pytest import requests from transformers import ( @@ -321,6 +322,7 @@ class Idefics3ForConditionalGenerationModelTest(GenerationTesterMixin, ModelTest """ all_model_classes = (Idefics3ForConditionalGeneration,) if is_torch_available() else () + all_generative_model_classes = (Idefics3ForConditionalGeneration,) if is_torch_available() else () fx_compatible = False test_pruning = False test_resize_embeddings = True @@ -343,6 +345,72 @@ def test_flash_attn_2_generate_padding_right(self): def test_flash_attn_2_inference_padding_right(self): pass + @unittest.skip(reason="Contrastive search is not implemented for VLMs that do cross-attn") + def test_contrastive_generate(self): + pass + + @unittest.skip(reason="Contrastive search is not implemented for VLMs that do cross-attn") + def test_contrastive_generate_dict_outputs_use_cache(self): + pass + + @unittest.skip(reason="Contrastive search is not implemented for VLMs that do cross-attn") + def test_contrastive_generate_low_memory(self): + pass + + @unittest.skip( + reason="Prompt lookup decoding needs a way to indicate `bad_word_ids` that should not be suggested as candidates" + ) + def test_prompt_lookup_decoding_matches_greedy_search(self): + pass + + @unittest.skip(reason=" FlashAttention only support fp16 and bf16 data type") + def test_flash_attn_2_fp32_ln(self): + pass + + @pytest.mark.generate + def test_generate_from_inputs_embeds_decoder_only(self): + # overwrite because IDEFICS needs ids and embeds at the input to be not None + for model_class in self.all_generative_model_classes: + config, inputs_dict = self.prepare_config_and_inputs_for_generate() + + # Ignore: + # a) eos (to always output 20 tokens) and pad (so we don't try to infer the attn mask from the input_ids, + # which would cause a 
mismatch), + config.pad_token_id = config.eos_token_id = -1 + config.is_decoder = True + model = model_class(config).to(torch_device).eval() + input_ids = inputs_dict.pop("input_ids") + + # Traditional way of generating text + outputs_from_ids = model.generate( + input_ids, max_new_tokens=5, return_dict_in_generate=True, output_scores=True + ) + self.assertEqual(outputs_from_ids.sequences.shape, (input_ids.shape[0], input_ids.shape[1] + 5)) + + # Same thing, but from input embeddings (`input_ids` is passed so the prompt is present in the output) + inputs_embeds = model.get_input_embeddings()(input_ids) + outputs_from_embeds = model.generate( + input_ids, + inputs_embeds=inputs_embeds, + max_new_tokens=5, + return_dict_in_generate=True, + output_scores=True, + ) + self.assertListEqual(outputs_from_ids.sequences.tolist(), outputs_from_embeds.sequences.tolist()) + + # But if we pass different inputs_embeds, we should get different outputs (the output text may be the + # same, but the logits will almost surely be different) + random_embeds = torch.rand_like(inputs_embeds) + outputs_from_rand_embeds = model.generate( + input_ids, + inputs_embeds=random_embeds, + max_new_tokens=5, + return_dict_in_generate=True, + output_scores=True, + ) + for i in range(len(outputs_from_rand_embeds.scores)): + self.assertFalse(torch.allclose(outputs_from_embeds.scores[i], outputs_from_rand_embeds.scores[i])) + # We need to override as we need to prepare such that the image token is the last token def test_resize_tokens_embeddings(self): (original_config, inputs_dict) = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index fa4a35391baf25..38c1f5ff1774b7 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -4768,7 +4768,7 @@ def test_flash_attn_2_from_config(self): config, _ = self.model_tester.prepare_config_and_inputs_for_common() # TODO: to change it in the future with other relevant auto classes - fa2_model = AutoModelForCausalLM.from_config( + fa2_model = model_class._from_config( config, attn_implementation="flash_attention_2", torch_dtype=torch.bfloat16 ).to(torch_device) @@ -4789,7 +4789,7 @@ def test_flash_attn_2_from_config(self): with tempfile.TemporaryDirectory() as tmpdirname: fa2_model.save_pretrained(tmpdirname) - model_from_pretrained = AutoModelForCausalLM.from_pretrained(tmpdirname) + model_from_pretrained = model_class.from_pretrained(tmpdirname) self.assertTrue(model_from_pretrained.config._attn_implementation != "flash_attention_2") From 293e6271c69a48b6a66e68978dd3d37601c04c63 Mon Sep 17 00:00:00 2001 From: Prakarsh Kaushik <66624139+RUFFY-369@users.noreply.github.com> Date: Tue, 15 Oct 2024 14:57:54 +0530 Subject: [PATCH 017/385] Add sdpa for Vivit (#33757) * chore:add sdpa to vivit * fix:failing slow test_inference_interpolate_pos_encoding(failing on main branch too) * chore:fix nits * ci:fix repo consistency failure * chore:add info and benchmark to model doc * [run_slow] vivit * chore:revert interpolation test fix for new issue * [run_slow] vivit * [run_slow] vivit * [run_slow] vivit * chore:add fallback for output_attentions being True * [run_slow] vivit * style:make fixup * [run_slow] vivit --- docs/source/en/model_doc/vivit.md | 37 +++++++++++ docs/source/en/perf_infer_gpu_one.md | 1 + .../models/vivit/modeling_vivit.py | 61 ++++++++++++++++++- tests/models/vivit/test_modeling_vivit.py | 6 ++ 4 files changed, 104 insertions(+), 1 deletion(-) diff --git 
a/docs/source/en/model_doc/vivit.md b/docs/source/en/model_doc/vivit.md index 4426493a0ff585..c3e3df14ab988b 100644 --- a/docs/source/en/model_doc/vivit.md +++ b/docs/source/en/model_doc/vivit.md @@ -23,6 +23,43 @@ The abstract from the paper is the following: This model was contributed by [jegormeister](https://huggingface.co/jegormeister). The original code (written in JAX) can be found [here](https://github.com/google-research/scenic/tree/main/scenic/projects/vivit). +### Using Scaled Dot Product Attention (SDPA) + +PyTorch includes a native scaled dot-product attention (SDPA) operator as part of `torch.nn.functional`. This function +encompasses several implementations that can be applied depending on the inputs and the hardware in use. See the +[official documentation](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html) +or the [GPU Inference](https://huggingface.co/docs/transformers/main/en/perf_infer_gpu_one#pytorch-scaled-dot-product-attention) +page for more information. + +SDPA is used by default for `torch>=2.1.1` when an implementation is available, but you may also set +`attn_implementation="sdpa"` in `from_pretrained()` to explicitly request SDPA to be used. + +``` +from transformers import VivitModel +model = VivitModel.from_pretrained("google/vivit-b-16x2-kinetics400", attn_implementation="sdpa", torch_dtype=torch.float16) +... +``` + +For the best speedups, we recommend loading the model in half-precision (e.g. `torch.float16` or `torch.bfloat16`). + +On a local benchmark (A100-40GB, PyTorch 2.3.0, OS Ubuntu 22.04) with `float32` and `google/vivit-b-16x2-kinetics400` model, we saw the following speedups during inference. + +### Training +| num_training_steps | batch_size | is cuda | Speedup (%) | Eager peak mem (MB) | sdpa peak mem (MB) | Mem saving (%) | +|---------------------:|-------------:|----------:|--------------:|----------------------:|---------------------:|-----------------:| +| 100 | 1 | True | 7.122 | 2575.28 | 5932.54 | 130.364 | + + + +### Inference +| num_batches | batch_size | is cuda | is half | Speedup (%) | Mem eager (MB) | Mem BT (MB) | Mem saved (%) | +|---------------|--------------|-----------|-----------|---------------|------------------|---------------|-----------------| +| 20 | 1 | True | False | 15.422 | 715.807 | 317.079 | 125.75 | +| 20 | 2 | True | False | 17.146 | 1234.75 | 447.175 | 176.122 | +| 20 | 4 | True | False | 18.093 | 2275.82 | 709.864 | 220.6 | +| 20 | 8 | True | False | 19.284 | 4358.19 | 1233.24 | 253.393 | + + ## VivitConfig [[autodoc]] VivitConfig diff --git a/docs/source/en/perf_infer_gpu_one.md b/docs/source/en/perf_infer_gpu_one.md index cf2dac617ffa3d..82d7f50f77d902 100644 --- a/docs/source/en/perf_infer_gpu_one.md +++ b/docs/source/en/perf_infer_gpu_one.md @@ -278,6 +278,7 @@ For now, Transformers supports SDPA inference and training for the following arc * [ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae#transformers.ViTMAEModel) * [ViTMSN](https://huggingface.co/docs/transformers/model_doc/vit_msn#transformers.ViTMSNModel) * [VideoMAE](https://huggingface.co/docs/transformers/model_doc/videomae#transformers.VideoMAEModell) +* [ViViT](https://huggingface.co/docs/transformers/model_doc/vivit#transformers.VivitModel) * [wav2vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2#transformers.Wav2Vec2Model) * [Whisper](https://huggingface.co/docs/transformers/model_doc/whisper#transformers.WhisperModel) * 
[XLM-RoBERTa](https://huggingface.co/docs/transformers/model_doc/xlm-roberta#transformers.XLMRobertaModel) diff --git a/src/transformers/models/vivit/modeling_vivit.py b/src/transformers/models/vivit/modeling_vivit.py index 3d543503284489..9b6516a25af45b 100755 --- a/src/transformers/models/vivit/modeling_vivit.py +++ b/src/transformers/models/vivit/modeling_vivit.py @@ -227,6 +227,51 @@ def forward( return outputs +# Adapted from transformers.models.vit.modeling_vit.ViTSdpaSelfAttention with ViT->Vivit +class VivitSdpaSelfAttention(VivitSelfAttention): + def __init__(self, config: VivitConfig) -> None: + super().__init__(config) + self.attention_probs_dropout_prob = config.attention_probs_dropout_prob + + def forward( + self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False + ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: + if output_attentions or head_mask is not None: + logger.warning_once( + "VivitSdpaSelfAttention is used but `torch.nn.functional.scaled_dot_product_attention` does not support" + " `output_attentions=True` or `head_mask`. Falling back to the manual attention implementation, but specifying" + " the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be" + ' removed using the argument `attn_implementation="eager"` when loading the model.' + ) + return super().forward( + hidden_states, + head_mask, + output_attentions, + ) + + mixed_query_layer = self.query(hidden_states) + + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + query_layer = self.transpose_for_scores(mixed_query_layer) + + context_layer = torch.nn.functional.scaled_dot_product_attention( + query_layer, + key_layer, + value_layer, + head_mask, + self.attention_probs_dropout_prob if self.training else 0.0, + is_causal=False, + scale=None, + ) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(new_context_layer_shape) + + return context_layer, None + + # Copied from transformers.models.vit.modeling_vit.ViTSelfOutput with ViT->Vivit class VivitSelfOutput(nn.Module): """ @@ -286,6 +331,13 @@ def forward( return outputs +# Copied from transformers.models.vit.modeling_vit.ViTSdpaAttention with ViT->Vivit +class VivitSdpaAttention(VivitAttention): + def __init__(self, config: VivitConfig) -> None: + super().__init__(config) + self.attention = VivitSdpaSelfAttention(config) + + class VivitIntermediate(nn.Module): def __init__(self, config): super().__init__() @@ -320,6 +372,12 @@ def forward(self, hidden_states, input_tensor): return hidden_states +VIVIT_ATTENTION_CLASSES = { + "eager": VivitAttention, + "sdpa": VivitSdpaAttention, +} + + class VivitLayer(nn.Module): """This corresponds to the EncoderBlock class in the scenic/vivit implementation.""" @@ -327,7 +385,7 @@ def __init__(self, config): super().__init__() self.chunk_size_feed_forward = config.chunk_size_feed_forward self.seq_len_dim = 1 - self.attention = VivitAttention(config) + self.attention = VIVIT_ATTENTION_CLASSES[config._attn_implementation](config) self.intermediate = VivitIntermediate(config) self.output = VivitOutput(config) self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) @@ -436,6 +494,7 @@ class VivitPreTrainedModel(PreTrainedModel): main_input_name = "pixel_values" 
supports_gradient_checkpointing = True _no_split_modules = [] + _supports_sdpa = True def _init_weights(self, module): """Initialize the weights""" diff --git a/tests/models/vivit/test_modeling_vivit.py b/tests/models/vivit/test_modeling_vivit.py index 7cce77e6fc0019..8e6b0825948d40 100644 --- a/tests/models/vivit/test_modeling_vivit.py +++ b/tests/models/vivit/test_modeling_vivit.py @@ -65,6 +65,8 @@ def __init__( layer_norm_eps=1e-06, qkv_bias=True, scope=None, + attn_implementation="eager", + mask_ratio=0.5, ): self.parent = parent self.batch_size = batch_size @@ -86,12 +88,15 @@ def __init__( self.layer_norm_eps = layer_norm_eps self.qkv_bias = qkv_bias self.scope = scope + self.attn_implementation = attn_implementation self.seq_length = ( (self.image_size // self.tubelet_size[2]) * (self.image_size // self.tubelet_size[1]) * (self.num_frames // self.tubelet_size[0]) ) + 1 # CLS token + self.mask_ratio = mask_ratio + self.num_masks = int(mask_ratio * self.seq_length) def prepare_config_and_inputs(self): pixel_values = floats_tensor( @@ -122,6 +127,7 @@ def get_config(self): initializer_range=self.initializer_range, layer_norm_eps=self.layer_norm_eps, qkv_bias=self.qkv_bias, + attn_implementation=self.attn_implementation, ) config.num_labels = self.num_labels return config From 4de1bdbf637fe6411c104c62ab385f660bfb1064 Mon Sep 17 00:00:00 2001 From: Shikhar Mishra <77426122+Itssshikhar@users.noreply.github.com> Date: Tue, 15 Oct 2024 17:18:10 +0530 Subject: [PATCH 018/385] Fix FSDP resume Initialization issue (#34032) * Fix FSDP Initialization for resume training * Added init_fsdp function to work with dummy values * Fix FSDP initialization for resuming training * Added CUDA decorator for tests * Added torch_gpu decorator to FSDP tests * Fixup for failing code quality tests --- src/transformers/trainer.py | 37 +++++++++++++++++++++++++++++++++++ tests/trainer/test_trainer.py | 31 +++++++++++++++++++++++++++++ 2 files changed, 68 insertions(+) diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 20b9f6dad231d1..5131676c953dc1 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -273,6 +273,39 @@ def _get_fsdp_ckpt_kwargs(): return {} +def _init_fsdp(model, accelerator, device): + """ + Initialize Fully Sharded Data Parallel (FSDP) for the model. + + This function is needed to properly initialize FSDP when resuming from a checkpoint. + It runs a forward pass with dummy inputs to ensure FSDP is fully initialized. + See https://github.com/huggingface/transformers/issues/31892 for more details. + + Args: + model: The model to initialize with FSDP. + accelerator: The Accelerator object. + device: The device to run the model on. + + Returns: + The initialized FSDP model. + """ + model = accelerator.prepare(model) + model.train() + with torch.no_grad(): + # Run a forward pass with dummy inputs to initialize FSDP + dummy_input = { + name: torch.ones( + (1, 512), + dtype=torch.long, + device=device, + ) + for name in model.forward.__code__.co_varnames + if name != "self" + } + _ = model(**dummy_input) + return model + + if TYPE_CHECKING: import optuna @@ -601,6 +634,10 @@ def __init__( " `Trainer`. Make sure the lines `import torch_xla.core.xla_model as xm` and" " `model.to(xm.xla_device())` is performed before the optimizer creation in your script." 
) + + if self.is_fsdp_enabled: + self.model = _init_fsdp(self.model, self.accelerator, self.args.device) + if (self.is_fsdp_xla_enabled or self.is_fsdp_enabled) and ( self.optimizer is not None or self.lr_scheduler is not None ): diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index cbc93faf50e7a3..8feb5d92e89e43 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -4914,3 +4914,34 @@ def test_get_optimizer_group(self): param = next(model.parameters()) group = trainer.get_optimizer_group(param) self.assertIn(param, group["params"]) + + +@require_torch_gpu +@require_torch +@require_accelerate +class TestFSDPInitialization(unittest.TestCase): + def test_fsdp_initialization(self): + config = RegressionModelConfig(a=1, b=1, double_output=False) + model = RegressionPreTrainedModel(config) + + with tempfile.TemporaryDirectory() as tmp_dir: + training_args = TrainingArguments( + output_dir=tmp_dir, + fsdp=True, + fsdp_config={"min_num_params": 1}, + no_cuda=True, + ) + trainer = Trainer(model=model, args=training_args) + + # Check for FSDP enabled + self.assertTrue(trainer.is_fsdp_enabled) + + # Check if model is wrapped with FSDP + from torch.distributed.fsdp import FullyShardedDataParallel as FSDP + + self.assertTrue(trainer.model, FSDP) + + # Running a forward pass to ensure FSDP is initialized + dummy_input = torch.ones((1, 1), dtype=torch.float) + output = trainer.model(dummy_input) + self.assertTrue(output) From 5ee9e786d115154c0c58dc961e39105a205ccac0 Mon Sep 17 00:00:00 2001 From: Subhalingam D Date: Tue, 15 Oct 2024 17:36:20 +0530 Subject: [PATCH 019/385] Fix default behaviour in TextClassificationPipeline for regression problem type (#34066) * update code * update docstrings * update tests --- src/transformers/pipelines/text_classification.py | 11 ++++++++--- tests/pipelines/test_pipelines_text_classification.py | 6 ++++++ 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/src/transformers/pipelines/text_classification.py b/src/transformers/pipelines/text_classification.py index 21ca70c2ac50aa..dadb29c386b41e 100644 --- a/src/transformers/pipelines/text_classification.py +++ b/src/transformers/pipelines/text_classification.py @@ -40,7 +40,8 @@ class ClassificationFunction(ExplicitEnum): The function to apply to the model outputs in order to retrieve the scores. Accepts four different values: - `"default"`: if the model has a single label, will apply the sigmoid function on the output. If the model - has several labels, will apply the softmax function on the output. + has several labels, will apply the softmax function on the output. In case of regression tasks, will not + apply any function on the output. - `"sigmoid"`: Applies the sigmoid function on the output. - `"softmax"`: Applies the softmax function on the output. - `"none"`: Does not apply any function on the output.""", @@ -69,7 +70,8 @@ class TextClassificationPipeline(Pipeline): `"sentiment-analysis"` (for classifying sequences according to positive or negative sentiments). If multiple classification labels are available (`model.config.num_labels >= 2`), the pipeline will run a softmax - over the results. If there is a single label, the pipeline will run a sigmoid over the result. + over the results. If there is a single label, the pipeline will run a sigmoid over the result. In case of regression + tasks (`model.config.problem_type == "regression"`), will not apply any function on the output. 
The models that this pipeline can use are models that have been fine-tuned on a sequence classification task. See the up-to-date list of available models on @@ -135,6 +137,7 @@ def __call__(self, inputs, **kwargs): If this argument is not specified, then it will apply the following functions according to the number of labels: + - If problem type is regression, will not apply any function on the output. - If the model has a single label, will apply the sigmoid function on the output. - If the model has several labels, will apply the softmax function on the output. @@ -192,7 +195,9 @@ def postprocess(self, model_outputs, function_to_apply=None, top_k=1, _legacy=Tr # the more natural result containing the list. # Default value before `set_parameters` if function_to_apply is None: - if self.model.config.problem_type == "multi_label_classification" or self.model.config.num_labels == 1: + if self.model.config.problem_type == "regression": + function_to_apply = ClassificationFunction.NONE + elif self.model.config.problem_type == "multi_label_classification" or self.model.config.num_labels == 1: function_to_apply = ClassificationFunction.SIGMOID elif self.model.config.problem_type == "single_label_classification" or self.model.config.num_labels > 1: function_to_apply = ClassificationFunction.SOFTMAX diff --git a/tests/pipelines/test_pipelines_text_classification.py b/tests/pipelines/test_pipelines_text_classification.py index 1f3b31b8583265..23625f0d77b39e 100644 --- a/tests/pipelines/test_pipelines_text_classification.py +++ b/tests/pipelines/test_pipelines_text_classification.py @@ -108,6 +108,12 @@ def test_small_model_pt(self): ], ) + # Do not apply any function to output for regression tasks + # hack: changing problem_type artifically (so keep this test at last) + text_classifier.model.config.problem_type = "regression" + outputs = text_classifier("This is great !") + self.assertEqual(nested_simplify(outputs), [{"label": "LABEL_0", "score": 0.01}]) + @require_torch def test_accepts_torch_device(self): text_classifier = pipeline( From d314ce70bffc6d9df137ef14f490423fbacfba85 Mon Sep 17 00:00:00 2001 From: Joao Gante Date: Tue, 15 Oct 2024 13:32:09 +0100 Subject: [PATCH 020/385] Generate: move `logits` to same device as `input_ids` (#34076) tmp commit --- src/transformers/generation/utils.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py index 09be2f6bc224ee..83a489bb13f36c 100644 --- a/src/transformers/generation/utils.py +++ b/src/transformers/generation/utils.py @@ -2623,6 +2623,7 @@ def _dola_decoding( next_token_logits = _dola_select_contrast( candidate_premature_layers, candidate_premature_logits, final_logits ) + next_token_logits = next_token_logits.to(input_ids.device) # pre-process distribution next_token_scores = logits_processor(input_ids, next_token_logits) @@ -2794,6 +2795,7 @@ def _contrastive_search( # (the clone itself is always small) # .float() is needed to retain precision for later logits manipulations logit_for_next_step = outputs.logits[:, -1, :].clone().float() + logit_for_next_step = logit_for_next_step.to(input_ids.device) model_kwargs = self._update_model_kwargs_for_generation( outputs, @@ -2988,6 +2990,7 @@ def _contrastive_search( next_past_key_values = tuple(new_key_values) logit_for_next_step = torch.stack(torch.split(logits, top_k))[range(batch_size), selected_idx, :] + logit_for_next_step = logit_for_next_step.to(input_ids.device) # Rebuilds the relevant parts of the model 
output for the selected token, for use in the next iteration if self.config.is_encoder_decoder: @@ -3184,6 +3187,7 @@ def _sample( # Clone is needed to avoid keeping a hanging ref to outputs.logits which may be very large for first iteration # (the clone itself is always small) next_token_logits = outputs.logits.clone()[:, -1, :].float() + next_token_logits = next_token_logits.to(input_ids.device) # pre-process distribution next_token_scores = logits_processor(input_ids, next_token_logits) @@ -3434,6 +3438,7 @@ def _beam_search( # (the clone itself is always small) # .float() is needed to retain precision for later logits manipulations next_token_logits = outputs.logits[:, -1, :].clone().float() + next_token_logits = next_token_logits.to(input_ids.device) next_token_scores = nn.functional.log_softmax( next_token_logits, dim=-1 ) # (batch_size * num_beams, vocab_size) @@ -3691,6 +3696,7 @@ def _group_beam_search( # Clone is needed to avoid keeping a hanging ref to outputs.logits which may be very large for first iteration # (the clone itself is always small) raw_logit_score = outputs.logits[:, -1, :].clone() + raw_logit_score = raw_logit_score.to(input_ids.device) for beam_group_idx in range(num_beam_groups): group_start_idx = beam_group_idx * num_sub_beams @@ -3710,6 +3716,7 @@ def _group_beam_search( # No need to clone() the logits here as they will not retain outputs.logits at the end of the loop # .float() is needed to retain precision for later logits manipulations next_token_logits = outputs.logits[batch_group_indices, -1, :].float() + next_token_logits = next_token_logits.to(input_ids.device) next_token_scores = nn.functional.log_softmax( next_token_logits, dim=-1 @@ -3967,6 +3974,7 @@ def _constrained_beam_search( # (the clone itself is always small) # .float() is needed to retain precision for later logits manipulations next_token_logits = outputs.logits[:, -1, :].clone().float() + next_token_logits = next_token_logits.to(input_ids.device) next_token_scores = nn.functional.log_softmax( next_token_logits, dim=-1 ) # (batch_size * num_beams, vocab_size) @@ -4215,6 +4223,7 @@ def _assisted_decoding( # 2.3. 
Process the new logits # .float() is needed to retain precision for later logits manipulations new_logits = outputs.logits[:, -candidate_length - 1 :].float() # excludes the input prompt if present + new_logits = new_logits.to(input_ids.device) next_token_logits = new_logits.clone() if len(logits_processor) > 0: for i in range(candidate_length + 1): From 65442718c478aed0183155cd69decb8fc7e47f5f Mon Sep 17 00:00:00 2001 From: Yoni Gozlan <74535834+yonigozlan@users.noreply.github.com> Date: Tue, 15 Oct 2024 14:55:09 +0200 Subject: [PATCH 021/385] Add support for inheritance from class with different suffix in modular (#34077) * add support for different suffix in modular * add dummy example, pull new changes for modular * nide lines order change --- .../modeling_new_task_model.py | 546 ++++++++++++++++++ .../modular_new_task_model.py | 84 +++ utils/modular_model_converter.py | 77 ++- 3 files changed, 696 insertions(+), 11 deletions(-) create mode 100644 examples/modular-transformers/modeling_new_task_model.py create mode 100644 examples/modular-transformers/modular_new_task_model.py diff --git a/examples/modular-transformers/modeling_new_task_model.py b/examples/modular-transformers/modeling_new_task_model.py new file mode 100644 index 00000000000000..640331ace1d57b --- /dev/null +++ b/examples/modular-transformers/modeling_new_task_model.py @@ -0,0 +1,546 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from examples/modular-transformers/modular_new_task_model.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_new_task_model.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +from dataclasses import dataclass +from typing import ClassVar, List, Optional, Tuple, Union + +import torch +import torch.utils.checkpoint +from torch import nn + +from ...cache_utils import Cache, StaticCache +from ...generation import GenerationMixin +from ...modeling_utils import PreTrainedModel +from ...utils import ( + ModelOutput, + add_start_docstrings, + add_start_docstrings_to_model_forward, + is_flash_attn_2_available, + logging, + replace_return_docstrings, +) +from .configuration_new_task_model import NewTaskModelConfig + + +if is_flash_attn_2_available(): + from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa + +from ..auto import AutoModel, AutoModelForCausalLM + + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "NewTaskModelConfig" + + +# Adapted from transformers.models.llama.modeling_llama.LlamaModel._prepare_4d_causal_attention_mask_with_cache_position +# But NewTaskModel has no causal mask on prefix +def _prepare_4d_causal_attention_mask_with_cache_position( + attention_mask: torch.Tensor, + sequence_length: int, + target_length: int, + dtype: torch.dtype, + device: torch.device, + min_dtype: float, + cache_position: torch.Tensor, + batch_size: int, + is_training: bool = False, + token_type_ids: torch.Tensor = None, +): + """ + Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape + `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing. + + Args: + attention_mask (`torch.Tensor`): + A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape `(batch_size, 1, query_length, key_value_length)`. 
+ sequence_length (`int`): + The sequence length being processed. + target_length (`int`): + The target length: when generating with static cache, the mask should be as long as the static cache, to account for the 0 padding, the part of the cache that is not filled yet. + dtype (`torch.dtype`): + The dtype to use for the 4D attention mask. + device (`torch.device`): + The device to plcae the 4D attention mask on. + min_dtype (`float`): + The minimum value representable with the dtype `dtype`. + cache_position (`torch.Tensor`): + Indices depicting the position of the input sequence tokens in the sequence. + batch_size (`torch.Tensor`): + Batch size. + is_training (`bool`): + Whether the model is in training mode or in inference. The condition is checked by presence/absence of `token_type_ids/labels` + """ + if attention_mask is not None and attention_mask.dim() == 4: + # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing. + causal_mask = attention_mask + else: + causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device) + # Causal diagonal mask only if training, otherwise attend to the whole prefix. Training-specific attn for prefix is handled below + if sequence_length != 1: + if is_training: + causal_mask = torch.triu(causal_mask, diagonal=1) + else: + causal_mask[:, :sequence_length] = 0.0 + + causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1) + causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1) + if attention_mask is not None: + causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit + mask_length = attention_mask.shape[-1] + padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(causal_mask.device) + padding_mask = padding_mask == 0 + causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill( + padding_mask, min_dtype + ) + # we are training thus we need to create a full mask on the image + prefix but causal on suffix + if is_training: + causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill( + token_type_ids[:, None, None, :].to(causal_mask.device) == 0, 0 + ) + return causal_mask + + +@dataclass +class NewTaskModelCausalLMOutputWithPast(ModelOutput): + """ + Base class for NewTaskModelcausal language model (or autoregressive) outputs. + + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Language modeling loss (for next-token prediction). + logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.text_config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) + + Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see + `past_key_values` input) to speed up sequential decoding. 
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + image_hidden_states (`torch.FloatTensor`, *optional*): + A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`. + image_hidden_states of the model produced by the vision encoder after projecting last hidden state. + """ + + loss: Optional[torch.FloatTensor] = None + logits: torch.FloatTensor = None + past_key_values: Optional[Union[List[torch.FloatTensor], Cache]] = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + image_hidden_states: Optional[torch.FloatTensor] = None + + +class NewTaskModelMultiModalProjector(nn.Module): + def __init__(self, config: NewTaskModelConfig): + super().__init__() + self.linear = nn.Linear(config.vision_config.hidden_size, config.vision_config.projection_dim, bias=True) + + def forward(self, image_features): + hidden_states = self.linear(image_features) + + return hidden_states + + +NEW_TASK_MODEL_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`NewTaskModelConfig`] or [`NewTaskModelVisionConfig`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. 
+""" + + +@add_start_docstrings( + "The bare LLaMA Model outputting raw hidden-states without any specific head on top.", + NEW_TASK_MODEL_START_DOCSTRING, +) +class NewTaskModelPreTrainedModel(PreTrainedModel): + config_class = NewTaskModelConfig + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["NewTaskModelMultiModalProjector"] + _skip_keys_device_placement = "past_key_values" + _supports_flash_attn_2 = False + _supports_cache_class = True + _supports_quantized_cache = True + _supports_static_cache = True + _supports_sdpa = True + _supports_cache_class = True + + def _init_weights(self, module): + # important: this ported version of NewTaskModelisn't meant for training from scratch - only + # inference and fine-tuning + std = ( + self.config.initializer_range + if hasattr(self.config, "initializer_range") + else self.config.text_config.initializer_range + ) + + if hasattr(module, "class_embedding"): + module.class_embedding.data.normal_(mean=0.0, std=std) + + if isinstance(module, (nn.Linear, nn.Conv2d)): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + @property + def _supports_sdpa(self): + """ + Retrieve language_model's attribute to check whether the model supports + SDPA or not. + """ + return self.language_model._supports_sdpa + + +NEW_TASK_MODEL_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)): + The tensors corresponding to the input images. Pixel values can be obtained using + [`AutoImageProcessor`]. See [`SiglipImageProcessor.__call__`] for details ([]`NewTaskModelProcessor`] uses + [`SiglipImageProcessor`] for processing images). + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see + `past_key_values`). + + If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] + and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more + information on the default strategy. + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.n_positions - 1]`. 
[What are position IDs?](../glossary#position-ids) + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape + `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): + Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`, + this tensor is not affected by padding. It is used to update the cache in the correct position and to infer + the complete sequence length. 
+""" + + +@add_start_docstrings( + """The NEW_TASK_MODEL model which consists of a vision backbone and a language model.""", + NEW_TASK_MODEL_START_DOCSTRING, +) +class NewTaskModelForNewTask(NewTaskModelPreTrainedModel, GenerationMixin): + main_input_name: ClassVar[str] = "doc_input_ids" # transformers-related + + def __init__(self, config): + super().__init__(config) + self.vision_tower = AutoModel.from_config(config=config.vision_config) + self.multi_modal_projector = NewTaskModelMultiModalProjector(config) + self.vocab_size = config.text_config.vocab_size + self._attn_implementation = config._attn_implementation + + language_model = AutoModelForCausalLM.from_config( + config=config.text_config, attn_implementation=self._attn_implementation + ) + + if language_model._tied_weights_keys is not None: + self._tied_weights_keys = [f"language_model.{k}" for k in language_model._tied_weights_keys] + self.language_model = language_model + + self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1 + + self.embedding_dim = self.config.embedding_dim + self.custom_text_proj = nn.Linear(self.config.text_config.hidden_size, self.embedding_dim) + + if self.language_model._tied_weights_keys is not None: + self._tied_weights_keys = [f"model.language_model.{k}" for k in self.language_model._tied_weights_keys] + self.post_init() + + def get_input_embeddings(self): + return self.language_model.get_input_embeddings() + + def set_input_embeddings(self, value): + self.language_model.set_input_embeddings(value) + + def get_output_embeddings(self): + return self.language_model.get_output_embeddings() + + def set_output_embeddings(self, new_embeddings): + self.language_model.set_output_embeddings(new_embeddings) + + def set_decoder(self, decoder): + self.language_model.set_decoder(decoder) + + def get_decoder(self): + return self.language_model.get_decoder() + + def tie_weights(self): + return self.language_model.tie_weights() + + def _update_causal_mask( + self, attention_mask, token_type_ids, inputs_embeds, past_key_values, cache_position, is_training: bool = False + ): + using_static_cache = isinstance(past_key_values, StaticCache) + dtype = inputs_embeds.dtype + min_dtype = torch.finfo(dtype).min + sequence_length = inputs_embeds.shape[1] + if using_static_cache: + target_length = past_key_values.get_max_length() + else: + target_length = ( + attention_mask.shape[-1] + if isinstance(attention_mask, torch.Tensor) + else cache_position[0] + sequence_length + 1 + ) + + if attention_mask is not None and attention_mask.dim() == 4: + # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing. + return attention_mask + + causal_mask = torch.full( + (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device + ) + # Causal diagonal mask only if training, otherwise attend to the whole prefix. 
Training-specific attn for prefix is handled below + if sequence_length != 1: + if is_training: + causal_mask = torch.triu(causal_mask, diagonal=1) + else: + causal_mask[:, :sequence_length] = 0.0 + + causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1) + causal_mask = causal_mask[None, None, :, :].expand(inputs_embeds.shape[0], 1, -1, -1) + if attention_mask is not None: + causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit + mask_length = attention_mask.shape[-1] + padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(causal_mask.device) + padding_mask = padding_mask == 0 + causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill( + padding_mask, min_dtype + ) + # we are training thus we need to create a full mask on the image + prefix but causal on suffix + if is_training: + causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill( + token_type_ids[:, None, None, :].to(causal_mask.device) == 0, 0 + ) + return causal_mask + + @add_start_docstrings_to_model_forward(NEW_TASK_MODEL_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=NewTaskModelCausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: torch.LongTensor = None, + pixel_values: torch.FloatTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Union[List[torch.FloatTensor], Cache]] = None, + token_type_ids: Optional[torch.LongTensor] = None, + cache_position: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + num_logits_to_keep: int = 0, + ) -> Union[Tuple, NewTaskModelCausalLMOutputWithPast]: + r""" + Args: + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`. + + num_logits_to_keep (`int`, *optional*): + Calculate logits for the last `num_logits_to_keep` tokens. If `0`, calculate logits for all + `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that + token can save memory, which becomes pretty significant for long sequences or large vocabulary size. + + Returns: + + Example: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, NewTaskModelForNewTask + + >>> model = NewTaskModelForNewTask.from_pretrained("google/NewTaskModel-test-224px-hf") + >>> processor = AutoProcessor.from_pretrained("google/NewTaskModel-test-224px-hf") + + >>> prompt = "answer en Where is the cow standing?" 
+ >>> url = "https://huggingface.co/gv-hf/NewTaskModel-test-224px-hf/resolve/main/cow_beach_1.png" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(images=image, text=prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(**inputs, max_length=30) + >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "answer en Where is the cow standing?\nbeach" + ``` + Returns: + """ + vlm_outputs = super().forward( + input_ids=input_ids, + pixel_values=pixel_values, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + token_type_ids=token_type_ids, + cache_position=cache_position, + inputs_embeds=inputs_embeds, + labels=labels, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=True, + return_dict=True, + num_logits_to_keep=num_logits_to_keep, + ) + last_hidden_states = vlm_outputs.hidden_states[-1] # (batch_size, sequence_length, hidden_size) + proj = self.custom_text_proj(last_hidden_states) # (batch_size, sequence_length, dim) + + # L2 normalization + embeddings = proj / proj.norm(dim=-1, keepdim=True) # (batch_size, sequence_length, dim) + + embeddings = embeddings * attention_mask.unsqueeze(-1) # (batch_size, sequence_length, dim) + + return (embeddings,) + vlm_outputs + + def prepare_inputs_for_generation( + self, + input_ids, + past_key_values=None, + inputs_embeds=None, + cache_position=None, + position_ids=None, + pixel_values=None, + attention_mask=None, + token_type_ids=None, + use_cache=True, + num_logits_to_keep=None, + **kwargs, + ): + model_inputs = self.language_model.prepare_inputs_for_generation( + input_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + position_ids=position_ids, + cache_position=cache_position, + use_cache=use_cache, + num_logits_to_keep=num_logits_to_keep, + **kwargs, + ) + + if isinstance(past_key_values, StaticCache) and attention_mask.ndim == 2: + if model_inputs["inputs_embeds"] is not None: + batch_size, sequence_length, _ = model_inputs["inputs_embeds"].shape + device = model_inputs["inputs_embeds"].device + else: + batch_size, sequence_length = model_inputs["input_ids"].shape + device = model_inputs["input_ids"].device + + dtype = self.get_output_embeddings().weight.dtype + min_dtype = torch.finfo(dtype).min + + model_inputs["attention_mask"] = _prepare_4d_causal_attention_mask_with_cache_position( + attention_mask, + sequence_length=sequence_length, + target_length=past_key_values.get_max_length(), + dtype=dtype, + device=device, + min_dtype=min_dtype, + cache_position=cache_position, + batch_size=batch_size, + ) + + model_inputs["token_type_ids"] = token_type_ids + + # position_ids in NewTaskModel are 1-indexed + if model_inputs.get("position_ids") is not None: + model_inputs["position_ids"] += 1 + + # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore + # Otherwise we need pixel values to be passed to model. 
NOTE: use_cache=False needs pixel_values always + if cache_position[0] == 0: + model_inputs["pixel_values"] = pixel_values + + return model_inputs + + def resize_token_embeddings( + self, + new_num_tokens: Optional[int] = None, + pad_to_multiple_of=None, + ) -> nn.Embedding: + model_embeds = self.language_model.resize_token_embeddings(new_num_tokens, pad_to_multiple_of) + + # Update vocab size + self.config.text_config.vocab_size = model_embeds.num_embeddings + self.config.vocab_size = model_embeds.num_embeddings + self.vocab_size = model_embeds.num_embeddings + + return model_embeds diff --git a/examples/modular-transformers/modular_new_task_model.py b/examples/modular-transformers/modular_new_task_model.py new file mode 100644 index 00000000000000..877fba00a50ff4 --- /dev/null +++ b/examples/modular-transformers/modular_new_task_model.py @@ -0,0 +1,84 @@ +from typing import ClassVar, List, Optional, Union + +import torch +import torch.utils.checkpoint +from torch import nn + +from transformers.models.paligemma.modeling_paligemma import PaliGemmaForConditionalGeneration + +from ...cache_utils import Cache + + +class NewTaskModelForNewTask(PaliGemmaForConditionalGeneration): + main_input_name: ClassVar[str] = "doc_input_ids" # transformers-related + + def __init__(self, config): + super().__init__(config=config) + + self.embedding_dim = self.config.embedding_dim + self.custom_text_proj = nn.Linear(self.config.text_config.hidden_size, self.embedding_dim) + + if self.language_model._tied_weights_keys is not None: + self._tied_weights_keys = [f"model.language_model.{k}" for k in self.language_model._tied_weights_keys] + + self.post_init() + + def forward( + self, + input_ids: torch.LongTensor = None, + pixel_values: torch.FloatTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Union[List[torch.FloatTensor], Cache]] = None, + token_type_ids: Optional[torch.LongTensor] = None, + cache_position: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + num_logits_to_keep: int = 0, + ): + r""" + Returns: + """ + vlm_outputs = super().forward( + input_ids=input_ids, + pixel_values=pixel_values, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + token_type_ids=token_type_ids, + cache_position=cache_position, + inputs_embeds=inputs_embeds, + labels=labels, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=True, + return_dict=True, + num_logits_to_keep=num_logits_to_keep, + ) + last_hidden_states = vlm_outputs.hidden_states[-1] # (batch_size, sequence_length, hidden_size) + proj = self.custom_text_proj(last_hidden_states) # (batch_size, sequence_length, dim) + + # L2 normalization + embeddings = proj / proj.norm(dim=-1, keepdim=True) # (batch_size, sequence_length, dim) + + embeddings = embeddings * attention_mask.unsqueeze(-1) # (batch_size, sequence_length, dim) + + return (embeddings,) + vlm_outputs + + def resize_token_embeddings( + self, + new_num_tokens: Optional[int] = None, + pad_to_multiple_of=None, + ) -> nn.Embedding: + model_embeds = self.language_model.resize_token_embeddings(new_num_tokens, pad_to_multiple_of) + + # Update vocab size + self.config.text_config.vocab_size = 
model_embeds.num_embeddings + self.config.vocab_size = model_embeds.num_embeddings + self.vocab_size = model_embeds.num_embeddings + + return model_embeds diff --git a/utils/modular_model_converter.py b/utils/modular_model_converter.py index d2f0a99fc93885..c107a483186231 100644 --- a/utils/modular_model_converter.py +++ b/utils/modular_model_converter.py @@ -204,7 +204,15 @@ class ReplaceNameTransformer(m.MatcherDecoratableTransformer): - LLaMa -> MyNewModel abd MyNewModel -> Llama """ - def __init__(self, old_name, new_name, given_old_name=None, given_new_name=None): + def __init__( + self, + old_name, + new_name, + given_old_name=None, + given_new_name=None, + old_class_name: str = None, + new_class_name: str = None, + ): super().__init__() self.old_name = old_name self.new_name = new_name @@ -220,6 +228,18 @@ def __init__(self, old_name, new_name, given_old_name=None, given_new_name=None) } if given_old_name is not None and given_new_name is not None and given_old_name not in self.patterns: self.patterns[given_old_name] = given_new_name + if self.old_name in CONFIG_MAPPING_NAMES: + self.default_old_name = CONFIG_MAPPING_NAMES[self.old_name].replace("Config", "") + if self.default_old_name.isupper(): + self.default_old_name = self.default_old_name.capitalize() + if new_class_name is not None and old_class_name is not None and old_class_name not in self.patterns: + # In last recourse, when the suffix of the new class is not the same as the old class, + # and if the old and new classes start with the default name, we keep the default class name + # and replace the old suffix with the new one. + # Useful when we have a class like `ColPaliForRetrieval` inheriting from `PaliGemmaForConditionalGeneration` + # where a model extends another model, but is used for a different task. 
+ if old_class_name.startswith(self.default_old_name) and new_class_name.startswith(self.default_name): + self.patterns[old_class_name[len(self.default_old_name) :]] = new_class_name[len(self.default_name) :] def preserve_case_replace(self, text): # Create a regex pattern to match all variations @@ -235,7 +255,9 @@ def replace(match): def convert_to_camelcase(self, text): # Regex pattern to match consecutive uppercase letters and lowercase the first set - result = re.sub(r"^[A-Z]+(?=[A-Z][a-z])", lambda m: m.group(0).capitalize(), text, count=1) + result = re.sub( + rf"^({self.old_name})(?=[a-z]+)", lambda m: self.default_old_name, text, flags=re.IGNORECASE, count=1 + ) return result @m.leave(m.Name() | m.SimpleString() | m.Comment()) @@ -249,9 +271,24 @@ def leave_ClassDef(self, original_node, updated_node): return updated_node.with_changes(name=cst.Name(self.convert_to_camelcase(updated_node.name.value))) -def find_classes_in_file(module: cst.Module, old_id="llama", new_id="gemma", given_old_name=None, given_new_name=None): +def find_classes_in_file( + module: cst.Module, + old_id="llama", + new_id="gemma", + given_old_name=None, + given_new_name=None, + old_class_name=None, + new_class_name=None, +): """Helper function to rename and then parse a source file using the ClassFinder""" - transformer = ReplaceNameTransformer(old_id, new_id, given_old_name, given_new_name) + transformer = ReplaceNameTransformer( + old_id, + new_id, + given_old_name=given_old_name, + given_new_name=given_new_name, + old_class_name=old_class_name, + new_class_name=new_class_name, + ) new_module = module.visit(transformer) wrapper = MetadataWrapper(new_module) @@ -868,7 +905,7 @@ def leave_ClassDef(self, original_node, updated_node): dep: class_finder.class_start_line.get(dep, 1000) for dep in class_finder.class_dependency_mapping.get(class_name, []) } - if list_dependencies == []: + if len(list_dependencies) == 0: # so, maybe standard renaming did not work (the class name is different) # we try with another renaming pattern potential_given_name = get_new_part(class_name, super_class) @@ -884,6 +921,30 @@ def leave_ClassDef(self, original_node, updated_node): dep: class_finder.class_start_line.get(dep, 1000) for dep in class_finder.class_dependency_mapping.get(class_name, []) } + if len(list_dependencies) == 0: + # last recourse, if the suffix of the new class is different from the one of the super class + # e.g. MyNewClassForSegmentation extends MyOldClassForObjectDetection + # we try with another renaming pattern + class_finder = find_classes_in_file( + self.transformers_imports[super_file_name], + model_name, + self.model_name, + self.given_old_name, + self.given_new_name, + super_class, + class_name, + ) + visited_module[super_file_name] = class_finder + list_dependencies = { + dep: class_finder.class_start_line.get(dep, 1000) + for dep in class_finder.class_dependency_mapping.get(class_name, []) + } + if len(list_dependencies) == 0: + raise ValueError( + f"We were unable to find dependencies for {class_name} (based on inheriting from {super_class})" + f" Here are all the global dependencies that we found in you modular file: {list(class_finder.class_dependency_mapping.keys())}." 
+ f" This usually means that the name of `{class_name}` does not match the pattern of `{super_class}`" + ) list_dependencies = sorted(list_dependencies.items(), key=lambda x: x[1], reverse=True) start_insert_idx = self.global_scope_index @@ -917,12 +978,6 @@ def leave_ClassDef(self, original_node, updated_node): if len(list_dependencies) > 0: updated_node = replace_call_to_super(class_finder, updated_node, class_name, all_bases) - else: - raise ValueError( - f"We were unable to find dependencies for {class_name} (based on inheriting from {super_class})" - f" Here are all the global dependencies that we found in you modular file: {list(class_finder.class_dependency_mapping.keys())}." - f" This usually means that the name of `{class_name}` does not match the pattern of `{super_class}`" - ) # Now, if a class was defined without parents, we look for the name match_pattern = "|".join(TYPE_TO_FILE_TYPE.keys()) From d00f1ca860f19f4c0962882e56044bb6ef7b5626 Mon Sep 17 00:00:00 2001 From: Marc Sun <57196510+SunMarc@users.noreply.github.com> Date: Tue, 15 Oct 2024 15:42:07 +0200 Subject: [PATCH 022/385] Fix optuna ddp hp search (#34073) --- src/transformers/integrations/integration_utils.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/transformers/integrations/integration_utils.py b/src/transformers/integrations/integration_utils.py index 5f0ac55d0eb5fd..4f7cf3632fe549 100755 --- a/src/transformers/integrations/integration_utils.py +++ b/src/transformers/integrations/integration_utils.py @@ -241,7 +241,8 @@ def _objective(trial, checkpoint_dir=None): if trainer.args.parallel_mode != ParallelMode.DISTRIBUTED: raise RuntimeError("only support DDP optuna HPO for ParallelMode.DISTRIBUTED currently.") trainer._hp_search_setup(trial) - torch.distributed.broadcast_object_list(pickle.dumps(trainer.args), src=0) + args_main_rank_list = [pickle.dumps(trainer.args)] + torch.distributed.broadcast_object_list(args_main_rank_list, src=0) trainer.train(resume_from_checkpoint=checkpoint) else: trainer.train(resume_from_checkpoint=checkpoint, trial=trial) @@ -267,11 +268,11 @@ def _objective(trial, checkpoint_dir=None): else: for i in range(n_trials): trainer.objective = None - args_main_rank = list(pickle.dumps(trainer.args)) + args_main_rank_list = [None] if trainer.args.parallel_mode != ParallelMode.DISTRIBUTED: raise RuntimeError("only support DDP optuna HPO for ParallelMode.DISTRIBUTED currently.") - torch.distributed.broadcast_object_list(args_main_rank, src=0) - args = pickle.loads(bytes(args_main_rank)) + torch.distributed.broadcast_object_list(args_main_rank_list, src=0) + args = pickle.loads(bytes(args_main_rank_list[0])) for key, value in asdict(args).items(): if key != "local_rank": setattr(trainer.args, key, value) From 0f49deacbff3e57cde45222842c0db6375e4fa43 Mon Sep 17 00:00:00 2001 From: laurentd-lunit <103402801+laurentd-lunit@users.noreply.github.com> Date: Tue, 15 Oct 2024 23:19:18 +0900 Subject: [PATCH 023/385] [feat] LlavaNext add feature size check to avoid CUDA Runtime Error (#33608) * [feat] add feature size check to avoid CUDA Runtime Error * [minor] add error handling to all llava models * [minor] avoid nested if else * [minor] add error message to Qwen2-vl and chameleon * [fix] token dimension for check * [minor] add feature dim check for videos too * [fix] dimension check * [fix] test reference values --------- Co-authored-by: Raushan Turganbay --- .../models/chameleon/modeling_chameleon.py | 6 ++++++ src/transformers/models/llava/modeling_llava.py | 6 
++++++ .../models/llava_next/modeling_llava_next.py | 6 ++++++ .../llava_next_video/modeling_llava_next_video.py | 12 ++++++++++++ .../llava_next_video/modular_llava_next_video.py | 12 ++++++++++++ .../llava_onevision/modeling_llava_onevision.py | 14 ++++++++++++-- .../models/qwen2_vl/modeling_qwen2_vl.py | 12 ++++++++++++ .../models/video_llava/modeling_video_llava.py | 13 ++++++++++++- .../models/vipllava/modeling_vipllava.py | 6 ++++++ tests/models/llava/test_modeling_llava.py | 4 ++-- tests/models/vipllava/test_modeling_vipllava.py | 4 ++-- 11 files changed, 88 insertions(+), 7 deletions(-) diff --git a/src/transformers/models/chameleon/modeling_chameleon.py b/src/transformers/models/chameleon/modeling_chameleon.py index fd76c0b1152267..20dbfc317e133d 100644 --- a/src/transformers/models/chameleon/modeling_chameleon.py +++ b/src/transformers/models/chameleon/modeling_chameleon.py @@ -1287,6 +1287,12 @@ def forward( if pixel_values is not None: image_tokens = self.get_image_tokens(pixel_values) + n_image_tokens_in_text = (input_ids == self.vocabulary_mapping.image_token_id).sum().item() + n_image_features = image_tokens.shape[0] + if n_image_tokens_in_text != n_image_features: + raise ValueError( + f"Image features and image tokens do not match: tokens: {n_image_tokens_in_text}, features {n_image_features}" + ) special_image_mask = input_ids == self.vocabulary_mapping.image_token_id image_tokens = image_tokens.to(input_ids.device, input_ids.dtype) input_ids = input_ids.masked_scatter(special_image_mask, image_tokens) diff --git a/src/transformers/models/llava/modeling_llava.py b/src/transformers/models/llava/modeling_llava.py index e793ca61c750d7..411b96f5c57a50 100644 --- a/src/transformers/models/llava/modeling_llava.py +++ b/src/transformers/models/llava/modeling_llava.py @@ -518,6 +518,12 @@ def forward( # TODO: @raushan retain only the new behavior after v4.47 else: + n_image_tokens = (input_ids == self.config.image_token_index).sum(dim=-1)[0].item() + n_image_features = image_features.shape[1] + if n_image_tokens != n_image_features: + raise ValueError( + f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}" + ) special_image_mask = ( (input_ids == self.config.image_token_index) .unsqueeze(-1) diff --git a/src/transformers/models/llava_next/modeling_llava_next.py b/src/transformers/models/llava_next/modeling_llava_next.py index 705821c2b713e8..75dfcf5393ea15 100644 --- a/src/transformers/models/llava_next/modeling_llava_next.py +++ b/src/transformers/models/llava_next/modeling_llava_next.py @@ -895,6 +895,12 @@ def forward( # TODO: @raushan retain only the new behavior after v4.47 else: + n_image_tokens = (input_ids == self.config.image_token_index).sum().item() + n_image_features = image_features.shape[0] + if n_image_tokens != n_image_features: + raise ValueError( + f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}" + ) special_image_mask = ( (input_ids == self.config.image_token_index) .unsqueeze(-1) diff --git a/src/transformers/models/llava_next_video/modeling_llava_next_video.py b/src/transformers/models/llava_next_video/modeling_llava_next_video.py index 7df4cf20372bb7..30257b84397814 100644 --- a/src/transformers/models/llava_next_video/modeling_llava_next_video.py +++ b/src/transformers/models/llava_next_video/modeling_llava_next_video.py @@ -967,6 +967,12 @@ def forward( # TODO: @raushan retain only the new behavior after v4.47 else: if image_features is not None: + 
n_image_tokens = (input_ids == self.config.image_token_index).sum().item() + n_image_features = image_features.shape[0] + if n_image_tokens != n_image_features: + raise ValueError( + f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}" + ) special_image_mask = ( (input_ids == self.config.image_token_index) .unsqueeze(-1) @@ -976,6 +982,12 @@ def forward( image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features) if video_features is not None: + n_video_tokens = (input_ids == self.config.video_token_index).sum().item() + n_video_features = video_features.shape[0] + if n_video_tokens != n_video_features: + raise ValueError( + f"Video features and video tokens do not match: tokens: {n_video_tokens}, features {n_video_features}" + ) special_image_mask = ( (input_ids == self.config.video_token_index) .unsqueeze(-1) diff --git a/src/transformers/models/llava_next_video/modular_llava_next_video.py b/src/transformers/models/llava_next_video/modular_llava_next_video.py index 4b6be407dcab81..e7de66de444af7 100644 --- a/src/transformers/models/llava_next_video/modular_llava_next_video.py +++ b/src/transformers/models/llava_next_video/modular_llava_next_video.py @@ -482,6 +482,12 @@ def forward( # TODO: @raushan retain only the new behavior after v4.47 else: if image_features is not None: + n_image_tokens = (input_ids == self.config.image_token_index).sum().item() + n_image_features = image_features.shape[0] + if n_image_tokens != n_image_features: + raise ValueError( + f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}" + ) special_image_mask = ( (input_ids == self.config.image_token_index) .unsqueeze(-1) @@ -491,6 +497,12 @@ def forward( image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features) if video_features is not None: + n_video_tokens = (input_ids == self.config.video_token_index).sum().item() + n_video_features = video_features.shape[0] + if n_video_tokens != n_video_features: + raise ValueError( + f"Video features and video tokens do not match: tokens: {n_video_tokens}, features {n_video_features}" + ) special_image_mask = ( (input_ids == self.config.video_token_index) .unsqueeze(-1) diff --git a/src/transformers/models/llava_onevision/modeling_llava_onevision.py b/src/transformers/models/llava_onevision/modeling_llava_onevision.py index f65c0fe7cfb3e5..3eefb517b16d9f 100644 --- a/src/transformers/models/llava_onevision/modeling_llava_onevision.py +++ b/src/transformers/models/llava_onevision/modeling_llava_onevision.py @@ -619,7 +619,12 @@ def forward( image_newline=self.image_newline, vision_aspect_ratio=vision_aspect_ratio, ) - + n_image_tokens = (input_ids == self.config.image_token_index).sum().item() + n_image_features = image_features.shape[0] + if n_image_tokens != n_image_features: + raise ValueError( + f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}" + ) special_image_mask = ( (input_ids == self.config.image_token_index) .unsqueeze(-1) @@ -647,7 +652,12 @@ def forward( image_newline = self.image_newline[None, None, :].repeat(batch_size, 1, 1).to(video_features.device) video_features = torch.cat((video_features, image_newline), dim=1) video_features = video_features.flatten(0, 1) - + n_video_tokens = (input_ids == 
self.config.video_token_index).sum().item() + n_video_features = video_features.shape[0] + if n_video_tokens != n_video_features: + raise ValueError( + f"Video features and video tokens do not match: tokens: {n_video_tokens}, features {n_video_features}" + ) special_video_mask = ( (input_ids == self.config.video_token_index) .unsqueeze(-1) diff --git a/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py b/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py index 283e38d3a7d508..e014a6da6bb3bc 100644 --- a/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py +++ b/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py @@ -1710,6 +1710,12 @@ def forward( if pixel_values is not None: pixel_values = pixel_values.type(self.visual.get_dtype()) image_embeds = self.visual(pixel_values, grid_thw=image_grid_thw) + n_image_tokens = (input_ids == self.config.image_token_id).sum().item() + n_image_features = image_embeds.shape[0] + if n_image_tokens != n_image_features: + raise ValueError( + f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}" + ) image_mask = ( (input_ids == self.config.image_token_id) .unsqueeze(-1) @@ -1722,6 +1728,12 @@ def forward( if pixel_values_videos is not None: pixel_values_videos = pixel_values_videos.type(self.visual.get_dtype()) video_embeds = self.visual(pixel_values_videos, grid_thw=video_grid_thw) + n_video_tokens = (input_ids == self.config.video_token_id).sum().item() + n_video_features = video_embeds.shape[0] + if n_video_tokens != n_video_features: + raise ValueError( + f"Video features and video tokens do not match: tokens: {n_video_tokens}, features {n_video_features}" + ) video_mask = ( (input_ids == self.config.video_token_id) .unsqueeze(-1) diff --git a/src/transformers/models/video_llava/modeling_video_llava.py b/src/transformers/models/video_llava/modeling_video_llava.py index 5711433c368d5e..20fa0166b80c9c 100644 --- a/src/transformers/models/video_llava/modeling_video_llava.py +++ b/src/transformers/models/video_llava/modeling_video_llava.py @@ -618,6 +618,12 @@ def forward( # TODO: @raushan retain only the new behavior after v4.47 else: if image_outputs is not None: + n_image_tokens = (input_ids == self.config.image_token_index).sum(dim=-1)[0].item() + n_image_features = image_features.shape[1] + if n_image_tokens != n_image_features: + raise ValueError( + f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}" + ) special_image_mask = ( (input_ids == self.config.image_token_index) .unsqueeze(-1) @@ -626,8 +632,13 @@ def forward( ) image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features) - if video_outputs is not None: + n_video_tokens = (input_ids == self.config.video_token_index).sum(dim=-1)[0].item() + n_video_features = video_features.shape[1] + if n_video_tokens != n_video_features: + raise ValueError( + f"Video features and video tokens do not match: tokens: {n_video_tokens}, features {n_video_features}" + ) special_image_mask = ( (input_ids == self.config.video_token_index) .unsqueeze(-1) diff --git a/src/transformers/models/vipllava/modeling_vipllava.py b/src/transformers/models/vipllava/modeling_vipllava.py index 26d92b9ac3dca4..76348228476757 100644 --- a/src/transformers/models/vipllava/modeling_vipllava.py +++ b/src/transformers/models/vipllava/modeling_vipllava.py @@ -511,6 +511,12 @@ def forward( # TODO: @raushan retain only the new behavior 
after v4.47 else: + n_image_tokens = (input_ids == self.config.image_token_index).sum(dim=-1)[0].item() + n_image_features = image_features.shape[1] + if n_image_tokens != n_image_features: + raise ValueError( + f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}" + ) special_image_mask = ( (input_ids == self.config.image_token_index) .unsqueeze(-1) diff --git a/tests/models/llava/test_modeling_llava.py b/tests/models/llava/test_modeling_llava.py index e183c38a59f7d7..07415900bb93db 100644 --- a/tests/models/llava/test_modeling_llava.py +++ b/tests/models/llava/test_modeling_llava.py @@ -118,8 +118,8 @@ def __init__( self.batch_size = 3 self.num_channels = 3 self.image_size = 336 - self.encoder_seq_length = 231 - self.num_image_tokens = 224 + self.encoder_seq_length = 232 + self.num_image_tokens = 225 self.seq_length = seq_length + self.num_image_tokens def get_config(self): diff --git a/tests/models/vipllava/test_modeling_vipllava.py b/tests/models/vipllava/test_modeling_vipllava.py index b12f2c30c774a0..862e144ecdd7d8 100644 --- a/tests/models/vipllava/test_modeling_vipllava.py +++ b/tests/models/vipllava/test_modeling_vipllava.py @@ -111,8 +111,8 @@ def __init__( self.batch_size = 3 self.num_channels = 3 self.image_size = 336 - self.encoder_seq_length = 231 - self.num_image_tokens = 224 + self.encoder_seq_length = 232 + self.num_image_tokens = 225 self.seq_length = seq_length + self.num_image_tokens def get_config(self): From 67acb0b123f48d51ec8cfef14b45827b8b53976c Mon Sep 17 00:00:00 2001 From: Jiwook Han <33192762+mreraser@users.noreply.github.com> Date: Wed, 16 Oct 2024 02:31:44 +0900 Subject: [PATCH 024/385] =?UTF-8?q?=F0=9F=8C=90=20[i18n-KO]=20Translated?= =?UTF-8?q?=20`vivit.md`=20to=20Korean=20(#33935)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * docs: ko: model_doc/vivit.md * feat: nmt draft * fix: manual edits * fix: manual edits --- docs/source/ko/_toctree.yml | 5 ++++ docs/source/ko/model_doc/vivit.md | 42 +++++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+) create mode 100644 docs/source/ko/model_doc/vivit.md diff --git a/docs/source/ko/_toctree.yml b/docs/source/ko/_toctree.yml index 43ff0017c8f842..97a8215059e45b 100644 --- a/docs/source/ko/_toctree.yml +++ b/docs/source/ko/_toctree.yml @@ -671,6 +671,11 @@ - local: in_translation title: (번역중) XLSR-Wav2Vec2 title: (번역중) 오디오 모델 + - isExpanded: false + sections: + - local: model_doc/vivit + title: ViViT + title: (번역중) 비디오 모델 - isExpanded: false sections: - local: in_translation diff --git a/docs/source/ko/model_doc/vivit.md b/docs/source/ko/model_doc/vivit.md new file mode 100644 index 00000000000000..c9eee17cb20afc --- /dev/null +++ b/docs/source/ko/model_doc/vivit.md @@ -0,0 +1,42 @@ + + +# Video Vision Transformer (ViViT) [[video-vision-transformer-vivit]] + +## 개요 [[overview]] + +Vivit 모델은 Anurag Arnab, Mostafa Dehghani, Georg Heigold, Chen Sun, Mario Lučić, Cordelia Schmid가 제안한 논문 [ViViT: A Video Vision Transformer](https://arxiv.org/abs/2103.15691)에서 소개되었습니다. 이 논문은 비디오 이해를 위한 pure-transformer 기반의 모델 집합 중에서 최초로 성공한 모델 중 하나를 소개합니다. + +논문의 초록은 다음과 같습니다: + +*우리는 이미지 분류에서 최근 성공을 거둔 순수 트랜스포머 기반 모델을 바탕으로 비디오 분류를 위한 모델을 제안합니다. 본 모델은 입력 비디오로부터 시공간 토큰을 추출한 후, 이를 일련의 트랜스포머 레이어로 인코딩합니다. 비디오에서 발생하는 긴 토큰 시퀀스를 처리하기 위해, 입력의 공간 및 시간 차원을 분리하는 여러 효율적인 모델 변형을 제안합니다. 트랜스포머 기반 모델은 대규모 학습 데이터셋에서만 효과적이라는 것이 일반적이지만, 우리는 학습 중 모델을 효과적으로 정규화하고, 사전 학습된 이미지 모델을 활용함으로써 상대적으로 작은 데이터셋에서도 학습할 수 있는 방법을 보여줍니다. 
또한, 철저한 소거(ablation) 연구를 수행하고 Kinetics 400 및 600, Epic Kitchens, Something-Something v2, Moments in Time을 포함한 여러 비디오 분류 벤치마크에서 최첨단 성과를 달성하여, 기존의 3D 합성곱 신경망 기반 방법들을 능가합니다.* + +이 모델은 [jegormeister](https://huggingface.co/jegormeister)가 기여하였습니다. 원본 코드(JAX로 작성됨)는 [여기](https://github.com/google-research/scenic/tree/main/scenic/projects/vivit)에서 확인할 수 있습니다. + +## VivitConfig [[transformers.VivitConfig]] + +[[autodoc]] VivitConfig + +## VivitImageProcessor [[transformers.VivitImageProcessor]] + +[[autodoc]] VivitImageProcessor + - preprocess + +## VivitModel [[transformers.VivitModel]] + +[[autodoc]] VivitModel + - forward + +## VivitForVideoClassification [[transformers.VivitForVideoClassification]] + +[[autodoc]] transformers.VivitForVideoClassification + - forward From 8c33cf4eeca38bf8ec67bc2d0b50f818895a067f Mon Sep 17 00:00:00 2001 From: Yijun Lee <119404328+yijun-lee@users.noreply.github.com> Date: Wed, 16 Oct 2024 03:20:46 +0900 Subject: [PATCH 025/385] =?UTF-8?q?=F0=9F=8C=90=20[i18n-KO]=20Translated?= =?UTF-8?q?=20`gemma2.md`=20to=20Korean=20(#33937)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * docs: ko: gemma2.md * feat: nmt draft * fix: manual edits * fix: resolve suggestions --- docs/source/ko/_toctree.yml | 2 + docs/source/ko/model_doc/gemma2.md | 63 ++++++++++++++++++++++++++++++ 2 files changed, 65 insertions(+) create mode 100644 docs/source/ko/model_doc/gemma2.md diff --git a/docs/source/ko/_toctree.yml b/docs/source/ko/_toctree.yml index 97a8215059e45b..8ba65da887e21f 100644 --- a/docs/source/ko/_toctree.yml +++ b/docs/source/ko/_toctree.yml @@ -400,6 +400,8 @@ title: (번역중) Funnel Transformer - local: model_doc/gemma title: Gemma + - local: model_doc/gemma2 + title: Gemma2 - local: model_doc/openai-gpt title: GPT - local: in_translation diff --git a/docs/source/ko/model_doc/gemma2.md b/docs/source/ko/model_doc/gemma2.md new file mode 100644 index 00000000000000..6bffec616c6e97 --- /dev/null +++ b/docs/source/ko/model_doc/gemma2.md @@ -0,0 +1,63 @@ + + + +# Gemma2 [[gemma2]] + +## 개요 [[overview]] + +Gemma2 모델은 Google의 Gemma2 팀이 작성한 [Gemma2: Open Models Based on Gemini Technology and Research](https://blog.google/technology/developers/google-gemma-2/)에서 제안되었습니다. +파라미터 크기가 각각 90억(9B)과 270억(27B)인 두 가지 Gemma2 모델이 출시되었습니다. + +블로그 게시물의 초록은 다음과 같습니다: + +*이제 우리는 전 세계의 연구자와 개발자들에게 Gemma 2를 공식적으로 출시합니다. 90억(9B)과 270억(27B) 파라미터 크기로 제공되는 Gemma 2는 1세대보다 더 높은 성능과 추론 효율성을 제공하며, 상당한 안전성 향상을 포함하고 있습니다. 사실 270억 규모의 모델은 크기가 두 배 이상인 모델과 비교해도 경쟁력 있는 대안을 제공하며, 이는 작년 12월까지만 해도 독점 모델에서만 가능했던 성능을 제공합니다.* + +팁: + +- 원본 체크포인트는 변환 스크립트 `src/transformers/models/Gemma2/convert_Gemma2_weights_to_hf.py`를 사용하여 변환할 수 있습니다. + + + +- Gemma2는 매 두 번째 레이어마다 슬라이딩 윈도우 어텐션을 사용하므로 [`~DynamicCache`] 또는 텐서의 튜플과 같은 일반적인 kv 캐싱에는 적합하지 않습니다. Gemma2의 forward 호출에서 캐싱을 활성화하려면 [`~HybridCache`] 인스턴스를 초기화하고 이를 `past_key_values`로 forward 호출에 전달해야 합니다. 또한 `past_key_values`에 이미 이전의 키와 값이 포함되어 있다면 `cache_position`도 준비해야 합니다. + + + +이 모델은 [Arthur Zucker](https://huggingface.co/ArthurZ), [Pedro Cuenca](https://huggingface.co/pcuenq), [Tom Arsen]()이 기여했습니다. 
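In code, the [`~HybridCache`] tip above looks roughly like the sketch below; the exact constructor keywords for the batch-size and length limits have shifted between `transformers` releases, so they are an assumption here rather than a fixed API reference:

```python
# Rough sketch of a manual forward call with HybridCache; keyword names for the
# batch-size/length limits may differ slightly depending on the installed version.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, HybridCache

tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b")
model = AutoModelForCausalLM.from_pretrained("google/gemma-2-9b", torch_dtype=torch.bfloat16)

inputs = tokenizer("The capital of France is", return_tensors="pt")
seq_len = inputs.input_ids.shape[1]

past_key_values = HybridCache(
    config=model.config,   # sliding-window vs. global attention layers are read from the config
    max_batch_size=1,      # assumed keyword; some releases call this `batch_size`
    max_cache_len=256,
    device=model.device,
    dtype=model.dtype,
)
cache_position = torch.arange(seq_len, device=model.device)

with torch.no_grad():
    outputs = model(
        **inputs,
        past_key_values=past_key_values,
        cache_position=cache_position,
        use_cache=True,
    )
```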
+ +## Gemma2Config [[transformers.Gemma2Config]] + +[[autodoc]] Gemma2Config + +## Gemma2Model [[transformers.Gemma2Model]] + +[[autodoc]] Gemma2Model + - forward + +## Gemma2ForCausalLM [[transformers.Gemma2ForCausalLM]] + +[[autodoc]] Gemma2ForCausalLM + - forward + +## Gemma2ForSequenceClassification [[transformers.Gemma2ForSequenceClassification]] + +[[autodoc]] Gemma2ForSequenceClassification + - forward + +## Gemma2ForTokenClassification [[transformers.Gemma2ForTokenClassification]] + +[[autodoc]] Gemma2ForTokenClassification + - forward \ No newline at end of file From 554ed5d1e0d8078c8bf2920c7c07ef8a14563716 Mon Sep 17 00:00:00 2001 From: Yijun Lee <119404328+yijun-lee@users.noreply.github.com> Date: Wed, 16 Oct 2024 03:21:05 +0900 Subject: [PATCH 026/385] =?UTF-8?q?=F0=9F=8C=90=20[i18n-KO]=20Translated?= =?UTF-8?q?=20`trainer=5Futils.md`=20to=20Korean=20(#33817)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * docs: ko: trainer_utils.md * feat: nmt draft * fix: manual edits * fix: resolve suggestions Co-authored-by: Woojun Jung <46880056+jungnerd@users.noreply.github.com> --------- Co-authored-by: Woojun Jung <46880056+jungnerd@users.noreply.github.com> --- docs/source/ko/_toctree.yml | 4 +- docs/source/ko/internal/trainer_utils.md | 49 ++++++++++++++++++++++++ 2 files changed, 51 insertions(+), 2 deletions(-) create mode 100644 docs/source/ko/internal/trainer_utils.md diff --git a/docs/source/ko/_toctree.yml b/docs/source/ko/_toctree.yml index 8ba65da887e21f..31e91e467465b1 100644 --- a/docs/source/ko/_toctree.yml +++ b/docs/source/ko/_toctree.yml @@ -790,8 +790,8 @@ title: 파이프라인을 위한 유틸리티 - local: internal/tokenization_utils title: 토크나이저를 위한 유틸리티 - - local: in_translation - title: (번역중) Utilities for Trainer + - local: internal/trainer_utils + title: Trainer를 위한 유틸리티 - local: internal/generation_utils title: 생성을 위한 유틸리티 - local: internal/image_processing_utils diff --git a/docs/source/ko/internal/trainer_utils.md b/docs/source/ko/internal/trainer_utils.md new file mode 100644 index 00000000000000..15fb457331811c --- /dev/null +++ b/docs/source/ko/internal/trainer_utils.md @@ -0,0 +1,49 @@ + + +# Trainer를 위한 유틸리티 (Utilities for Trainer) [[utilities-for-trainer]] + +이 페이지는 [`Trainer`]에서 사용되는 모든 유틸리티 함수들을 나열합니다. + +이 함수들 대부분은 라이브러리에 있는 Trainer 코드를 자세히 알아보고 싶을 때만 유용합니다. 
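Before the per-utility reference, a minimal sketch of how two of the entries below typically show up in user code (a sketch of typical usage, not an excerpt from the Trainer sources):

```python
# Minimal sketch: seeding for reproducibility and a compute_metrics hook that
# receives an EvalPrediction from Trainer.evaluate()/predict().
import numpy as np
from transformers import EvalPrediction, set_seed

set_seed(42)  # seeds Python, NumPy and torch in one call

def compute_metrics(eval_pred: EvalPrediction):
    # Trainer packs model outputs and labels into an EvalPrediction
    predictions, labels = eval_pred.predictions, eval_pred.label_ids
    accuracy = (np.argmax(predictions, axis=-1) == labels).mean()
    return {"accuracy": float(accuracy)}
```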
+ +## 유틸리티 (Utilities) [[transformers.EvalPrediction]] + +[[autodoc]] EvalPrediction + +[[autodoc]] IntervalStrategy + +[[autodoc]] enable_full_determinism + +[[autodoc]] set_seed + +[[autodoc]] torch_distributed_zero_first + +## 콜백 내부 (Callbacks internals) [[transformers.trainer_callback.CallbackHandler]] + +[[autodoc]] trainer_callback.CallbackHandler + +## 분산 평가 (Distributed Evaluation) [[transformers.trainer_pt_utils.DistributedTensorGatherer]] + +[[autodoc]] trainer_pt_utils.DistributedTensorGatherer + +## Trainer 인자 파서 (Trainer Argument Parser) [[transformers.HfArgumentParser]] + +[[autodoc]] HfArgumentParser + +## 디버그 유틸리티 (Debug Utilities) [[transformers.debug_utils.DebugUnderflowOverflow]] + +[[autodoc]] debug_utils.DebugUnderflowOverflow \ No newline at end of file From 9d6998c759fc08d6a87e81adf26e59d9c932396b Mon Sep 17 00:00:00 2001 From: "Chulhwa (Evan) Han" Date: Wed, 16 Oct 2024 03:21:22 +0900 Subject: [PATCH 027/385] =?UTF-8?q?=F0=9F=8C=90=20[i18n-KO]=20Translated?= =?UTF-8?q?=20`blip-2.md`=20to=20Korean=20(#33516)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * docs: ko: model_doc/blip-2 * feat: nmt draft * Apply suggestions from code review Co-authored-by: Jiwook Han <33192762+mreraser@users.noreply.github.com> * Update docs/source/ko/model_doc/blip-2.md Co-authored-by: Yijun Lee <119404328+yijun-lee@users.noreply.github.com> --------- Co-authored-by: Jiwook Han <33192762+mreraser@users.noreply.github.com> Co-authored-by: Yijun Lee <119404328+yijun-lee@users.noreply.github.com> --- docs/source/ko/_toctree.yml | 4 +- docs/source/ko/model_doc/blip-2.md | 98 ++++++++++++++++++++++++++++++ 2 files changed, 100 insertions(+), 2 deletions(-) create mode 100644 docs/source/ko/model_doc/blip-2.md diff --git a/docs/source/ko/_toctree.yml b/docs/source/ko/_toctree.yml index 31e91e467465b1..883db54c7a3cd7 100644 --- a/docs/source/ko/_toctree.yml +++ b/docs/source/ko/_toctree.yml @@ -684,10 +684,10 @@ title: (번역중) ALIGN - local: in_translation title: (번역중) AltCLIP + - local: model_doc/blip-2 + title: BLIP-2 - local: model_doc/blip title: BLIP - - local: in_translation - title: (번역중) BLIP-2 - local: in_translation title: (번역중) BridgeTower - local: model_doc/chameleon diff --git a/docs/source/ko/model_doc/blip-2.md b/docs/source/ko/model_doc/blip-2.md new file mode 100644 index 00000000000000..ae3da11d3a18c3 --- /dev/null +++ b/docs/source/ko/model_doc/blip-2.md @@ -0,0 +1,98 @@ + + +# BLIP-2[[blip-2]] + +## 개요[[overview]] +BLIP-2 모델은 Junnan Li, Dongxu Li, Silvio Savarese, Steven Hoi의 [BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models](https://arxiv.org/abs/2301.12597) 논문에서 제안되었습니다. BLIP-2는 동결된 사전 학습 이미지 인코더와 대규모 언어 모델(LLM)을 연결하는 12층의 경량 Transformer 인코더를 학습시켜, 여러 비전-언어 작업에서 SOTA(현재 최고의 성능)을 달성했습니다. 특히, BLIP-2는 800억 개의 파라미터를 가진 Flamingo 모델보다 제로샷 VQAv2에서 8.7% 더 높은 성능을 기록했으며, 학습 가능한 파라미터 수는 Flamingo보다 54배 적습니다. + +논문의 초록은 다음과 같습니다: + +*비전-언어 사전 학습의 비용은 대규모 모델의 엔드-투-엔드 학습으로 인해 점점 더 부담스러워지고 있습니다. 본 논문은 사전 학습된 이미지 인코더와 대규모 언어 모델을 활용하여 비전-언어 사전 학습을 부트스트래핑하는 일반적이고 효율적인 사전 학습 전략인 BLIP-2를 제안합니다. BLIP-2는 경량화된 Querying Transformer를 통해 모달리티 간의 차이를 연결하며, 두 단계로 사전 학습됩니다. 첫 번째 단계는 동결된 이미지 인코더로부터 비전-언어 표현 학습을 부트스트래핑하고, 두 번째 단계는 동결된 언어 모델로부터 비전-언어 생성 학습을 부트스트래핑합니다. BLIP-2는 기존 방법들에 비해 훨씬 적은 학습 가능한 파라미터로 다양한 비전-언어 작업에서 최첨단 성능을 달성합니다. 예를 들어, 우리 모델은 제로샷 VQAv2에서 Flamingo80B보다 8.7% 높은 성능을 기록하며, 학습 가능한 파라미터 수는 54배 적습니다. 우리는 또한 자연어 명령을 따를 수 있는 제로샷 이미지-텍스트 생성의 새로운 기능을 입증했습니다.* + + + + BLIP-2 구조. 원본 논문 에서 발췌. 
+ +이 모델은 [nielsr](https://huggingface.co/nielsr)가 기여했습니다. 원본 코드는 [여기](https://github.com/salesforce/LAVIS/tree/5ee63d688ba4cebff63acee04adaef2dee9af207)에서 확인할 수 있습니다. + +## 사용 팁[[usage-tips]] + +- BLIP-2는 이미지와 조건에 따라 텍스트 프롬프트를 입력받아 조건부 텍스트를 생성합니다. 추론 시 [`generate`] 메소드를 사용하는 것이 권장됩니다. +- [`Blip2Processor`]를 사용하여 모델에 이미지를 준비하고, 예측된 토큰 ID를 텍스트로 디코딩할 수 있습니다. + +## 자료[[resources]] + +BLIP-2를 시작하는 데 도움이 되는 공식 Hugging Face 및 커뮤니티(🌎 표시) 자료 목록입니다. + +- 이미지 캡셔닝, 시각 질문 응답(VQA), 채팅과 같은 대화형 작업을 위한 BLIP-2 데모 노트북은 [여기](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/BLIP-2)에서 찾을 수 있습니다. + +리소스를 제출하여 여기에 포함하고 싶다면 언제든지 풀 리퀘스트를 열어주세요! 리소스는 기존 리소스를 복제하지 않고 새로운 내용이어야 합니다. + +## Blip2Config[[transformers.Blip2Config]] + +[[autodoc]] Blip2Config + - from_vision_qformer_text_configs + +## Blip2VisionConfig[[transformers.Blip2VisionConfig]] + +[[autodoc]] Blip2VisionConfig + +## Blip2QFormerConfig[[transformers.Blip2QFormerConfig]] + +[[autodoc]] Blip2QFormerConfig + +## Blip2Processor[[transformers.Blip2Processor]] + +[[autodoc]] Blip2Processor + +## Blip2VisionModel[[transformers.Blip2VisionModel]] + +[[autodoc]] Blip2VisionModel + - forward + +## Blip2QFormerModel[[transformers.Blip2QFormerModel]] + +[[autodoc]] Blip2QFormerModel + - forward + +## Blip2Model[[transformers.Blip2Model]] + +[[autodoc]] Blip2Model + - forward + - get_text_features + - get_image_features + - get_qformer_features + +## Blip2ForConditionalGeneration[[transformers.Blip2ForConditionalGeneration]] + +[[autodoc]] Blip2ForConditionalGeneration + - forward + - generate + +## Blip2ForImageTextRetrieval[[transformers.Blip2ForImageTextRetrieval]] + +[[autodoc]] Blip2ForImageTextRetrieval + - forward + +## Blip2TextModelWithProjection[[transformers.Blip2TextModelWithProjection]] + +[[autodoc]] Blip2TextModelWithProjection + +## Blip2VisionModelWithProjection[[transformers.Blip2VisionModelWithProjection]] + +[[autodoc]] Blip2VisionModelWithProjection From d087165db08a2ebae6bca29ea014784197511ebd Mon Sep 17 00:00:00 2001 From: Raushan Turganbay Date: Wed, 16 Oct 2024 09:25:26 +0200 Subject: [PATCH 028/385] IDEFICS: support inputs embeds (#34043) * support embeds * use cache from config * style... 
* fix tests after rebase --- src/transformers/generation/utils.py | 8 ++--- .../models/idefics/modeling_idefics.py | 34 ++++++++++++------- tests/models/idefics/test_modeling_idefics.py | 6 ++++ .../models/idefics2/test_modeling_idefics2.py | 25 ++++++++++++++ .../models/idefics3/test_modeling_idefics3.py | 25 ++++++++++++++ tests/models/kosmos2/test_modeling_kosmos2.py | 6 ++++ tests/test_modeling_common.py | 14 ++++++-- 7 files changed, 100 insertions(+), 18 deletions(-) diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py index 83a489bb13f36c..f9ab6fce6cf2eb 100644 --- a/src/transformers/generation/utils.py +++ b/src/transformers/generation/utils.py @@ -857,7 +857,7 @@ def _get_logits_processor( self, unconditional_ids=negative_prompt_ids, unconditional_attention_mask=negative_prompt_attention_mask, - use_cache=model_kwargs["use_cache"], + use_cache=generation_config.use_cache, ) ) if generation_config.sequence_bias is not None: @@ -2004,10 +2004,7 @@ def generate( # decoder-only models with inputs_embeds forwarding must use caching (otherwise we can't detect whether we are # generating the first new token or not, and we only want to use the embeddings for the first new token) if not self.config.is_encoder_decoder and model_input_name == "inputs_embeds": - model_kwargs["use_cache"] = True generation_config.use_cache = True - else: - model_kwargs["use_cache"] = generation_config.use_cache if not kwargs_has_attention_mask and requires_attention_mask and accepts_attention_mask: model_kwargs["attention_mask"] = self._prepare_attention_mask_for_generation( @@ -2116,6 +2113,9 @@ def generate( generation_config=generation_config, stopping_criteria=stopping_criteria, tokenizer=tokenizer, **kwargs ) + # Set model_kwargs `use_cache` so we can use it later in forward runs + model_kwargs["use_cache"] = generation_config.use_cache + # 10. 
go into different generation modes if generation_mode == GenerationMode.ASSISTED_GENERATION: if generation_config.num_return_sequences > 1: diff --git a/src/transformers/models/idefics/modeling_idefics.py b/src/transformers/models/idefics/modeling_idefics.py index 81159ee1c0cd30..4dd5f36a93e166 100644 --- a/src/transformers/models/idefics/modeling_idefics.py +++ b/src/transformers/models/idefics/modeling_idefics.py @@ -1663,19 +1663,31 @@ def forward( def prepare_inputs_for_generation( self, input_ids, - past_key_values=None, attention_mask=None, position_ids=None, + inputs_embeds=None, + past_key_values=None, + cache_position=None, pixel_values=None, image_hidden_states=None, image_attention_mask=None, use_cache=None, - cache_position=None, **kwargs, ): + model_inputs = {} + if image_hidden_states is not None: + if self.config.use_resampler: + model_inputs["perceiver_embeddings"] = image_hidden_states + else: + model_inputs["image_encoder_embeddings"] = image_hidden_states + else: + model_inputs["pixel_values"] = pixel_values + # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens if past_key_values is not None: - if input_ids.shape[1] != cache_position.shape[0]: + if inputs_embeds is not None: + input_ids = input_ids[:, -cache_position.shape[0] :] + elif input_ids.shape[1] != cache_position.shape[0]: input_ids = input_ids[:, cache_position] if image_attention_mask is not None: image_attention_mask = image_attention_mask[:, -input_ids.shape[1] :] @@ -1690,19 +1702,17 @@ def prepare_inputs_for_generation( # This `clone` call is needed to avoid recapturing cuda graphs with `torch.compile`'s `mode="reduce-overhead`, as otherwise the input `position_ids` would have various stride during the decoding. Here, simply using `.contiguous()` is not sufficient as in the batch size = 1 case, `position_ids` is already contiguous but with varying stride which retriggers a capture. position_ids = position_ids.clone(memory_format=torch.contiguous_format) - model_inputs = {} - image_hidden_states = kwargs.pop("image_hidden_states", None) - if image_hidden_states is not None: - if self.config.use_resampler: - model_inputs["perceiver_embeddings"] = image_hidden_states - else: - model_inputs["image_encoder_embeddings"] = image_hidden_states + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and cache_position[0] == 0: + model_inputs.update({"inputs_embeds": inputs_embeds, "input_ids": None}) else: - model_inputs["pixel_values"] = pixel_values + # The clone here is for the same reason as for `position_ids`. 
+ model_inputs.update( + {"input_ids": input_ids.clone(memory_format=torch.contiguous_format), "inputs_embeds": None} + ) model_inputs.update( { - "input_ids": input_ids, "past_key_values": past_key_values, "use_cache": use_cache, "cache_position": cache_position, diff --git a/tests/models/idefics/test_modeling_idefics.py b/tests/models/idefics/test_modeling_idefics.py index 62b6ca22293b6f..250c47c3a7e8ce 100644 --- a/tests/models/idefics/test_modeling_idefics.py +++ b/tests/models/idefics/test_modeling_idefics.py @@ -772,6 +772,12 @@ def test_contrastive_generate_low_memory(self): def test_custom_4d_attention_mask(self): pass + @unittest.skip( + reason="IDEFICS has specific requirements for working with inputs embeds like passing also the ids and pixels" + ) + def test_generate_from_inputs_embeds_decoder_only(self): + pass + @unittest.skip(reason="IDEFICS cannot compile due to dynamic control flow when checking inputs") def test_generate_compile_fullgraph(self): pass diff --git a/tests/models/idefics2/test_modeling_idefics2.py b/tests/models/idefics2/test_modeling_idefics2.py index f87e87607c2a17..4071fcbb232805 100644 --- a/tests/models/idefics2/test_modeling_idefics2.py +++ b/tests/models/idefics2/test_modeling_idefics2.py @@ -539,6 +539,31 @@ def test_resize_embeddings_untied(self): # Check that the model can still do a forward pass successfully (every parameter should be resized) model(**self._prepare_for_class(inputs_dict, model_class)) + def test_inputs_embeds_matches_input_ids_with_generate(self): + # overwrite because IDEFICS needs ids and embeds at the input to be not None + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + for model_class in self.all_model_classes: + model = model_class(config) + model.to(torch_device) + model.eval() + + inputs = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class)) + pad_token_id = config.pad_token_id if config.pad_token_id is not None else 1 + + wte = model.get_input_embeddings() + + input_ids = inputs["input_ids"] + # some models infer position ids/attn mask differently when input ids + # by check if pad_token let's make sure no padding is in input ids + not_pad_token_id = pad_token_id + 1 if max(0, pad_token_id - 1) == 0 else pad_token_id - 1 + input_ids[input_ids == pad_token_id] = not_pad_token_id + del inputs["input_ids"] + inputs_embeds = wte(input_ids) + out_ids = model.generate(input_ids=input_ids, **inputs, max_new_tokens=2) + out_embeds = model.generate(input_ids=input_ids, inputs_embeds=inputs_embeds, **inputs, max_new_tokens=2) + + self.assertTrue(torch.allclose(out_embeds, out_ids)) + @require_torch class Idefics2ForConditionalGenerationIntegrationTest(unittest.TestCase): diff --git a/tests/models/idefics3/test_modeling_idefics3.py b/tests/models/idefics3/test_modeling_idefics3.py index 44e06b07c54752..f0366e7b539a50 100644 --- a/tests/models/idefics3/test_modeling_idefics3.py +++ b/tests/models/idefics3/test_modeling_idefics3.py @@ -526,6 +526,31 @@ def test_resize_embeddings_untied(self): # Check that the model can still do a forward pass successfully (every parameter should be resized) model(**self._prepare_for_class(inputs_dict, model_class)) + def test_inputs_embeds_matches_input_ids_with_generate(self): + # overwrite because IDEFICS needs ids and embeds at the input to be not None + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + for model_class in self.all_model_classes: + model = model_class(config) + model.to(torch_device) + model.eval() + + inputs 
= copy.deepcopy(self._prepare_for_class(inputs_dict, model_class)) + pad_token_id = config.pad_token_id if config.pad_token_id is not None else 1 + + wte = model.get_input_embeddings() + + input_ids = inputs["input_ids"] + # some models infer position ids/attn mask differently when input ids + # by check if pad_token let's make sure no padding is in input ids + not_pad_token_id = pad_token_id + 1 if max(0, pad_token_id - 1) == 0 else pad_token_id - 1 + input_ids[input_ids == pad_token_id] = not_pad_token_id + del inputs["input_ids"] + inputs_embeds = wte(input_ids) + out_ids = model.generate(input_ids=input_ids, **inputs, max_new_tokens=2) + out_embeds = model.generate(input_ids=input_ids, inputs_embeds=inputs_embeds, **inputs, max_new_tokens=2) + + self.assertTrue(torch.allclose(out_embeds, out_ids)) + @require_torch class Idefics3ForConditionalGenerationIntegrationTest(unittest.TestCase): diff --git a/tests/models/kosmos2/test_modeling_kosmos2.py b/tests/models/kosmos2/test_modeling_kosmos2.py index 396a4179388fb8..22cbffcfdb6b13 100644 --- a/tests/models/kosmos2/test_modeling_kosmos2.py +++ b/tests/models/kosmos2/test_modeling_kosmos2.py @@ -428,6 +428,12 @@ def check_same_values(layer_1, layer_2): # self.assertTrue(model.transformer.wte.weight.shape, model.lm_head.weight.shape) # self.assertTrue(check_same_values(model.transformer.wte, model.lm_head)) + @unittest.skip( + "KOSMOS-2 doesn't support inputs embeds. The test isn't skipped by checking ipnut args because KOSMOS-2 has `generate()` overwritten" + ) + def test_inputs_embeds_matches_input_ids_with_generate(self): + pass + @slow def test_model_from_pretrained(self): model_name = "microsoft/kosmos-2-patch14-224" diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index 38c1f5ff1774b7..da33bbb48c5a36 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -3000,8 +3000,11 @@ def test_inputs_embeds_matches_input_ids(self): def test_inputs_embeds_matches_input_ids_with_generate(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - for model_class in self.all_generative_model_classes: - if model_class.__name__ not in get_values(MODEL_FOR_CAUSAL_LM_MAPPING_NAMES): + for model_class in self.all_model_classes: + if model_class.__name__ not in [ + *get_values(MODEL_FOR_CAUSAL_LM_MAPPING_NAMES), + *get_values(MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES), + ]: continue model = model_class(config) model.to(torch_device) @@ -3018,6 +3021,13 @@ def test_inputs_embeds_matches_input_ids_with_generate(self): inputs = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class)) pad_token_id = config.pad_token_id if config.pad_token_id is not None else 1 + # VLMs can't generate with embeds and pixels at the same time. 
We expect the user to pass merged + # embeds already + if model_class.__name__ in get_values(MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES): + inputs.pop("pixel_values", None) + inputs.pop("pixel_values_videos", None) + inputs.pop("pixel_values_images", None) + wte = model.get_input_embeddings() if not self.is_encoder_decoder: input_ids = inputs["input_ids"] From 9ba021ea758429e5bed27a0a405dabf565235802 Mon Sep 17 00:00:00 2001 From: Yoach Lacombe <52246514+ylacombe@users.noreply.github.com> Date: Wed, 16 Oct 2024 11:21:49 +0200 Subject: [PATCH 029/385] Moshi integration (#33624) * clean mimi commit * some nits suggestions from Arthur * make fixup * first moshi WIP * converting weights working + configuration + generation configuration * finalize converting script - still missing tokenizer and FE and processor * fix saving model w/o default config * working generation * use GenerationMixin instead of inheriting * add delay pattern mask * fix right order: moshi codes then user codes * unconditional inputs + generation config * get rid of MoshiGenerationConfig * blank user inputs * update convert script:fix conversion, add tokenizer, feature extractor and bf16 * add and correct Auto classes * update modeling code, configuration and tests * make fixup * fix some copies * WIP: add integration tests * add dummy objects * propose better readiblity and code organisation * update tokenization tests * update docstrigns, eval and modeling * add .md * make fixup * add MoshiForConditionalGeneration to ignore Auto * revert mimi changes * re * further fix * Update moshi.md * correct md formating * move prepare causal mask to class * fix copies * fix depth decoder causal * fix and correct some tests * make style and update .md * correct config checkpoitn * Update tests/models/moshi/test_tokenization_moshi.py Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> * Update tests/models/moshi/test_tokenization_moshi.py Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> * make style * Update src/transformers/models/moshi/__init__.py Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> * fixup * change firm in copyrights * udpate config with nested dict * replace einsum * make style * change split to True * add back splt=False * remove tests in convert * Update tests/models/moshi/test_modeling_moshi.py Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> * add default config repo + add model to FA2 docstrings * remove logits float * fix some tokenization tests and ignore some others * make style tokenization tests * update modeling with sliding window + update modeling tests * [run-slow] moshi * remove prepare for generation frol CausalLM * isort * remove copied from * ignore offload tests * update causal mask and prepare 4D mask aligned with recent changes * further test refine + add back prepare_inputs_for_generation for depth decoder * correct conditional use of prepare mask * update slow integration tests * fix multi-device forward * remove previous solution to device_map * save_load is flaky * fix generate multi-devices * fix device * move tensor to int --------- Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> Co-authored-by: Marc Sun --- docs/source/en/_toctree.yml | 2 + docs/source/en/index.md | 1 + docs/source/en/model_doc/mimi.md | 2 +- docs/source/en/model_doc/moshi.md | 183 ++ docs/source/en/perf_infer_gpu_one.md | 2 + src/transformers/__init__.py | 22 + src/transformers/convert_slow_tokenizer.py | 41 + 
src/transformers/generation/utils.py | 4 +- src/transformers/models/__init__.py | 1 + .../models/auto/configuration_auto.py | 2 + .../models/auto/feature_extraction_auto.py | 1 + src/transformers/models/auto/modeling_auto.py | 2 + .../models/auto/tokenization_auto.py | 1 + src/transformers/models/moshi/__init__.py | 27 + .../models/moshi/configuration_moshi.py | 333 ++ .../moshi/convert_moshi_transformers.py | 311 ++ .../models/moshi/modeling_moshi.py | 2813 +++++++++++++++++ src/transformers/utils/dummy_pt_objects.py | 28 + tests/generation/test_utils.py | 3 + tests/models/mimi/test_modeling_mimi.py | 2 + tests/models/moshi/__init__.py | 0 tests/models/moshi/test_modeling_moshi.py | 1126 +++++++ tests/models/moshi/test_tokenization_moshi.py | 447 +++ utils/check_repo.py | 1 + 24 files changed, 5353 insertions(+), 2 deletions(-) create mode 100644 docs/source/en/model_doc/moshi.md create mode 100644 src/transformers/models/moshi/__init__.py create mode 100644 src/transformers/models/moshi/configuration_moshi.py create mode 100644 src/transformers/models/moshi/convert_moshi_transformers.py create mode 100644 src/transformers/models/moshi/modeling_moshi.py create mode 100644 tests/models/moshi/__init__.py create mode 100644 tests/models/moshi/test_modeling_moshi.py create mode 100644 tests/models/moshi/test_tokenization_moshi.py diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 02595f30db2893..016d7279353d95 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -740,6 +740,8 @@ title: Mimi - local: model_doc/mms title: MMS + - local: model_doc/moshi + title: Moshi - local: model_doc/musicgen title: MusicGen - local: model_doc/musicgen_melody diff --git a/docs/source/en/index.md b/docs/source/en/index.md index 32a730e6bcfca8..bdea11a2456fef 100644 --- a/docs/source/en/index.md +++ b/docs/source/en/index.md @@ -223,6 +223,7 @@ Flax), PyTorch, and/or TensorFlow. | [MobileNetV2](model_doc/mobilenet_v2) | ✅ | ❌ | ❌ | | [MobileViT](model_doc/mobilevit) | ✅ | ✅ | ❌ | | [MobileViTV2](model_doc/mobilevitv2) | ✅ | ❌ | ❌ | +| [Moshi](model_doc/moshi) | ✅ | ❌ | ❌ | | [MPNet](model_doc/mpnet) | ✅ | ✅ | ❌ | | [MPT](model_doc/mpt) | ✅ | ❌ | ❌ | | [MRA](model_doc/mra) | ✅ | ❌ | ❌ | diff --git a/docs/source/en/model_doc/mimi.md b/docs/source/en/model_doc/mimi.md index 486d1836334949..ad15a002da9166 100644 --- a/docs/source/en/model_doc/mimi.md +++ b/docs/source/en/model_doc/mimi.md @@ -66,4 +66,4 @@ The original code can be found [here](https://github.com/kyutai-labs/moshi). [[autodoc]] MimiModel - decode - encode - - forward + - forward \ No newline at end of file diff --git a/docs/source/en/model_doc/moshi.md b/docs/source/en/model_doc/moshi.md new file mode 100644 index 00000000000000..64216f570e3ed0 --- /dev/null +++ b/docs/source/en/model_doc/moshi.md @@ -0,0 +1,183 @@ + + +# Moshi + +## Overview + +The Moshi model was proposed in [Moshi: a speech-text foundation model for real-time dialogue](https://kyutai.org/Moshi.pdf) by Alexandre Défossez, Laurent Mazaré, Manu Orsini, Amélie Royer, Patrick Pérez, Hervé Jégou, Edouard Grave and Neil Zeghidour. + +Moshi is a speech-text foundation model that casts spoken dialogue as speech-to-speech generation. Starting from a text language model backbone, Moshi generates speech as tokens from the residual quantizer of a neural audio codec, while modeling separately its own speech and that of the user into parallel streams. 
This allows for the removal of explicit speaker turns, and the modeling of arbitrary conversational dynamics. Moshi also predicts time-aligned text tokens as a prefix to audio tokens. This “Inner Monologue” method significantly improves the linguistic quality of generated speech and provides streaming speech recognition and text-to-speech. As a result, Moshi is the first real-time full-duplex spoken large language model, with a theoretical latency of 160ms, 200ms in practice. + +
+ +
+ +The abstract from the paper is the following: + +*We introduce Moshi, a speech-text foundation model and full-duplex spoken dialogue framework. Current systems for spoken dialogue rely on pipelines of independent components, namely voice activity detection, speech recognition, textual dialogue and text-to-speech. Such frameworks cannot emulate the experience of real conversations. First, their complexity induces a latency of several seconds between interactions. Second, text being the intermediate modality for dialogue, non-linguistic information that modifies meaning— such as emotion or non-speech sounds— is lost in the interaction. Finally, they rely on a segmentation into speaker turns, which does not take into account overlapping speech, interruptions and interjections. Moshi solves these independent issues altogether by casting spoken dialogue as speech-to-speech generation. Starting from a text language model backbone, Moshi generates speech as tokens from the residual quantizer of a neural audio codec, while modeling separately its own speech and that of the user into parallel streams. This allows for the removal of explicit speaker turns, and the modeling of arbitrary conversational dynamics. We moreover extend the hierarchical semantic-to-acoustic token generation of previous work to first predict time-aligned text tokens as a prefix to audio tokens. Not only this “Inner Monologue” method significantly improves the linguistic quality of generated speech, but we also illustrate how it can provide streaming speech recognition and text-to-speech. Our resulting model is the first real-time full-duplex spoken large language model, with a theoretical latency of 160ms, 200ms in practice, and is available at github.com/kyutai-labs/moshi.* + +Moshi deals with 3 streams of information: +1. The user's audio +2. Moshi's audio +3. Moshi's textual output + +Similarly to [`~MusicgenModel`], audio is represented with audio codebooks, which can be interpreted as tokens. The main difference between text tokens and audio codebooks is that audio codebooks introduce an additional dimension of information. +Text tokens are typically of dim `(batch_size, sequence_length)` but audio tokens are of dim `(batch_size, num_codebooks, sequence_length)`. + +Moshi is made of 3 components: + +**1. The main decoder (Helium in the paper)** + +It corresponds to [`MoshiForCausalLM`]. It is strictly a classic text LLM that uses an architecture similar to [`~GemmaForCausalLM`]. In other words, it takes text tokens, embeds them, and passes them through the decoder and a language head to get text logits. + +**2. The depth decoder** + +On its own, it's also a classic LLM, but this time, instead of generating over the time dimension, it generates over the codebook dimension. + +It also means that its context length is `num_codebooks`, so it can't generate more than `num_codebooks` tokens. + +Note that each timestamp - i.e. each codebook - gets its own set of Linear Layers and Embeddings. + +**3. [`MimiModel`]** + +It's the audio encoder from Kyutai, which has recently been integrated into transformers and is used to "tokenize" audio. It plays the same role that [`~EncodecModel`] plays in [`~MusicgenModel`]. + + +## Tips: + +The original checkpoints can be converted using the conversion script `src/transformers/models/moshi/convert_moshi_transformers.py`. + + +### How to use the model: + +This implementation has two main aims: +1. quickly test model generation by simplifying the original API +2. simplify training.
A training guide will come soon, but user contributions are welcomed! + + + +It is designed for intermediate use. We strongly recommend using the original [implementation](https://github.com/kyutai-labs/moshi) to infer the model in real-time streaming. + + + +**1. Model generation** + +Moshi is a streaming auto-regressive model with two streams of audio. To put it differently, one audio stream corresponds to what the model said/will say and the other audio stream corresponds to what the user said/will say. + +[`MoshiForConditionalGeneration.generate`] thus needs 3 inputs: +1. `input_ids` - corresponding to the text token history +2. `moshi_input_values` or `moshi_audio_codes`- corresponding to the model audio history +3. `user_input_values` or `user_audio_codes` - corresponding to the user audio history + +These three inputs must be synchronized. Meaning that their lengths must correspond to the same number of tokens. + +You can dynamically use the 3 inputs depending on what you want to test: +1. Simply check the model response to an user prompt - in that case, `input_ids` can be filled with pad tokens and `user_input_values` can be a zero tensor of the same shape than the user prompt. +2. Test more complex behaviour - in that case, you must be careful about how the input tokens are synchronized with the audios. + + + +The original model is synchronized text with audio by padding the text in between each token enunciation. + +To follow the example of the following image, `"Hello, I'm Moshi"` could be transformed to `"Hello,I'm Moshi"`. + + + +
+ +
+ + +[`MoshiForConditionalGeneration.generate`] then auto-regressively feeds to itself its own audio stream, but since it doesn't have access to the user input stream while using `transformers`, it will thus **assume that the user is producing blank audio**. + + + +```python +>>> from datasets import load_dataset, Audio +>>> import torch, math +>>> from transformers import MoshiForConditionalGeneration, AutoFeatureExtractor, AutoTokenizer +>>> librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + + +>>> # prepare user input audio +>>> librispeech_dummy = librispeech_dummy.cast_column("audio", Audio(sampling_rate=feature_extractor.sampling_rate)) +>>> audio_sample = librispeech_dummy[-1]["audio"]["array"] +>>> user_input_values = feature_extractor(raw_audio=audio_sample, sampling_rate=feature_extractor.sampling_rate, return_tensors="pt").to(device=device, dtype=dtype) + +>>> # prepare moshi input values - we suppose moshi didn't say anything while the user spoke +>>> moshi_input_values = torch.zeros_like(user_input_values.input_values) + +>>> # prepare moshi input ids - we suppose moshi didn't say anything while the user spoke +>>> num_tokens = math.ceil(moshi_input_values.shape[-1] * waveform_to_token_ratio) +>>> input_ids = torch.ones((1, num_tokens), device=device, dtype=torch.int64) * tokenizer.encode("")[0] + +>>> # generate 25 new tokens (around 2s of audio) +>>> output = model.generate(input_ids=input_ids, user_input_values=user_input_values.input_values, moshi_input_values=moshi_input_values, max_new_tokens=25) + +>>> text_tokens = output.sequences +>>> audio_waveforms = output.audio_sequences +``` + +**2. Model training** + +Most of the work has to be done during data creation/pre-processing, because of the need to align/synchronize streams. + +Once it's done, you can simply forward `text_labels` and `audio_labels` to [`MoshiForConditionalGeneration.forward`], alongside the usual inputs, to get the model loss. + +A training guide will come soon, but user contributions are welcomed! + +### How does the model forward the inputs / generate: + +1. The input streams are embedded and combined into `inputs_embeds`. + +2. `inputs_embeds` is passed through the main decoder, which processes it like a normal LLM would. + +3. The main decoder outputs `text logits` but also its `last hidden state` which is called `temporal context` in the paper. + +3. The depth decoder switches the dimension on which we forward / generate (codebooks instead of time). It uses the token generated from `text logits` and the `temporal context` to auto-regressively generate audio codebooks. + + +This model was contributed by [Yoach Lacombe (ylacombe)](https://huggingface.co/ylacombe). + +The original code can be found [here](https://github.com/kyutai-labs/moshi). 
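The generation example above assumes that the model, the feature extractor, the tokenizer and a few helper variables (`device`, `dtype`, `waveform_to_token_ratio`) are already defined. A minimal setup sketch, assuming the `kmhf/hf-moshiko` checkpoint name used in the configuration docstrings and that the audio encoder config exposes a `frame_rate` attribute, could look like this:

```python
import torch
from transformers import MoshiForConditionalGeneration, AutoFeatureExtractor, AutoTokenizer

# assumed checkpoint name, taken from the configuration docstrings; adjust to the checkpoint you actually use
checkpoint = "kmhf/hf-moshiko"

device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.bfloat16 if device == "cuda" else torch.float32

model = MoshiForConditionalGeneration.from_pretrained(checkpoint, torch_dtype=dtype).to(device)
feature_extractor = AutoFeatureExtractor.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# rough number of audio tokens per waveform sample, assuming the audio encoder config exposes `frame_rate`
waveform_to_token_ratio = model.config.audio_encoder_config.frame_rate / feature_extractor.sampling_rate
```

With these definitions in place, the snippet from the generation section should run end to end, although the exact checkpoint name and dtype choice may need to be adapted.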
+ + + +## MoshiConfig + +[[autodoc]] MoshiConfig + +## MoshiDepthConfig + +[[autodoc]] MoshiDepthConfig + +## MoshiModel + +[[autodoc]] MoshiModel + - forward + +## MoshiForCausalLM + +[[autodoc]] MoshiForCausalLM + - forward + +## MoshiForConditionalGeneration + +[[autodoc]] MoshiForConditionalGeneration + - forward + - generate + - get_unconditional_inputs diff --git a/docs/source/en/perf_infer_gpu_one.md b/docs/source/en/perf_infer_gpu_one.md index 82d7f50f77d902..2f0e9deb841d4d 100644 --- a/docs/source/en/perf_infer_gpu_one.md +++ b/docs/source/en/perf_infer_gpu_one.md @@ -70,6 +70,7 @@ FlashAttention-2 is currently supported for the following architectures: * [MBart](https://huggingface.co/docs/transformers/model_doc/mbart#transformers.MBartModel) * [Mistral](https://huggingface.co/docs/transformers/model_doc/mistral#transformers.MistralModel) * [Mixtral](https://huggingface.co/docs/transformers/model_doc/mixtral#transformers.MixtralModel) +* [Moshi](https://huggingface.co/docs/transformers/model_doc/moshi#transformers.MoshiModel) * [Musicgen](https://huggingface.co/docs/transformers/model_doc/musicgen#transformers.MusicgenModel) * [MusicGen Melody](https://huggingface.co/docs/transformers/model_doc/musicgen_melody#transformers.MusicgenMelodyModel) * [Nemotron](https://huggingface.co/docs/transformers/model_doc/nemotron) @@ -241,6 +242,7 @@ For now, Transformers supports SDPA inference and training for the following arc * [Mistral](https://huggingface.co/docs/transformers/model_doc/mistral#transformers.MistralModel) * [Mllama](https://huggingface.co/docs/transformers/model_doc/mllama#transformers.MllamaForConditionalGeneration) * [Mixtral](https://huggingface.co/docs/transformers/model_doc/mixtral#transformers.MixtralModel) +* [Moshi](https://huggingface.co/docs/transformers/model_doc/moshi#transformers.MoshiModel) * [Musicgen](https://huggingface.co/docs/transformers/model_doc/musicgen#transformers.MusicgenModel) * [MusicGen Melody](https://huggingface.co/docs/transformers/model_doc/musicgen_melody#transformers.MusicgenMelodyModel) * [NLLB](https://huggingface.co/docs/transformers/model_doc/nllb) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index daffe11987ef5d..236333fb1cbd37 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -590,6 +590,10 @@ "models.mobilenet_v2": ["MobileNetV2Config"], "models.mobilevit": ["MobileViTConfig"], "models.mobilevitv2": ["MobileViTV2Config"], + "models.moshi": [ + "MoshiConfig", + "MoshiDepthConfig", + ], "models.mpnet": [ "MPNetConfig", "MPNetTokenizer", @@ -2783,6 +2787,14 @@ "MobileViTV2PreTrainedModel", ] ) + _import_structure["models.moshi"].extend( + [ + "MoshiForCausalLM", + "MoshiForConditionalGeneration", + "MoshiModel", + "MoshiPreTrainedModel", + ] + ) _import_structure["models.mpnet"].extend( [ "MPNetForMaskedLM", @@ -5448,6 +5460,10 @@ from .models.mobilevitv2 import ( MobileViTV2Config, ) + from .models.moshi import ( + MoshiConfig, + MoshiDepthConfig, + ) from .models.mpnet import ( MPNetConfig, MPNetTokenizer, @@ -7386,6 +7402,12 @@ MobileViTV2Model, MobileViTV2PreTrainedModel, ) + from .models.moshi import ( + MoshiForCausalLM, + MoshiForConditionalGeneration, + MoshiModel, + MoshiPreTrainedModel, + ) from .models.mpnet import ( MPNetForMaskedLM, MPNetForMultipleChoice, diff --git a/src/transformers/convert_slow_tokenizer.py b/src/transformers/convert_slow_tokenizer.py index 92371415918150..f37f589d5d53e0 100644 --- a/src/transformers/convert_slow_tokenizer.py +++ 
b/src/transformers/convert_slow_tokenizer.py @@ -1405,6 +1405,47 @@ def converted(self) -> Tokenizer: return tokenizer +class MoshiConverter(SpmConverter): + handle_byte_fallback = True + + def __init__(self, vocab_file, model_max_length=None, **kwargs): + requires_backends(self, "protobuf") + + Converter.__init__(self, vocab_file) + + # from .utils import sentencepiece_model_pb2 as model_pb2 + model_pb2 = import_protobuf() + + m = model_pb2.ModelProto() + with open(vocab_file, "rb") as f: + m.ParseFromString(f.read()) + self.proto = m + + def normalizer(self, proto): + precompiled_charsmap = proto.normalizer_spec.precompiled_charsmap + _normalizers = [ + normalizers.Replace(" ", "▁"), + ] + if not precompiled_charsmap: + return normalizers.Sequence(_normalizers) + else: + return normalizers.Sequence([normalizers.Precompiled(precompiled_charsmap)] + _normalizers) + + def decoder(self, replacement, add_prefix_space): + sequence = [ + decoders.Replace("▁", " "), + decoders.ByteFallback(), + decoders.Fuse(), + ] + if add_prefix_space: + sequence += [decoders.Strip(content=" ", left=1)] + return decoders.Sequence(sequence) + + def pre_tokenizer(self, replacement, add_prefix_space): + prepend_scheme = "first" + return pre_tokenizers.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme, split=False) + + # Copied from transformers.models.gpt2.tokenization_gpt2.bytes_to_unicode def bytes_to_unicode(): """ diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py index f9ab6fce6cf2eb..6d71b754d6f4e5 100644 --- a/src/transformers/generation/utils.py +++ b/src/transformers/generation/utils.py @@ -1594,8 +1594,10 @@ def _get_cache( cache_dtype = self.get_output_embeddings().weight.dtype def get_layer_device_map(execution_device_map: Optional[dict] = None): - if execution_device_map is None or len(execution_device_map) <= 1: + if execution_device_map is None: return None + elif len(execution_device_map) == 1 and "" in execution_device_map: + return {idx: execution_device_map[""] for idx in range(self.config.num_hidden_layers)} layer_device_map = {} for layer in execution_device_map: for idx in range(self.config.num_hidden_layers): diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index 804957c0a551ae..069c7f90564fce 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -161,6 +161,7 @@ mobilenet_v2, mobilevit, mobilevitv2, + moshi, mpnet, mpt, mra, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 17219570684d53..05d6e717be23d2 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -179,6 +179,7 @@ ("mobilenet_v2", "MobileNetV2Config"), ("mobilevit", "MobileViTConfig"), ("mobilevitv2", "MobileViTV2Config"), + ("moshi", "MoshiConfig"), ("mpnet", "MPNetConfig"), ("mpt", "MptConfig"), ("mra", "MraConfig"), @@ -490,6 +491,7 @@ ("mobilenet_v2", "MobileNetV2"), ("mobilevit", "MobileViT"), ("mobilevitv2", "MobileViTV2"), + ("moshi", "Moshi"), ("mpnet", "MPNet"), ("mpt", "MPT"), ("mra", "MRA"), diff --git a/src/transformers/models/auto/feature_extraction_auto.py b/src/transformers/models/auto/feature_extraction_auto.py index 98d679ef09c75b..0ddab5681f2e46 100644 --- a/src/transformers/models/auto/feature_extraction_auto.py +++ b/src/transformers/models/auto/feature_extraction_auto.py @@ -73,6 +73,7 @@ ("mobilenet_v1", "MobileNetV1FeatureExtractor"), 
("mobilenet_v2", "MobileNetV2FeatureExtractor"), ("mobilevit", "MobileViTFeatureExtractor"), + ("moshi", "EncodecFeatureExtractor"), ("nat", "ViTFeatureExtractor"), ("owlvit", "OwlViTFeatureExtractor"), ("perceiver", "PerceiverFeatureExtractor"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index dbfcccaa4684dc..5a98e761adc13b 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -169,6 +169,7 @@ ("mobilenet_v2", "MobileNetV2Model"), ("mobilevit", "MobileViTModel"), ("mobilevitv2", "MobileViTV2Model"), + ("moshi", "MoshiModel"), ("mpnet", "MPNetModel"), ("mpt", "MptModel"), ("mra", "MraModel"), @@ -506,6 +507,7 @@ ("mistral", "MistralForCausalLM"), ("mixtral", "MixtralForCausalLM"), ("mllama", "MllamaForCausalLM"), + ("moshi", "MoshiForCausalLM"), ("mpt", "MptForCausalLM"), ("musicgen", "MusicgenForCausalLM"), ("musicgen_melody", "MusicgenMelodyForCausalLM"), diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index 8c3a7a82a60a51..3a3428e0995147 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -309,6 +309,7 @@ ("mllama", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), ("mluke", ("MLukeTokenizer" if is_sentencepiece_available() else None, None)), ("mobilebert", ("MobileBertTokenizer", "MobileBertTokenizerFast" if is_tokenizers_available() else None)), + ("moshi", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)), ("mpnet", ("MPNetTokenizer", "MPNetTokenizerFast" if is_tokenizers_available() else None)), ("mpt", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)), ("mra", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)), diff --git a/src/transformers/models/moshi/__init__.py b/src/transformers/models/moshi/__init__.py new file mode 100644 index 00000000000000..69da6e940ea643 --- /dev/null +++ b/src/transformers/models/moshi/__init__.py @@ -0,0 +1,27 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_moshi import * + from .modeling_moshi import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/src/transformers/models/moshi/configuration_moshi.py b/src/transformers/models/moshi/configuration_moshi.py new file mode 100644 index 00000000000000..654e4e82a491b7 --- /dev/null +++ b/src/transformers/models/moshi/configuration_moshi.py @@ -0,0 +1,333 @@ +# coding=utf-8 +# Copyright 2024 Meta AI and The HuggingFace Inc. team. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Moshi model configuration""" + +from ...configuration_utils import PretrainedConfig +from ...utils import logging +from ..auto.configuration_auto import AutoConfig + + +logger = logging.get_logger(__name__) + + +class MoshiDepthConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`MoshiDepthDecoder`]. It is used to instantiate a + Moshi depth decoder model according to the specified arguments, defining the Moshi depth decoder config. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + vocab_size (`int`, *optional*, defaults to 32000): + Vocabulary size of the MoshiDepthDecoder model. Defines the number of different tokens that can be + represented by the `inputs_ids` passed when calling [`MoshiDepthDecoder`]. + hidden_size (`int`, *optional*, defaults to 1024): + Dimensionality of the layers and the pooler layer of the depth decoder. + input_size (`int`, *optional*, defaults to 4096): + Dimensionality of the input hidden states. Used to connect the main decoder to the depth decoder. + num_hidden_layers (`int`, *optional*, defaults to 6): + Number of depth decoder layers. + num_attention_heads (`int`, *optional*, defaults to 16): + Number of attention heads for each attention layer in the depth decoder block. + num_key_value_heads (`int`, *optional*): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details checkout [this + paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `num_attention_heads`. + audio_vocab_size (`int`, *optional*, defaults to 2048): + Vocabulary size of the audio part of model. Defines the number of different tokens that can be + represented by the `audio_codes` passed when calling the Moshi models. + max_position_embeddings (`int`, *optional*, defaults to 9): + The maximum sequence length that this model might ever be used with. Typically, set this to something large + just in case (e.g., 512 or 1024 or 2048). + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the depth decoder. + head_dim (`int`, *optional*, defaults to `hidden_size // num_attention_heads`): + The attention head dimension. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. 
+ use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + sliding_window (`int`, *optional*, defaults to 8): + Sliding window attention window size. If not specified, will default to `8`. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + ffn_dim (`int`, *optional*, defaults to 5632): + Dimensionality of the "intermediate" (often named feed-forward) layer in the depth decoder block. Must be even. + rms_norm_eps (`float`, *optional*, defaults to 1e-08): + The epsilon used by the rms normalization layers. + num_codebooks (`int`, *optional*, defaults to 8): + The number of audio codebooks for each audio channels. + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether to tie weight embeddings + kwargs (*optional*): + Dictionary of keyword arguments. Notably: + - **audio_encoder_config** ([`PretrainedConfig`], *optional*) -- An instance of a configuration object that + defines the audio encoder config. + + Example: + + ```python + >>> from transformers import ( + ... MoshiDepthConfig, + ... MoshiDepthDecoder, + ... ) + + >>> configuration = MoshiDepthConfig() + + >>> # Initializing a MoshiDepthDecoder (with random weights) from the kmhf/hf-moshiko style configuration + >>> model = MoshiDepthDecoder(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "moshi_depth" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=32000, + hidden_size=1024, + input_size=4096, + num_hidden_layers=6, + num_attention_heads=16, + num_key_value_heads=None, + audio_vocab_size=2048, + max_position_embeddings=9, + hidden_act="silu", + head_dim=None, + initializer_range=0.02, + use_cache=True, + sliding_window=8, + attention_dropout=0.0, + ffn_dim=5632, + rms_norm_eps=1e-8, + num_codebooks=8, + tie_word_embeddings=False, + **kwargs, + ): + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.input_size = input_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.num_key_value_heads = num_key_value_heads if num_key_value_heads is not None else num_attention_heads + self.max_position_embeddings = max_position_embeddings + self.hidden_act = hidden_act + self.head_dim = head_dim or hidden_size // num_attention_heads + self.initializer_range = initializer_range + self.use_cache = use_cache + self.sliding_window = sliding_window + self.attention_dropout = attention_dropout + if ffn_dim % 2 == 1: + raise ValueError(f"`ffn_dim={ffn_dim}` must be even.") + self.ffn_dim = ffn_dim + self.rms_norm_eps = rms_norm_eps + self.num_codebooks = num_codebooks + self.audio_vocab_size = audio_vocab_size + + super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) + + +class MoshiConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`MoshiModel`]. It is used to instantiate a + Moshi model according to the specified arguments, defining the audio encoder, Moshi depth decoder and Moshi decoder + configs. Instantiating a configuration with the defaults will yield a similar configuration to that of the Moshiko model, + e.g. [kmhf/hf-moshiko](https://huggingface.co/kmhf/hf-moshiko) + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. 
Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + vocab_size (`int`, *optional*, defaults to 32000): + Vocabulary size of the MoshiDecoder model. Defines the number of different tokens that can be + represented by the `inputs_ids` passed when calling [`MoshiDecoder`]. + hidden_size (`int`, *optional*, defaults to 4096): + Dimensionality of the layers and the pooler layer of the main decoder. + num_hidden_layers (`int`, *optional*, defaults to 32): + Number of decoder layers. + num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the main decoder block. + num_key_value_heads (`int`, *optional*): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details checkout [this + paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `num_attention_heads`. + audio_vocab_size (`int`, *optional*): + Vocabulary size of the audio part of model. Defines the number of different tokens that can be + represented by the `audio_codes` passed when calling the Moshi models. + max_position_embeddings (`int`, *optional*, defaults to 3000): + The maximum sequence length that this model might ever be used with. Typically, set this to something large + just in case (e.g., 512 or 1024 or 2048). + rope_theta (`float`, *optional*, defaults to 10000.0): + The base period of the RoPE embeddings. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. + head_dim (`int`, *optional*, defaults to `hidden_size // num_attention_heads`): + The attention head dimension. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + sliding_window (`int`, *optional*, defaults to 3000): + Sliding window attention window size. If not specified, will default to `3000`. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + ffn_dim (`int`, *optional*, defaults to 22528): + Dimensionality of the "intermediate" (often named feed-forward) layer in the main decoder block. Must be even. + rms_norm_eps (`float`, *optional*, defaults to 1e-08): + The epsilon used by the rms normalization layers. + num_codebooks (`int`, *optional*, defaults to 8): + The number of audio codebooks for each audio channels. + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether to tie weight embeddings + kwargs (*optional*): + Dictionary of keyword arguments. Notably: + - **audio_encoder_config** ([`PretrainedConfig`], *optional*) -- An instance of a configuration object that + defines the audio encoder config. + - **depth__config** ([`PretrainedConfig`], *optional*) -- An instance of a configuration object that + defines the depth decoder config. 
+ + + Example: + + ```python + >>> from transformers import ( + ... MoshiConfig, + ... MoshiForConditionalGeneration, + ... ) + + >>> configuration = MoshiConfig() + + >>> # Initializing a MoshiForConditionalGeneration (with random weights) from the kmhf/hf-moshiko style configuration + >>> model = MoshiForConditionalGeneration(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + + >>> # Saving the model, including its configuration + >>> model.save_pretrained("kmhf/hf-moshiko") + + >>> # loading model and config from pretrained folder + >>> moshi_config = MoshiConfig.from_pretrained("kmhf/hf-moshiko") + >>> model = MoshiForConditionalGeneration.from_pretrained("kmhf/hf-moshiko", config=moshi_config) + ```""" + + model_type = "moshi" + is_composition = True + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=32000, + hidden_size=4096, + num_hidden_layers=32, + num_attention_heads=32, + num_key_value_heads=None, + audio_vocab_size=None, + max_position_embeddings=3000, + rope_theta=10000.0, + hidden_act="silu", + head_dim=None, + initializer_range=0.02, + use_cache=True, + sliding_window=3000, + attention_dropout=0.0, + ffn_dim=22528, + rms_norm_eps=1e-8, + num_codebooks=8, + tie_word_embeddings=False, + **kwargs, + ): + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.num_key_value_heads = num_key_value_heads if num_key_value_heads is not None else num_attention_heads + self.max_position_embeddings = max_position_embeddings + self.rope_theta = rope_theta + self.hidden_act = hidden_act + self.head_dim = head_dim or hidden_size // num_attention_heads + self.initializer_range = initializer_range + self.use_cache = use_cache + self.sliding_window = sliding_window + self.attention_dropout = attention_dropout + if ffn_dim % 2 == 1: + raise ValueError(f"`ffn_dim={ffn_dim}` must be even.") + self.ffn_dim = ffn_dim + self.rms_norm_eps = rms_norm_eps + self.num_codebooks = num_codebooks + + audio_encoder_config = kwargs.pop("audio_encoder_config", {}) + audio_encoder_model_type = audio_encoder_config.pop("model_type", "mimi") + + self.audio_encoder_config = AutoConfig.for_model(audio_encoder_model_type, **audio_encoder_config) + + if self.num_codebooks > self.audio_encoder_config.num_codebooks: + raise ValueError( + f"`num_codebooks={num_codebooks}` is greater than the maximum number of codebooks that the audio encoder can deal with ({self.audio_encoder_config.num_codebooks}). Please lower it." + ) + + self.audio_vocab_size = ( + self.audio_encoder_config.codebook_size if audio_vocab_size is None else audio_vocab_size + ) + + depth_decoder_config = kwargs.pop("depth_decoder_config", {}) + depth_decoder_config.update( + { + "audio_vocab_size": self.audio_vocab_size, + "input_size": hidden_size, + "vocab_size": vocab_size, + "num_codebooks": num_codebooks, + } + ) + + self.depth_decoder_config = MoshiDepthConfig(**depth_decoder_config) + + super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) + + @property + def sampling_rate(self): + return self.audio_encoder_config.sampling_rate + + @classmethod + def from_audio_encoder_config( + cls, + audio_encoder_config: PretrainedConfig, + **kwargs, + ): + r""" + Instantiate a [`MoshiConfig`] (or a derived class) from an audio encoder configuration. 
+ + Returns: + [`MoshiConfig`]: An instance of a configuration object + """ + + return cls( + audio_encoder_config=audio_encoder_config.to_dict(), + **kwargs, + ) + + +__all__ = ["MoshiConfig", "MoshiDepthConfig"] diff --git a/src/transformers/models/moshi/convert_moshi_transformers.py b/src/transformers/models/moshi/convert_moshi_transformers.py new file mode 100644 index 00000000000000..1caaee25ef6fa0 --- /dev/null +++ b/src/transformers/models/moshi/convert_moshi_transformers.py @@ -0,0 +1,311 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Convert Moshi checkpoints.""" + +import argparse + +import safetensors +import sentencepiece +import torch + +from transformers import ( + AutoFeatureExtractor, + GenerationConfig, + MimiModel, # initial audio encoder + MoshiConfig, + MoshiForConditionalGeneration, + PreTrainedTokenizerFast, + logging, +) +from transformers.convert_slow_tokenizer import MoshiConverter + + +logging.set_verbosity_info() +logger = logging.get_logger("transformers.models.mimi") + + +def assert_param_count(model_1, model_2): + count_1 = sum(p[1].numel() for p in model_1.named_parameters() if "final_proj" not in p[0]) + count_2 = sum(p[1].numel() for p in model_2.named_parameters() if "final_proj" not in p[0]) + assert count_1 == count_2, f"{model_1.__class__}: {count_1} != {model_2.__class__}: {count_2}" + + +def param_count(model): + return sum(p[1].numel() for p in model.named_parameters() if "final_proj" not in p[0]) + + +def _grab_best_device(use_gpu=True): + if torch.cuda.device_count() > 0 and use_gpu: + device = "cuda" + else: + device = "cpu" + return torch.device(device) + + +convert_list = [ + # GENERAL + ("out_norm", "decoder.model.norm"), + ("depformer_emb", "depth_decoder.emb"), + ("depformer_text_emb", "depth_decoder.text_emb"), + ("text_emb", "decoder.model.emb"), + ("emb", "embed_tokens"), + ("text_linear", "decoder.lm_head"), + ("depformer", "depth_decoder"), + ("transformer", "decoder.model"), + # TRANSFORMERS PART + ("gating.linear_in", "mlp.fc1"), + ("gating.linear_out", "mlp.fc2"), + ("self_attn.out_proj", "self_attn.o_proj.linear"), + ("norm1", "input_layernorm"), + ("norm2", "post_attention_layernorm"), + ("layer_scale_1", "self_attn_layer_scale"), + ("layer_scale_2", "mlp_layer_scale"), + ("alpha", "weight"), +] + + +def _preprocess_state_dict(state_dict, config): + # Moshi original weights are using a gating mechanism + + # pattern for depth transformer: + # stack(gating.{i}.linear_in)->mlp.fc1 + # stack(gating.{i}.linear_out)->mlp.fc2 + + for layer_idx in range(config.depth_decoder_config.num_hidden_layers): + linear_layers_in = [ + state_dict.pop(f"depformer.layers.{layer_idx}.gating.{i}.linear_in.weight") + for i in range(config.num_codebooks) + ] + linear_layers_out = [ + state_dict.pop(f"depformer.layers.{layer_idx}.gating.{i}.linear_out.weight") + for i in range(config.num_codebooks) + ] + + state_dict[f"depth_decoder.layers.{layer_idx}.mlp.fc1.weight"] = 
torch.stack(linear_layers_in) + state_dict[f"depth_decoder.layers.{layer_idx}.mlp.fc2.weight"] = torch.stack(linear_layers_out) + + input_projections = [] + lm_heads = [] + for codebook_idx in range(config.num_codebooks): + input_projections.append(state_dict.pop(f"depformer_in.{codebook_idx}.weight")) + lm_heads.append(state_dict.pop(f"linears.{codebook_idx}.weight")) + + state_dict["depth_decoder.input_projections.weight"] = torch.stack(input_projections, dim=0) + state_dict["depth_decoder.lm_heads.weight"] = torch.stack(lm_heads, dim=0) + + return state_dict + + +def _convert_model( + state_dict, + hf_model, + convert_list, + device, + config, + unwanted_prefix=None, +): + hidden_size = config.hidden_size + head_dim = config.head_dim + num_heads = int(config.hidden_size // config.head_dim) + num_key_value_heads = config.num_key_value_heads + key_value_head_dim = config.num_key_value_heads * head_dim + + state_dict = _preprocess_state_dict(state_dict, config) + + # permute for sliced rotary + def permute(w, n_heads, dim1=hidden_size, dim2=hidden_size): + return w.view(n_heads, dim1 // n_heads // 2, 2, dim2).transpose(1, 2).reshape(dim1, dim2) + + for k, v in list(state_dict.items()): + if "audio_encoder" not in k: + new_k = k if unwanted_prefix is None else k[len(unwanted_prefix) :] + for old_layer_name, new_layer_name in convert_list: + if old_layer_name in new_k: + new_k = new_k.replace(old_layer_name, new_layer_name) + + if "alpha" in k: + state_dict[k] = state_dict[k].squeeze() + + if "in_proj_weight" in new_k: + # split qkv into query key and value + mixed_qkv = state_dict.pop(k) + if "depth_decoder" in new_k: + mixed_qkv = mixed_qkv.view(config.num_codebooks, -1, mixed_qkv.shape[-1]) + + qkv_dim = mixed_qkv.size(1) // 3 + + query_layer = mixed_qkv[:, :qkv_dim] + key_layer = mixed_qkv[:, qkv_dim : qkv_dim * 2] + value_layer = mixed_qkv[:, qkv_dim * 2 :] + state_dict[new_k.replace("in_proj_weight", "q_proj.linear.weight")] = query_layer + state_dict[new_k.replace("in_proj_weight", "k_proj.linear.weight")] = key_layer + + else: + qkv_dim = mixed_qkv.size(0) // 3 + + query_layer = mixed_qkv[:qkv_dim] + key_layer = mixed_qkv[qkv_dim : qkv_dim * 2] + value_layer = mixed_qkv[qkv_dim * 2 :] + state_dict[new_k.replace("in_proj_weight", "q_proj.linear.weight")] = permute( + query_layer, num_heads, hidden_size, hidden_size + ) + state_dict[new_k.replace("in_proj_weight", "k_proj.linear.weight")] = permute( + key_layer, num_key_value_heads, key_value_head_dim, hidden_size + ) + + state_dict[new_k.replace("in_proj_weight", "v_proj.linear.weight")] = value_layer + elif "o_proj" in new_k and "depth_decoder" in new_k: + output_layer = state_dict.pop(k) + state_dict[new_k] = output_layer.view(config.num_codebooks, -1, output_layer.shape[-1]) + else: + state_dict[new_k] = state_dict.pop(k) + + # Do the last one by hand + state_dict["depth_decoder.text_embed_tokens.weight"] = state_dict.pop( + "depth_decoder.decoder.model.embed_tokens.weight" + ) + + extra_keys = set(state_dict.keys()) - set(hf_model.state_dict().keys()) + missing_keys = set(hf_model.state_dict().keys()) - set(state_dict.keys()) + if len(extra_keys) != 0: + raise ValueError(f"extra keys found: {extra_keys}") + if len(missing_keys) != 0: + raise ValueError(f"missing keys: {missing_keys}") + hf_model.load_state_dict(state_dict, strict=True) + n_params = param_count(hf_model) + + logger.info(f"model loaded: {round(n_params/1e6,1)}M params") + + hf_model.eval() + hf_model.to(device) + del state_dict + + return hf_model + + 
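The fused-projection handling in `_convert_model` above is the least obvious part of the conversion: each original `in_proj_weight` is sliced into query/key/value blocks, and the query/key rows are shuffled so that the checkpoint's interleaved rotary layout matches the half-split `rotate_half` convention used by the modeling code. The sketch below reproduces just that split and row shuffle on a tiny tensor; every name and size in it is illustrative.

```python
import torch

# Illustrative sizes only: 2 heads of dimension 4, hidden size 8.
num_heads, head_dim = 2, 4
hidden_size = num_heads * head_dim

# A fused projection stacks W_q, W_k and W_v along dim 0, as in the original checkpoint.
fused_in_proj = torch.randn(3 * hidden_size, hidden_size)

qkv_dim = fused_in_proj.size(0) // 3
w_q = fused_in_proj[:qkv_dim]
w_k = fused_in_proj[qkv_dim : 2 * qkv_dim]
w_v = fused_in_proj[2 * qkv_dim :]


def permute_for_rotary(w, n_heads, dim1, dim2):
    # Same row shuffle as `permute` in `_convert_model`: within each head, rows laid out as
    # interleaved rotary pairs (r0, r1, r2, r3, ...) are reordered to the half-split layout
    # (r0, r2, ..., r1, r3, ...) expected by `rotate_half`.
    return w.view(n_heads, dim1 // n_heads // 2, 2, dim2).transpose(1, 2).reshape(dim1, dim2)


w_q_converted = permute_for_rotary(w_q, num_heads, hidden_size, hidden_size)
w_k_converted = permute_for_rotary(w_k, num_heads, hidden_size, hidden_size)

print(w_q_converted.shape, w_k_converted.shape, w_v.shape)  # three (8, 8) matrices

# The shuffle is a pure row permutation: every column keeps the same multiset of values.
assert torch.equal(w_q_converted.sort(dim=0).values, w_q.sort(dim=0).values)
```

Value weights are left untouched because rotary embeddings only affect queries and keys; for the depth decoder the same split is applied per codebook, without the shuffle, since its layers are built with `use_rope=False`.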
+@torch.no_grad() +def convert_checkpoint( + checkpoint_path, + pytorch_dump_folder_path, + mimi_repo_id, + config_path=None, + repo_id=None, +): + """ + Copy/paste/tweak model's weights to transformers design. + """ + device = _grab_best_device() + + mimi_model = MimiModel.from_pretrained(mimi_repo_id, torch_dtype=torch.bfloat16) + + if config_path is not None: + config = MoshiConfig.from_pretrained(config_path) + else: + audio_encoder_config = mimi_model.config + config = MoshiConfig.from_audio_encoder_config(audio_encoder_config) + + model = MoshiForConditionalGeneration(config).to(torch.bfloat16) + + depth_decoder_generation_config = GenerationConfig( + do_sample=True, + temperature=0.8, + top_k=250, + min_length=config.num_codebooks + 1, + max_length=config.num_codebooks + 1, + cache_implementation="sliding_window", + ) + + generation_config = GenerationConfig( + do_sample=True, + temp=0.7, + top_k=25, + cache_implementation="sliding_window", + pad_token_id=config.vocab_size, + bos_token_id=config.vocab_size, + ) + generation_config.depth_decoder_config = depth_decoder_generation_config.to_diff_dict() + + model.generation_config = generation_config + + original_checkpoint = safetensors.torch.load_file(checkpoint_path) + if "best_state" in original_checkpoint: + # we might have a training state saved, in which case discard the yaml results and just retain the weights + original_checkpoint = original_checkpoint["best_state"] + + audio_checkpoint = mimi_model.state_dict() + original_checkpoint.update({f"audio_encoder.{key}": value for (key, value) in audio_checkpoint.items()}) + + model = _convert_model(original_checkpoint, model, convert_list, device, config) + + model.save_pretrained(pytorch_dump_folder_path) + + if repo_id: + print("Pushing to the hub...") + model.push_to_hub(repo_id) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--checkpoint_path", required=True, default=None, type=str, help="Path to original checkpoint") + parser.add_argument( + "--tokenizer_vocab_path", required=False, default=None, type=str, help="Path to original tokenizer vocab file" + ) + parser.add_argument("--mimi_repo_id", required=True, default=None, type=str, help="Repository id to HF Mimi.") + parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert") + parser.add_argument( + "--pytorch_dump_folder_path", required=True, default=None, type=str, help="Path to the output PyTorch model." + ) + parser.add_argument( + "--push_to_hub", default=None, type=str, help="Where to upload the converted model on the 🤗 hub." 
+ ) + + args = parser.parse_args() + + # convert tokenizer + if args.tokenizer_vocab_path: + original_tokenizer = sentencepiece.SentencePieceProcessor(args.tokenizer_vocab_path) + tokenizer = MoshiConverter(args.tokenizer_vocab_path).converted() + tokenizer = PreTrainedTokenizerFast( + tokenizer_object=tokenizer, + chat_template=None, + unk_token="", + model_input_names=["input_ids", "attention_mask"], + clean_up_tokenization_spaces=False, + bos_token_id=original_tokenizer.bos_id(), + eos_token_id=original_tokenizer.eos_id(), + pad_token_id=original_tokenizer.pad_id(), + ) + + tokenizer.save_pretrained(args.pytorch_dump_folder_path) + + if args.push_to_hub: + print("Pushing the tokenizer to the hub...") + tokenizer.push_to_hub(args.push_to_hub) + + # upload feature extractor + feature_extractor = AutoFeatureExtractor.from_pretrained(args.mimi_repo_id) + feature_extractor.save_pretrained(args.pytorch_dump_folder_path) + + if args.push_to_hub: + print("Pushing the feature extractor to the hub...") + feature_extractor.push_to_hub(args.push_to_hub) + + convert_checkpoint( + args.checkpoint_path, + args.pytorch_dump_folder_path, + args.mimi_repo_id, + args.config_path, + args.push_to_hub, + ) diff --git a/src/transformers/models/moshi/modeling_moshi.py b/src/transformers/models/moshi/modeling_moshi.py new file mode 100644 index 00000000000000..5746a5934bd31f --- /dev/null +++ b/src/transformers/models/moshi/modeling_moshi.py @@ -0,0 +1,2813 @@ +# coding=utf-8 +# Copyright 2024 Kyutai and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch Moshi model.""" + +import math +from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Tuple, Union + +import torch +import torch.nn as nn +import torch.utils.checkpoint +from torch.nn import CrossEntropyLoss + +from ...activations import ACT2FN +from ...cache_utils import Cache, DynamicCache, SlidingWindowCache, StaticCache +from ...generation import ( + GenerationConfig, + GenerationMixin, +) +from ...modeling_attn_mask_utils import AttentionMaskConverter +from ...modeling_outputs import ( + BaseModelOutputWithPast, + CausalLMOutputWithPast, + ModelOutput, + Seq2SeqLMOutput, +) +from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import ALL_LAYERNORM_LAYERS +from ...utils import ( + add_start_docstrings, + add_start_docstrings_to_model_forward, + is_flash_attn_2_available, + is_flash_attn_greater_or_equal_2_10, + is_torchdynamo_compiling, + logging, + replace_return_docstrings, +) +from ..auto.modeling_auto import AutoModel +from .configuration_moshi import MoshiConfig, MoshiDepthConfig + + +if is_flash_attn_2_available(): + from ...modeling_flash_attention_utils import _flash_attention_forward + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "MoshiConfig" +_CHECKPOINT_FOR_DOC = "kmhf/hf-moshiko" + + +@dataclass +class MoshiConditionalGenerationGenerateOutput(ModelOutput): + """ + Outputs of [`MoshiForConditionalConditionalGeneration.generate`]. 
+ + Args: + audio_sequences (`torch.LongTensor` of shape `(batch_size*num_return_sequences, 1, sequence_length)`, *optional*): + The generated audio waveforms. + sequences (`torch.LongTensor` of shape `(batch_size*num_return_sequences, sequence_length)`): + The generated text sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter + if all batches finished early due to the `eos_token_id`. + sequences_scores (`torch.FloatTensor` of shape `(batch_size*num_return_sequences)`, *optional*, returned when `output_scores=True`): + Final beam scores of the generated `sequences`. + scores (`tuple(torch.FloatTensor)` *optional*, returned when `output_scores=True`): + Beam transition scores for each vocabulary token at each generation step. Beam transition scores consisting + of log probabilities of tokens conditioned on log softmax of previously generated tokens in this beam. + Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for each generated token), + with each tensor of shape `(batch_size*num_beams, config.vocab_size)`. + logits (`tuple(torch.FloatTensor)` *optional*, returned when `output_logits=True`): + Unprocessed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax) + at each generation step. Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for + each generated token), with each tensor of shape `(batch_size, config.vocab_size)`. + beam_indices (`torch.LongTensor`, *optional*, returned when `output_scores=True`): + Beam indices of generated token id at each generation step. `torch.LongTensor` of shape + `(batch_size*num_return_sequences, sequence_length)`. + attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True`): + Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of + `torch.FloatTensor` of shape `(batch_size*num_beams, num_heads, generated_length, sequence_length)`. + hidden_states (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_hidden_states=True`): + Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of + `torch.FloatTensor` of shape `(batch_size*num_beams*num_return_sequences, generated_length, hidden_size)`. + past_key_values (`tuple(tuple(torch.FloatTensor)))`, *optional*, returned when `use_cache=True`): + Returns the model cache, used to speed up decoding. Different models have a different cache format, check + the model's documentation. Usually, a [`~cache_utils.Cache`] instance. + audio_codes (`torch.LongTensor` of shape `(batch_size*num_return_sequences, num_codeooks, sequence_length)`, *optional*): + The generated audio codes. Returned if `return_audio_codes=True`. Intermediate audio "tokens" which transforms to `audio_sequences` once passed through the audio decoder. 
+ """ + + audio_sequences: Optional[torch.Tensor] = None + sequences: torch.LongTensor = None + sequences_scores: Optional[torch.FloatTensor] = None + scores: Optional[Tuple[torch.FloatTensor]] = None + logits: Optional[Tuple[torch.FloatTensor]] = None + beam_indices: Optional[torch.LongTensor] = None + attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + hidden_states: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + past_key_values: Optional[Tuple[Tuple[Tuple[torch.FloatTensor]]]] = None + audio_codes: Optional[torch.LongTensor] = None + + +@dataclass +class MoshiCausalLMOutputWithPast(ModelOutput): + """ + `MoshiForCausalLM` outputs. + + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Language modeling loss (for next-token prediction). + logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) + + Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see + `past_key_values` input) to speed up sequential decoding. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[torch.FloatTensor] = None + logits: torch.FloatTensor = None + last_hidden_state: torch.FloatTensor = None + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None + attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + + +@dataclass +class MoshiConditionalGenerationOutputWithPast(ModelOutput): + """ + `MoshiForConditionalGeneration` outputs. + + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `text_labels` is provided): + Text language modeling loss (for next-token prediction). + logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the text language modeling head (scores for each vocabulary token before SoftMax). + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. 
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) + + Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see + `past_key_values` input) to speed up sequential decoding. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + depth_loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `audio_labels` is provided): + Audio language modeling loss (for next-token prediction). + audio_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the audio language modeling heads. + depth_past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Past key-values of the depth decoder. + depth_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Hidden states of the depth decoder + depth_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Depth decoder's Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[torch.FloatTensor] = None + logits: torch.FloatTensor = None + last_hidden_state: torch.FloatTensor = None + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None + attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + depth_loss: Optional[torch.FloatTensor] = None + audio_logits: torch.FloatTensor = None + depth_past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + depth_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None + depth_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + + +@dataclass +class MoshiUnconditionalInput(ModelOutput): + """ + Args: + input_ids (`torch.Tensor `of shape `(batch_size, sequence_length), *optional*): + The sequence used as a text prompt for the generation. + user_audio_codes (`torch.Tensor `of shape `(batch_size, num_codebooks, sequence_length), *optional*): + The audio codes used as audio user prompt for the generation. Has priority over `user_input_values` and represents the audio "tokens" of `user_input_values` once passed through the audio encoder. 
+ moshi_audio_codes (`torch.Tensor `of shape `(batch_size, num_codebooks, sequence_length), *optional*): + The audio codes used as audio Moshi prompt for the generation. Has priority over `moshi_input_values` and represents the audio "tokens" of `moshi_input_values` once passed through the audio encoder. + attention_mask (`torch.LongTensor`) of shape `(batch_size, sequence_length)`, *optional*): + Attention mask to avoid performing attention on padding token indices. Mask values selected in `[0, + 1]`: 1 for tokens that are **not masked**, 0 for tokens that are **masked**. + """ + + input_ids: torch.LongTensor = None + user_audio_codes: torch.Tensor = None + moshi_audio_codes: torch.Tensor = None + attention_mask: torch.LongTensor = None + + +# Copied from transformers.models.gemma.modeling_gemma.GemmaRMSNorm with Gemma->Moshi +class MoshiRMSNorm(nn.Module): + def __init__(self, dim: int, eps: float = 1e-6): + super().__init__() + self.eps = eps + self.weight = nn.Parameter(torch.ones(dim)) # Ignore copy + + def _norm(self, x): + return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps) + + # Ignore copy + def forward(self, x): + output = self._norm(x.float()) + output = output * self.weight.float() + return output.type_as(x) + + def extra_repr(self): + return f"{tuple(self.weight.shape)}, eps={self.eps}" + + +ALL_LAYERNORM_LAYERS.append(MoshiRMSNorm) + + +class MoshiFlexibleLinear(nn.Module): + def __init__(self, input_size, output_size, num_layers): + super().__init__() + # Stack the weights for N layers into a single tensor (num_layers, output_size, input_size) + self.weight = nn.Parameter(torch.randn(num_layers, output_size, input_size)) + + def forward(self, x, layer_idx=None): + """ + `MoshiFlexibleLinear` creates one linear layer per codebook. There's multiple ways to use it. + In the default case, `sequence_length=num_layers`, so each element of the sequence will be matmul to the weights corresponding to its index on the sequence. + + For more advanced cases, one can specify which codebook's layer(s) to use with `layer_idx`. + If `layer_idx` indicates a single integer, all of the element of the sequence will be matmul to this single codebook's layer. + But if `layer_idx` is a tensor of shape `(seq_length,)`, it will matmul each i-th element of the input sequence to the corresponding layer `weight[i]`. + + + Args: + x (`torch.FloatTensor): input to the layer of shape `(batch, num_layers, embed_dim)` or of shape `(batch, seq_length, embed_dim)` + layer_idx (`torch.Tensor`, *optional*): + Can be used to specify which codebook's layers(s) to use. + If it's a tensor of shape `(seq_length,)`, will matmul each element of the sequence to the corresponding weights. + But if `layer_idx` is a tensor of shape `(seq_length,)`, it will matmul each i-th element of the input sequence to the corresponding layer `weight[i]`. 
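+
+        For instance, with `layer_idx=None` and `x` of shape `(batch, num_layers, embed_dim)`, the i-th
+        position of the sequence is multiplied by its own `weight[i]`, so the output has shape
+        `(batch, num_layers, output_size)`.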
+ """ + + # Use torch.gather to select the corresponding weights for each sample + # (codebooks, output_size, hidden_size) + selected_weights = torch.index_select(self.weight, 0, layer_idx) if layer_idx is not None else self.weight + + # (1, codebooks, hidden_size, output_size) + selected_weights = selected_weights.transpose(1, 2)[None, :, :, :] + + # (batch_size, codebooks, 1, hidden_size) x (1, codebooks, hidden_size, output_size) + # -> (batch_size, codebooks, 1, output_size) + x = torch.matmul(x[:, :, None, :], selected_weights) + + # (batch_size, codebooks, output_size) + return x.squeeze(2) + + +class MoshiLinear(nn.Module): + def __init__(self, input_dim, output_dim, num_codebooks, use_flexible_linear=False): + super().__init__() + + self.use_flexible_linear = use_flexible_linear + + if not use_flexible_linear: + self.linear = nn.Linear(input_dim, output_dim, bias=False) + else: + self.linear = MoshiFlexibleLinear(input_dim, output_dim, num_layers=num_codebooks) + + def forward(self, x, layer_idx=None): + if self.use_flexible_linear: + return self.linear(x, layer_idx) + else: + return self.linear(x) + + +# Copied from transformers.models.mistral.modeling_mistral.MistralRotaryEmbedding with Mistral->Moshi +class MoshiRotaryEmbedding(nn.Module): + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): + super().__init__() + + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim)) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + @torch.no_grad() + # copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding.forward + # TODO(joao): add me back asap :) + def forward(self, x, position_ids): + # x: [bs, num_attention_heads, seq_len, head_size] + inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1) + position_ids_expanded = position_ids[:, None, :].float() + # Force float32 since bfloat16 loses precision on long contexts + # See https://github.com/huggingface/transformers/pull/29285 + device_type = x.device.type + device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu" + with torch.autocast(device_type=device_type, enabled=False): + freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + cos = emb.cos() + sin = emb.sin() + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) + + +# Copied from transformers.models.llama.modeling_llama.rotate_half +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +# Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb +def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`, *optional*): + Deprecated and unused. 
+ unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. + """ + cos = cos.unsqueeze(unsqueeze_dim) + sin = sin.unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +class MoshiGatingMLP(nn.Module): + def __init__(self, config, use_flexible_linear=False): + super().__init__() + + self.activation_fn = ACT2FN[config.hidden_act] + ffn_dim = config.ffn_dim + hidden_size = config.hidden_size + num_layers = config.num_codebooks if use_flexible_linear else 1 + if num_layers == 1: + self.fc1 = nn.Linear(hidden_size, ffn_dim, bias=False) + self.fc2 = nn.Linear(ffn_dim // 2, hidden_size, bias=False) + else: + self.fc1 = MoshiFlexibleLinear(hidden_size, ffn_dim, num_layers) + self.fc2 = MoshiFlexibleLinear(ffn_dim // 2, hidden_size, num_layers) + + def forward(self, hidden_states: torch.Tensor, layer_idx: int = None) -> torch.Tensor: + hidden_states = self.fc1(hidden_states) if layer_idx is None else self.fc1(hidden_states, layer_idx) + + batch_size, sequence_length, _ = hidden_states.shape + hidden_states = hidden_states.view(batch_size, sequence_length, 2, -1) + hidden_states = self.activation_fn(hidden_states[..., 0, :]) * hidden_states[..., 1, :] + hidden_states = self.fc2(hidden_states) if layer_idx is None else self.fc2(hidden_states, layer_idx) + return hidden_states + + +# Copied from transformers.models.llama.modeling_llama.repeat_kv +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +class MoshiAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config: MoshiConfig, layer_idx: Optional[int] = None, use_flexible_linear=False, use_rope=True): + super().__init__() + self.config = config + self.layer_idx = layer_idx + if layer_idx is None: + logger.warning_once( + f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will " + "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` " + "when creating this class." 
+ ) + + self.attention_dropout = config.attention_dropout + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = config.head_dim + self.num_key_value_heads = config.num_key_value_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.max_position_embeddings = config.max_position_embeddings + self.is_causal = True + self.scaling = 1 / math.sqrt(self.head_dim) + + if self.hidden_size % self.num_heads != 0: + raise ValueError( + f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" + f" and `num_heads`: {self.num_heads})." + ) + + self.q_proj = MoshiLinear( + self.hidden_size, self.num_heads * self.head_dim, config.num_codebooks, use_flexible_linear + ) + self.k_proj = MoshiLinear( + self.hidden_size, self.num_key_value_heads * self.head_dim, config.num_codebooks, use_flexible_linear + ) + self.v_proj = MoshiLinear( + self.hidden_size, self.num_key_value_heads * self.head_dim, config.num_codebooks, use_flexible_linear + ) + self.o_proj = MoshiLinear( + self.num_heads * self.head_dim, self.hidden_size, config.num_codebooks, use_flexible_linear + ) + + # rotary embeddings are not used in the depth decoder + self.rotary_emb = None + if use_rope: + self.rope_theta = config.rope_theta + self.rotary_emb = MoshiRotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.rope_theta, + ) + + # Copied from transformers.models.gemma.modeling_gemma.GemmaAttention.forward + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + cache_position: Optional[torch.LongTensor] = None, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states, cache_position) # Ignore copy + key_states = self.k_proj(hidden_states, cache_position) # Ignore copy + value_states = self.v_proj(hidden_states, cache_position) # Ignore copy + + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + if self.rotary_emb is not None: # Ignore copy + cos, sin = self.rotary_emb(value_states, position_ids) # Ignore copy + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) # Ignore copy + + if past_key_value is not None: + # sin and cos are specific to RoPE models; cache_position needed for the static cache + cache_kwargs = ( + {"sin": sin, "cos": cos, "cache_position": cache_position} + if self.rotary_emb is not None + else {"cache_position": cache_position} + ) # Ignore copy + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) * self.scaling + + if attention_mask is not None: # no matter the length, we just slice it + causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] + attn_weights = attn_weights + causal_mask + + # upcast attention to fp32 + 
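+        # (computing the softmax in float32 keeps it numerically stable when the model runs in
+        # float16/bfloat16; the result is cast back to the query dtype right after)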
attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) + attn_output = torch.matmul(attn_weights, value_states) + + if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + + attn_output = attn_output.view(bsz, q_len, -1) + attn_output = self.o_proj(attn_output, cache_position) # Ignore copy + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + +# Copied from transformers.models.gemma.modeling_gemma.GemmaFlashAttention2 with Gemma->Moshi +class MoshiFlashAttention2(MoshiAttention): + """ + Moshi flash attention module. This module inherits from `MoshiAttention` as the weights of the module stays + untouched. The only required change would be on the forward pass where it needs to correctly call the public API of + flash attention and deal with padding tokens in case the input contains any of them. + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. + # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. + # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). 
+ self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + cache_position: Optional[torch.LongTensor] = None, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if isinstance(past_key_value, StaticCache): + raise ValueError( + "`static` cache implementation is not compatible with `attn_implementation==flash_attention_2` " + "make sure to use `sdpa` in the mean time, and open an issue at https://github.com/huggingface/transformers" + ) + + output_attentions = False + + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states, cache_position) # Ignore copy + key_states = self.k_proj(hidden_states, cache_position) # Ignore copy + value_states = self.v_proj(hidden_states, cache_position) # Ignore copy + + # Flash attention requires the input to have the shape + # batch_size x seq_length x head_dim x hidden_dim + # therefore we just need to keep the original shape + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + if self.rotary_emb is not None: # Ignore copy + cos, sin = self.rotary_emb(value_states, position_ids) # Ignore copy + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) # Ignore copy + + if past_key_value is not None: + # sin and cos are specific to RoPE models; cache_position needed for the static cache + cache_kwargs = ( + {"sin": sin, "cos": cos, "cache_position": cache_position} + if self.rotary_emb is not None + else {"cache_position": cache_position} + ) # Ignore copy + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache + # to be able to avoid many of these transpose/reshape/view. + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + dropout_rate = self.attention_dropout if self.training else 0.0 + + # In PEFT, usually we cast the layer norms in float32 for training stability reasons + # therefore the input hidden states gets silently casted in float32. Hence, we need + # cast them back in the correct dtype just to be sure everything works as expected. + # This might slowdown training & inference so it is recommended to not cast the LayerNorms + # in fp32. (MoshiRMSNorm handles it correctly) + + input_dtype = query_states.dtype + if input_dtype == torch.float32: + if torch.is_autocast_enabled(): + target_dtype = torch.get_autocast_gpu_dtype() + # Handle the case where the model is quantized + elif hasattr(self.config, "_pre_quantization_dtype"): + target_dtype = self.config._pre_quantization_dtype + else: + target_dtype = self.q_proj.weight.dtype + + logger.warning_once( + f"The input hidden states seems to be silently casted in float32, this might be related to" + f" the fact you have upcasted embedding or layer norm layers in float32. 
We will cast back the input in" + f" {target_dtype}." + ) + + query_states = query_states.to(target_dtype) + key_states = key_states.to(target_dtype) + value_states = value_states.to(target_dtype) + + attn_output = _flash_attention_forward( + query_states, + key_states, + value_states, + attention_mask, + q_len, + position_ids=position_ids, + dropout=dropout_rate, + sliding_window=getattr(self, "sliding_window", None), + is_causal=self.is_causal, + use_top_left_mask=self._flash_attn_uses_top_left_mask, + ) + + attn_output = attn_output.reshape(bsz, q_len, -1).contiguous() + attn_output = self.o_proj(attn_output, cache_position) # Ignore copy + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + +# Copied from transformers.models.gemma.modeling_gemma.GemmaSdpaAttention with Gemma->Moshi +class MoshiSdpaAttention(MoshiAttention): + """ + Moshi attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from + `MoshiAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to + SDPA API. + """ + + # Adapted from MoshiAttention.forward + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + cache_position: Optional[torch.LongTensor] = None, + **kwargs, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if output_attentions: + # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. + logger.warning_once( + "MoshiModel is using MoshiSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " + 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' 
+ ) + return super().forward( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + ) + + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states, cache_position) # Ignore copy + key_states = self.k_proj(hidden_states, cache_position) # Ignore copy + value_states = self.v_proj(hidden_states, cache_position) # Ignore copy + + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + if self.rotary_emb is not None: # Ignore copy + cos, sin = self.rotary_emb(value_states, position_ids) # Ignore copy + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) # Ignore copy + + if past_key_value is not None: + # sin and cos are specific to RoPE models; cache_position needed for the static cache + cache_kwargs = ( + {"sin": sin, "cos": cos, "cache_position": cache_position} + if self.rotary_emb is not None + else {"cache_position": cache_position} + ) # Ignore copy + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + causal_mask = attention_mask + if attention_mask is not None: + causal_mask = causal_mask[:, :, :, : key_states.shape[-2]] + + # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, + # Reference: https://github.com/pytorch/pytorch/issues/112577. + if query_states.device.type == "cuda" and causal_mask is not None: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + + # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment + # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling. 
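+        # (single-token decoding needs no causal mask, hence the q_len > 1 requirement below)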
+ is_causal = True if causal_mask is None and q_len > 1 else False + + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=causal_mask, + dropout_p=self.attention_dropout if self.training else 0.0, + is_causal=is_causal, + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.view(bsz, q_len, -1) + + attn_output = self.o_proj(attn_output, cache_position) # Ignore copy + + return attn_output, None, past_key_value + + +MOSHI_ATTENTION_CLASSES = { + "eager": MoshiAttention, + "flash_attention_2": MoshiFlashAttention2, + "sdpa": MoshiSdpaAttention, +} + + +class MoshiDecoderLayer(nn.Module): + def __init__(self, config: MoshiConfig, layer_idx: int, use_flexible_linear: bool, use_rope=True): + super().__init__() + self.hidden_size = config.hidden_size + self.use_flexible_linear = use_flexible_linear + + self.self_attn = MOSHI_ATTENTION_CLASSES[config._attn_implementation]( + config=config, layer_idx=layer_idx, use_flexible_linear=use_flexible_linear, use_rope=use_rope + ) + + self.mlp = MoshiGatingMLP(config, use_flexible_linear) + self.input_layernorm = MoshiRMSNorm(self.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = MoshiRMSNorm(self.hidden_size, eps=config.rms_norm_eps) + self.sliding_window = config.sliding_window + + self._attn_implementation = config._attn_implementation + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + cache_position: Optional[torch.LongTensor] = None, + **kwargs, + ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`, *optional*): + attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1, + query_sequence_length, key_sequence_length)` if default attention is used. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). 
+ past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states + cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): + Indices depicting the position of the input sequence tokens in the sequence + kwargs (`dict`, *optional*): + Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code + into the model + """ + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + **kwargs, + ) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = ( + self.mlp(hidden_states) if not self.use_flexible_linear else self.mlp(hidden_states, cache_position) + ) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +class MoshiPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = MoshiConfig + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["MoshiDecoderLayer", "MimiTransformerLayer"] + _supports_flash_attn_2 = True + _supports_sdpa = True + _supports_cache_class = True + main_input_name = "input_ids" + + def _init_weights(self, module): + std = self.config.initializer_range + if isinstance(module, (nn.Linear, nn.Conv1d)): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + elif isinstance(module, nn.Conv1d): + nn.init.kaiming_normal_(module.weight) + if module.bias is not None: + k = math.sqrt(module.groups / (module.in_channels * module.kernel_size[0])) + nn.init.uniform_(module.bias, a=-k, b=k) + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + +MOSHI_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`MoshiConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + + +MOSHI_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence text tokens in the vocabulary. 
Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + + user_input_values (`torch.Tensor `of shape `(batch_size, 1, audio_sequence_length), *optional*): + The audio waveforms used as audio user prompt for the generation. + user_audio_codes (`torch.Tensor `of shape `(batch_size, num_codebooks, sequence_length), *optional*): + The audio codes used as audio user prompt for the generation. Has priority over `user_input_values` and represents the audio "tokens" of `user_input_values` once passed through the audio encoder. + moshi_input_values (`torch.Tensor `of shape `(batch_size, 1, audio_sequence_length), *optional*): + The audio waveforms used as audio Moshi prompt for the generation. + moshi_audio_codes (`torch.Tensor `of shape `(batch_size, num_codebooks, sequence_length), *optional*): + The audio codes used as audio Moshi prompt for the generation. Has priority over `moshi_input_values` and represents the audio "tokens" of `moshi_input_values` once passed through the audio encoder. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded + representation. If `past_key_values` is used, optionally only the last `inputs_embeds` have to be + input (see `past_key_values`). This is useful if you want more control over how to convert + `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. + + If `input_ids` and `inputs_embeds` are both unset, `inputs_embeds` takes the value + of `inputs_embeds`. + past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*): + Pre-computed hidden-states (key and values in the self-attention blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values` + returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. + + Two formats are allowed: + - a [`~cache_utils.Cache`] instance; + - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of + shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy + cache format. + + The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the + legacy cache format will be returned. + text_labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for text language modeling. Note that the labels **are shifted** inside the model, i.e. you can set + `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100` + are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]` + audio_labels (`torch.LongTensor` of shape `(batch_size, num_codebooks, sequence_length)`, *optional*): + Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. 
you can set + `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100` + are ignored (masked), the loss is only computed for labels in `[0, ..., config.audio_vocab_size]` + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + +MOSHI_DECODER_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + If `past_key_values` is used, optionally only the last `input_ids` have to be input (see + `past_key_values`). + + If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] + and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more + information on the default strategy. + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.n_positions - 1]`. + + [What are position IDs?](../glossary#position-ids) + past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*): + Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values` + returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. + + Two formats are allowed: + - a [`~cache_utils.Cache`] instance, see our + [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache); + - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of + shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy + cache format. + + The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the + legacy cache format will be returned. 
+ + If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't + have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` + of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): + Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`, + this tensor is not affected by padding. It is used to update the cache in the correct position and to infer + the complete sequence length. +""" + + +class MoshiDepthDecoder(MoshiPreTrainedModel, GenerationMixin): + """ + Transformer depth decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MoshiTransformerLayer`] + + Args: + config: MoshiConfig + """ + + config_class = MoshiDepthConfig + + def __init__(self, config: MoshiDepthConfig): + super().__init__(config) + + self.text_embed_tokens = nn.Embedding(config.vocab_size + 1, config.hidden_size) + + # the last codebook is never used as input + self.embed_tokens = nn.ModuleList( + [nn.Embedding(config.audio_vocab_size + 1, config.hidden_size) for _ in range(config.num_codebooks - 1)] + ) + + self.input_projections = MoshiFlexibleLinear(config.input_size, config.hidden_size, config.num_codebooks) + + self.layers = nn.ModuleList( + [ + MoshiDecoderLayer(config, layer_idx, use_flexible_linear=True, use_rope=False) + for layer_idx in range(config.num_hidden_layers) + ] + ) + + self.lm_heads = MoshiFlexibleLinear(config.hidden_size, config.audio_vocab_size, config.num_codebooks) + self._attn_implementation = config._attn_implementation + self.gradient_checkpointing = False + self.config = config + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + last_hidden_state: torch.LongTensor = None, + attention_mask: Optional[torch.BoolTensor] = None, + past_key_values: Tuple[Tuple[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + position_ids: Optional[torch.LongTensor] = None, + labels: Optional[torch.LongTensor] = None, + cache_position: Optional[torch.LongTensor] = None, + ) -> Union[Tuple, BaseModelOutputWithPast]: + """ + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens. 
The first element of the sequence must be the text token associated with the audio codebooks.
+                The rest of the elements must be flattened audio codebooks. The `cache_position` argument can be used to indicate which index is associated with each token.
+            last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+                Sequence of hidden-states at the output of the last layer of the main decoder. Used to contextualize `input_ids`.
+            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+
+                [What are attention masks?](../glossary#attention-mask)
+
+                Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+                [`PreTrainedTokenizer.__call__`] for details.
+
+                If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
+                `past_key_values`).
+
+                If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
+                and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
+                information on the default strategy.
+
+                - 1 indicates the head is **not masked**,
+                - 0 indicates the head is **masked**.
+            past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
+                Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
+                blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
+                returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
+
+                Two formats are allowed:
+                - a [`~cache_utils.Cache`] instance;
+                - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
+                shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
+                cache format.
+
+                The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
+                legacy cache format will be returned.
+
+                If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
+                have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
+                of shape `(batch_size, sequence_length)`.
+            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+                is useful if you want more control over how to convert the inputs into associated vectors than the
+                model's internal embedding lookup matrix.
+            use_cache (`bool`, *optional*):
+                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+                `past_key_values`).
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+                tensors for more detail.
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+                more detail.
+            return_dict (`bool`, *optional*):
+                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.n_positions - 1]`. + + [What are position IDs?](../glossary#position-ids) + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + cache_position (`torch.Tensor`): + Indices depicting the position of the input sequence tokens in the sequence. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if self.gradient_checkpointing and self.training and use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`." + ) + use_cache = False + + if use_cache and past_key_values is None and not self.training: + past_key_values = DynamicCache.from_legacy_cache(past_key_values) + + past_seen_tokens = 0 if past_key_values is None else past_key_values.get_seq_length() + if cache_position is None: + cache_position = torch.arange( + past_seen_tokens, past_seen_tokens + input_ids.shape[1], device=input_ids.device + ) + + if position_ids is None: + position_ids = cache_position.unsqueeze(0) + + # If inputs_embeds is provided, it has the priority over input_ids, which won't be used + if inputs_embeds is None: + inputs_embeds = [] + for position_idx in cache_position: + position_idx = position_idx.item() + if position_idx == 0: + inputs_embeds.append(self.text_embed_tokens(input_ids[:, [position_idx]])) + else: + inputs_embeds.append( + self.embed_tokens[(position_idx - 1)](input_ids[:, [position_idx - past_seen_tokens]]) + ) + + inputs_embeds = torch.cat(inputs_embeds, dim=1) + + inputs_embeds += self.input_projections(last_hidden_state, cache_position) + + causal_mask = None + if attention_mask is not None: + causal_mask = self._update_causal_mask( + attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions + ) + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = None + hidden_states = inputs_embeds + for decoder_layer in self.layers: + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + decoder_layer.__call__, + hidden_states, + causal_mask, + position_ids, + past_key_values, + output_attentions, + use_cache, + cache_position, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=causal_mask, + position_ids=position_ids, + past_key_value=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + ) + + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache = layer_outputs[2 if output_attentions else 1] + + if output_attentions: 
+ all_self_attns += (layer_outputs[1],) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = next_decoder_cache if use_cache else None + logits = self.lm_heads(hidden_states, cache_position) + + loss = None + if labels is not None: + # Upcast to float if we need to compute the loss to avoid potential precision issues + logits = logits.float() + loss_fct = CrossEntropyLoss() + + labels = labels.masked_fill(labels == self.config.audio_vocab_size, -100).reshape(-1) + # Enable model parallelism + labels = labels.to(logits.device) + loss = loss_fct(logits.reshape(-1, self.config.audio_vocab_size), labels) + + if not return_dict: + return tuple(v for v in [loss, logits, next_cache, all_hidden_states, all_self_attns] if v is not None) + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + # Copied from transformers.models.phi3.modeling_phi3.Phi3Model._update_causal_mask + def _update_causal_mask( + self, + attention_mask: torch.Tensor, + input_tensor: torch.Tensor, + cache_position: torch.Tensor, + past_key_values: Cache, + output_attentions: bool, + ): + if self.config._attn_implementation == "flash_attention_2": + if attention_mask is not None and 0.0 in attention_mask: + return attention_mask + return None + + # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in + # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail + # to infer the attention mask. + past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + using_static_cache = isinstance(past_key_values, StaticCache) + using_sliding_window_cache = isinstance(past_key_values, SlidingWindowCache) + + # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward + if ( + self.config._attn_implementation == "sdpa" + and not (using_static_cache or using_sliding_window_cache) + and not output_attentions + ): + if AttentionMaskConverter._ignore_causal_mask_sdpa( + attention_mask, + inputs_embeds=input_tensor, + past_key_values_length=past_seen_tokens, + sliding_window=self.config.sliding_window, + is_training=self.training, + ): + return None + + dtype, device = input_tensor.dtype, input_tensor.device + min_dtype = torch.finfo(dtype).min + sequence_length = input_tensor.shape[1] + # SlidingWindowCache or StaticCache + if using_sliding_window_cache or using_static_cache: + target_length = past_key_values.get_max_cache_shape() + # DynamicCache or no cache + else: + target_length = ( + attention_mask.shape[-1] + if isinstance(attention_mask, torch.Tensor) + else past_seen_tokens + sequence_length + 1 + ) + + # In case the provided `attention` mask is 2D, we generate a causal mask here (4D). 
+ causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position( + attention_mask, + sequence_length=sequence_length, + target_length=target_length, + dtype=dtype, + device=device, + cache_position=cache_position, + batch_size=input_tensor.shape[0], + config=self.config, + past_key_values=past_key_values, + ) + + if ( + self.config._attn_implementation == "sdpa" + and attention_mask is not None + and attention_mask.device.type == "cuda" + and not output_attentions + ): + # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when + # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path. + # Details: https://github.com/pytorch/pytorch/issues/110213 + causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype) + + return causal_mask + + @staticmethod + # Copied from transformers.models.mistral.modeling_mistral.MistralModel._prepare_4d_causal_attention_mask_with_cache_position with Mistral->MoshiDepth + def _prepare_4d_causal_attention_mask_with_cache_position( + attention_mask: torch.Tensor, + sequence_length: int, + target_length: int, + dtype: torch.dtype, + device: torch.device, + cache_position: torch.Tensor, + batch_size: int, + config: MoshiDepthConfig, + past_key_values: Cache, + ): + """ + Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape + `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing. + + Args: + attention_mask (`torch.Tensor`): + A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape `(batch_size, 1, query_length, key_value_length)`. + sequence_length (`int`): + The sequence length being processed. + target_length (`int`): + The target length: when generating with static cache, the mask should be as long as the static cache, to account for the 0 padding, the part of the cache that is not filled yet. + dtype (`torch.dtype`): + The dtype to use for the 4D attention mask. + device (`torch.device`): + The device to plcae the 4D attention mask on. + cache_position (`torch.Tensor`): + Indices depicting the position of the input sequence tokens in the sequence. + batch_size (`torch.Tensor`): + Batch size. + config (`MoshiDepthConfig`): + The model's configuration class + past_key_values (`Cache`): + The cache class that is being used currently to generate + """ + if attention_mask is not None and attention_mask.dim() == 4: + # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing. 
+            causal_mask = attention_mask
+        else:
+            min_dtype = torch.finfo(dtype).min
+            causal_mask = torch.full(
+                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device
+            )
+            diagonal_attend_mask = torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
+            if config.sliding_window is not None:
+                # if we have sliding window, we should not attend to tokens beyond sliding window length, so we mask them out also
+                # the check is needed to verify if the current checkpoint was trained with sliding window or not
+                if not isinstance(past_key_values, SlidingWindowCache) or sequence_length > target_length:
+                    sliding_attend_mask = torch.arange(target_length, device=device) <= (
+                        cache_position.reshape(-1, 1) - config.sliding_window
+                    )
+                    diagonal_attend_mask |= sliding_attend_mask
+            causal_mask *= diagonal_attend_mask
+            causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
+            if attention_mask is not None:
+                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
+                if attention_mask.shape[-1] > target_length:
+                    attention_mask = attention_mask[:, :target_length]
+                mask_length = attention_mask.shape[-1]
+                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]
+                padding_mask = padding_mask == 0
+                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
+                    padding_mask, min_dtype
+                )
+        return causal_mask
+
+    def prepare_inputs_for_generation(
+        self,
+        input_ids: torch.LongTensor,
+        past_key_values: Optional[Cache] = None,
+        attention_mask: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        use_cache: bool = True,
+        num_logits_to_keep: Optional[int] = None,
+        **kwargs,
+    ):
+        """
+        Prepare the model inputs for generation. It includes operations like computing the 4D attention mask or
+        slicing inputs given the existing cache.
+        See the documentation in the used model for the arguments (different models might have different requirements
+        for e.g. `past_key_values`). Should work as is for most LLMs.
+        """
+        # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens
+        # Exception 1: when passing input_embeds, input_ids may be missing entries
+        # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here
+        if past_key_values is not None:
+            if inputs_embeds is not None:  # Exception 1
+                input_ids = input_ids[:, -cache_position.shape[0] :]
+            elif input_ids.shape[1] != cache_position.shape[0]:  # Default case (the "else", a no op, is Exception 2)
+                input_ids = input_ids[:, cache_position]
+
+        if attention_mask is not None and position_ids is None:
+            # create position_ids on the fly for batch generation
+            position_ids = attention_mask.long().cumsum(-1) - 1
+            position_ids.masked_fill_(attention_mask == 0, 1)
+            if past_key_values:
+                position_ids = position_ids[:, -input_ids.shape[1] :]
+
+                # This `clone` call is needed to avoid recapturing cuda graphs with `torch.compile`'s
+                # `mode="reduce-overhead"`, as otherwise the input `position_ids` would have various stride during the
+                # decoding. Here, simply using `.contiguous()` is not sufficient as in the batch size = 1 case,
+                # `position_ids` is already contiguous but with varying stride which retriggers a capture.
+ position_ids = position_ids.clone(memory_format=torch.contiguous_format) + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and cache_position[0] == 0: + model_inputs = {"inputs_embeds": inputs_embeds, "input_ids": None} + else: + # The clone here is for the same reason as for `position_ids`. + model_inputs = {"input_ids": input_ids.clone(memory_format=torch.contiguous_format), "inputs_embeds": None} + + if isinstance(past_key_values, StaticCache) and attention_mask.ndim == 2: + if model_inputs["inputs_embeds"] is not None: + batch_size, sequence_length, _ = model_inputs["inputs_embeds"].shape + device = model_inputs["inputs_embeds"].device + else: + batch_size, sequence_length = model_inputs["input_ids"].shape + device = model_inputs["input_ids"].device + + # Create the causal mask with fixed shape in advance, to reduce recompilations. If the function to create + # the 4D causal mask exists, it should be present in the base model (XXXModel class). + attention_mask = self._prepare_4d_causal_attention_mask_with_cache_position( + attention_mask, + sequence_length=sequence_length, + target_length=past_key_values.max_cache_len, + dtype=self.text_embed_tokens.weight.dtype, + device=device, + cache_position=cache_position, + batch_size=batch_size, + config=self.config, + past_key_values=past_key_values, + ) + + if num_logits_to_keep is not None: + model_inputs["num_logits_to_keep"] = num_logits_to_keep + + model_inputs.update( + { + "position_ids": position_ids, + "cache_position": cache_position, + "past_key_values": past_key_values, + "use_cache": use_cache, + "attention_mask": attention_mask, + "last_hidden_state": kwargs.get("last_hidden_state"), + } + ) + return model_inputs + + +@add_start_docstrings( + "The bare Moshi Model outputting raw hidden-states without any specific head on top.", + MOSHI_START_DOCSTRING, +) +class MoshiModel(MoshiPreTrainedModel): + """ + Transformer decoder consisting of *config.num_hidden_layers* layers. 
Each layer is a [`MoshiDecoderLayer`] + + Args: + config: MoshiConfig + """ + + def __init__(self, config: MoshiConfig): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = nn.Embedding(config.vocab_size + 1, config.hidden_size, self.padding_idx) + self.layers = nn.ModuleList( + [ + MoshiDecoderLayer(config, layer_idx, use_flexible_linear=False) + for layer_idx in range(config.num_hidden_layers) + ] + ) + self.norm = MoshiRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.gradient_checkpointing = False + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + @add_start_docstrings_to_model_forward(MOSHI_DECODER_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + ) -> Union[Tuple, BaseModelOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if self.gradient_checkpointing and self.training and use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`." + ) + use_cache = False + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + return_legacy_cache = False # noqa: F841 + if ( + use_cache and not isinstance(past_key_values, Cache) and not self.training + ): # kept for BC (non `Cache` `past_key_values` inputs) + return_legacy_cache = True # noqa: F841 + past_key_values = DynamicCache.from_legacy_cache(past_key_values) + + if cache_position is None: + past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + cache_position = torch.arange( + past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device + ) + + if position_ids is None: + position_ids = cache_position.unsqueeze(0) + + causal_mask = None + if attention_mask is not None: + causal_mask = self._update_causal_mask( + attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions + ) + + # embed positions + hidden_states = inputs_embeds + + if ( + use_cache and not isinstance(past_key_values, Cache) and not self.training + ): # kept for BC (non `Cache` `past_key_values` inputs) + return_legacy_cache = True + past_key_values = DynamicCache.from_legacy_cache(past_key_values) + logger.warning_once( + "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. 
" + "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)" + ) + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = None + + for decoder_layer in self.layers: + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + decoder_layer.__call__, + hidden_states, + causal_mask, + position_ids, + past_key_values, + output_attentions, + use_cache, + cache_position, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=causal_mask, + position_ids=position_ids, + past_key_value=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + ) + + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache = layer_outputs[2 if output_attentions else 1] + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = next_decoder_cache if use_cache else None + if return_legacy_cache: + next_cache = next_cache.to_legacy_cache() + + if not return_dict: + return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + # Copied from transformers.models.phi3.modeling_phi3.Phi3Model._update_causal_mask + def _update_causal_mask( + self, + attention_mask: torch.Tensor, + input_tensor: torch.Tensor, + cache_position: torch.Tensor, + past_key_values: Cache, + output_attentions: bool, + ): + if self.config._attn_implementation == "flash_attention_2": + if attention_mask is not None and 0.0 in attention_mask: + return attention_mask + return None + + # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in + # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail + # to infer the attention mask. 
+ past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + using_static_cache = isinstance(past_key_values, StaticCache) + using_sliding_window_cache = isinstance(past_key_values, SlidingWindowCache) + + # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward + if ( + self.config._attn_implementation == "sdpa" + and not (using_static_cache or using_sliding_window_cache) + and not output_attentions + ): + if AttentionMaskConverter._ignore_causal_mask_sdpa( + attention_mask, + inputs_embeds=input_tensor, + past_key_values_length=past_seen_tokens, + sliding_window=self.config.sliding_window, + is_training=self.training, + ): + return None + + dtype, device = input_tensor.dtype, input_tensor.device + min_dtype = torch.finfo(dtype).min + sequence_length = input_tensor.shape[1] + # SlidingWindowCache or StaticCache + if using_sliding_window_cache or using_static_cache: + target_length = past_key_values.get_max_cache_shape() + # DynamicCache or no cache + else: + target_length = ( + attention_mask.shape[-1] + if isinstance(attention_mask, torch.Tensor) + else past_seen_tokens + sequence_length + 1 + ) + + # In case the provided `attention` mask is 2D, we generate a causal mask here (4D). + causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position( + attention_mask, + sequence_length=sequence_length, + target_length=target_length, + dtype=dtype, + device=device, + cache_position=cache_position, + batch_size=input_tensor.shape[0], + config=self.config, + past_key_values=past_key_values, + ) + + if ( + self.config._attn_implementation == "sdpa" + and attention_mask is not None + and attention_mask.device.type == "cuda" + and not output_attentions + ): + # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when + # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path. + # Details: https://github.com/pytorch/pytorch/issues/110213 + causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype) + + return causal_mask + + @staticmethod + # Copied from transformers.models.mistral.modeling_mistral.MistralModel._prepare_4d_causal_attention_mask_with_cache_position with Mistral->Moshi + def _prepare_4d_causal_attention_mask_with_cache_position( + attention_mask: torch.Tensor, + sequence_length: int, + target_length: int, + dtype: torch.dtype, + device: torch.device, + cache_position: torch.Tensor, + batch_size: int, + config: MoshiConfig, + past_key_values: Cache, + ): + """ + Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape + `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing. + + Args: + attention_mask (`torch.Tensor`): + A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape `(batch_size, 1, query_length, key_value_length)`. + sequence_length (`int`): + The sequence length being processed. + target_length (`int`): + The target length: when generating with static cache, the mask should be as long as the static cache, to account for the 0 padding, the part of the cache that is not filled yet. + dtype (`torch.dtype`): + The dtype to use for the 4D attention mask. + device (`torch.device`): + The device to plcae the 4D attention mask on. + cache_position (`torch.Tensor`): + Indices depicting the position of the input sequence tokens in the sequence. 
+ batch_size (`torch.Tensor`): + Batch size. + config (`MoshiConfig`): + The model's configuration class + past_key_values (`Cache`): + The cache class that is being used currently to generate + """ + if attention_mask is not None and attention_mask.dim() == 4: + # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing. + causal_mask = attention_mask + else: + min_dtype = torch.finfo(dtype).min + causal_mask = torch.full( + (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device + ) + diagonal_attend_mask = torch.arange(target_length, device=device) > cache_position.reshape(-1, 1) + if config.sliding_window is not None: + # if we have sliding window, we should not attend to tokens beyond sliding window length, so we mask them out also + # the check is needed to verify is current checkpoint was trained with sliding window or not + if not isinstance(past_key_values, SlidingWindowCache) or sequence_length > target_length: + sliding_attend_mask = torch.arange(target_length, device=device) <= ( + cache_position.reshape(-1, 1) - config.sliding_window + ) + diagonal_attend_mask |= sliding_attend_mask + causal_mask *= diagonal_attend_mask + causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1) + if attention_mask is not None: + causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit + if attention_mask.shape[-1] > target_length: + attention_mask = attention_mask[:, :target_length] + mask_length = attention_mask.shape[-1] + padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :] + padding_mask = padding_mask == 0 + causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill( + padding_mask, min_dtype + ) + return causal_mask + + +@add_start_docstrings( + "The Moshi decoder model with a text language modelling head on top. 
Only usable for text.", + MOSHI_START_DOCSTRING, +) +class MoshiForCausalLM(MoshiPreTrainedModel, GenerationMixin): + _tied_weights_keys = ["model.embed_tokens.weight", "lm_head.weight"] + + # Copied from transformers.models.gemma.modeling_gemma.GemmaForCausalLM.__init__ with Gemma->Moshi + def __init__(self, config): + super().__init__(config) + self.model = MoshiModel(config) + self.vocab_size = config.vocab_size + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def set_decoder(self, decoder): + self.model = decoder + + def get_decoder(self): + return self.model + + @add_start_docstrings_to_model_forward(MOSHI_DECODER_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=MoshiCausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + labels: Optional[torch.LongTensor] = None, + num_logits_to_keep: int = 0, + ) -> Union[Tuple, MoshiCausalLMOutputWithPast]: + r""" + Args: + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + num_logits_to_keep (`int`, *optional*): + Calculate logits for the last `num_logits_to_keep` tokens. If `0`, calculate logits for all + `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that + token can save memory, which becomes pretty significant for long sequences or large vocabulary size. + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, MoshiForCausalLM + + >>> model = MoshiForCausalLM.from_pretrained("kmhf/hf-moshiko") + >>> tokenizer = AutoTokenizer.from_pretrained("kmhf/hf-moshiko") + + >>> prompt = "What is your favorite condiment?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "What is your favorite condiment?" 
+ ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + cache_position=cache_position, + ) + + hidden_states = outputs[0] + if labels is None and not is_torchdynamo_compiling(): + logger.warning_once( + "Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)" + ) + # Only compute necessary logits, and do not upcast them to float if we are not computing the loss + logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :]) + + loss = None + if labels is not None: + # Upcast to float if we need to compute the loss to avoid potential precision issues + logits = logits.float() + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss() + shift_logits = shift_logits.view(-1, self.config.vocab_size) + shift_labels = shift_labels.view(-1) + # Enable model parallelism + shift_labels = shift_labels.to(shift_logits.device) + loss = loss_fct(shift_logits, shift_labels) + + if not return_dict: + output = ( + logits, + hidden_states, + ) + outputs[1:] + return (loss,) + output if loss is not None else output + + return MoshiCausalLMOutputWithPast( + loss=loss, + logits=logits, + last_hidden_state=hidden_states, # Ignore copy + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + "The original Moshi model with an audio encoder, a Moshi depth decoder and a Moshi decoder, " + "for speech-to-speech.", + MOSHI_START_DOCSTRING, +) +class MoshiForConditionalGeneration(MoshiPreTrainedModel, GenerationMixin): + _tied_weights_keys = ["decoder.model.embed_tokens.weight", "decoder.lm_head.weight"] + config_class = MoshiConfig + main_input_name = "input_ids" + supports_gradient_checkpointing = True + _supports_flash_attn_2 = True + _supports_sdpa = True + + def __init__(self, config: MoshiConfig): + super().__init__(config) + # We have 2 * num_codebooks audio embedding layers because we have the user input channel and the model output channel. 
+        self.embed_tokens = nn.ModuleList(
+            [nn.Embedding(config.audio_vocab_size + 1, config.hidden_size) for _ in range(2 * config.num_codebooks)]
+        )
+        self.audio_encoder = AutoModel.from_config(
+            config.audio_encoder_config, attn_implementation=config._attn_implementation
+        )
+        self.decoder = MoshiForCausalLM(config)
+
+        config.depth_decoder_config._attn_implementation_internal = config._attn_implementation
+        self.depth_decoder = MoshiDepthDecoder(config.depth_decoder_config)
+
+        self.num_codebooks = config.num_codebooks
+        self.post_init()
+
+    def get_audio_encoder(self):
+        return self.audio_encoder
+
+    def get_depth_decoder(self):
+        return self.depth_decoder
+
+    def get_decoder(self):
+        return self.decoder
+
+    @add_start_docstrings_to_model_forward(MOSHI_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.BoolTensor] = None,
+        user_input_values: Optional[torch.FloatTensor] = None,
+        user_audio_codes: Optional[torch.Tensor] = None,
+        moshi_input_values: Optional[torch.FloatTensor] = None,
+        moshi_audio_codes: Optional[torch.Tensor] = None,
+        past_key_values: Tuple[Tuple[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        text_labels: Optional[torch.LongTensor] = None,
+        audio_labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        **kwargs,
+    ) -> Union[Tuple, Seq2SeqLMOutput]:
+        r"""
+        Returns:
+
+        Examples:
+        ```python
+        >>> from transformers import MoshiForConditionalGeneration
+        >>> import torch
+
+        >>> model = MoshiForConditionalGeneration.from_pretrained("kmhf/hf-moshiko")
+        >>> inputs = model.get_unconditional_inputs()
+
+        >>> logits = model(**inputs).logits
+        >>> logits.shape  # (bsz, seq_len, text_vocab_size)
+        torch.Size([1, 1, 32000])
+        ```"""
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        kwargs_audio_encoder = {
+            argument[len("audio_encoder_") :]: value
+            for argument, value in kwargs.items()
+            if argument.startswith("audio_encoder_")
+        }
+
+        kwargs_decoder = {
+            argument[len("decoder_") :]: value for argument, value in kwargs.items() if argument.startswith("decoder_")
+        }
+
+        kwargs_depth_decoder = {
+            argument[len("depth_decoder_") :]: value
+            for argument, value in kwargs.items()
+            if argument.startswith("depth_decoder_")
+        }
+
+        # If inputs_embeds is provided, it has the priority over input_ids and audio_codes, which won't be used
+        if inputs_embeds is None:
+            if user_input_values is not None and user_audio_codes is None:
+                user_audio_codes = self.audio_encoder.encode(
+                    user_input_values, num_quantizers=self.num_codebooks, **kwargs_audio_encoder
+                )[0]
+
+            if moshi_input_values is not None and moshi_audio_codes is None:
+                moshi_audio_codes = self.audio_encoder.encode(
+                    moshi_input_values, num_quantizers=self.num_codebooks, **kwargs_audio_encoder
+                )[0]
+
+            audio_codes = torch.cat([moshi_audio_codes, user_audio_codes], dim=1)
+
+            if input_ids is None and audio_codes is None:
+                raise ValueError(
+                    "You must provide at least one of `input_ids`, `inputs_embeds`, `input_values` and `audio_codes`."
+ ) + + if input_ids is not None: + inputs_embeds = self.decoder.model.embed_tokens(input_ids) + + if audio_codes is not None: + audio_inputs_embeds = sum( + [self.embed_tokens[codebook](audio_codes[:, codebook]) for codebook in range(audio_codes.shape[1])] + ) + inputs_embeds = ( + audio_inputs_embeds + if inputs_embeds is None + else audio_inputs_embeds + inputs_embeds.to(audio_inputs_embeds.device) + ) + + # Decode + decoder_outputs = self.decoder( + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + use_cache=use_cache, + past_key_values=past_key_values, + return_dict=True, + labels=text_labels, + **kwargs_decoder, + ) + + decoder_last_hidden_state = decoder_outputs.last_hidden_state + + depth_decoder_outputs = None + final_loss = decoder_outputs.loss + if text_labels is not None and audio_labels is not None: + # To use depth decoder forward here, we actually need oracle input ids since we're supposed to pass the true input ids + + audio_labels = self.build_delay_pattern_mask( + audio_labels, + bos_token_id=self.config.audio_vocab_size, + pad_token_id=self.config.audio_vocab_size, + max_length=audio_labels.shape[-1] + 1, + )[0] + + # (batch_size, sequence_length) -> (batch_size * sequence_length, 1) + text_labels = text_labels.view(-1, 1) + + # (batch_size, num_codebooks, sequence_length) -> (batch_size * sequence_length, num_codebooks) + audio_labels = audio_labels.transpose(1, 2).reshape(-1, audio_labels.shape[1]) + + depth_input_ids = torch.cat([text_labels, audio_labels], dim=1) + # keep the last codebook out of input_ids + depth_input_ids = depth_input_ids[:, :-1] + + # (batch_size, sequence_length, dim) -> (batch_size * sequence_length, 1, dim) + decoder_last_hidden_state = decoder_last_hidden_state.view(-1, 1, decoder_last_hidden_state.shape[-1]) + + depth_decoder_outputs = self.depth_decoder( + last_hidden_state=decoder_last_hidden_state, + input_ids=depth_input_ids, + attention_mask=attention_mask, + labels=audio_labels, + **kwargs_depth_decoder, + ) + + final_loss += depth_decoder_outputs.loss + + if not return_dict: + outputs = decoder_outputs.to_tuple() + if depth_decoder_outputs is not None: + outputs += depth_decoder_outputs.to_tuple() + return outputs + + return MoshiConditionalGenerationOutputWithPast( + loss=decoder_outputs.loss, + logits=decoder_outputs.logits, + last_hidden_state=decoder_last_hidden_state, + past_key_values=decoder_outputs.past_key_values, + hidden_states=decoder_outputs.hidden_states, + attentions=decoder_outputs.attentions, + depth_loss=None if depth_decoder_outputs is None else depth_decoder_outputs.loss, + audio_logits=None if depth_decoder_outputs is None else depth_decoder_outputs.logits, + depth_past_key_values=None if decoder_outputs is None else decoder_outputs.past_key_values, + depth_hidden_states=None if decoder_outputs is None else decoder_outputs.hidden_states, + depth_attentions=None if decoder_outputs is None else decoder_outputs.attentions, + ) + + def _prepare_inputs_embeds_for_generation( + self, + input_ids: Optional[torch.LongTensor] = None, + user_input_values: Optional[torch.FloatTensor] = None, + user_audio_codes: Optional[torch.Tensor] = None, + moshi_input_values: Optional[torch.FloatTensor] = None, + moshi_audio_codes: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + generation_config: Optional[GenerationConfig] = None, + 
apply_delay_pattern_mask: bool = False, + concat_unconditional_inputs: bool = False, + ): + user_delay_pattern_mask = None + moshi_delay_pattern_mask = None + + if ( + inputs_embeds is None + and input_ids is None + and user_input_values is None + and user_audio_codes is None + and moshi_input_values is None + and moshi_audio_codes is None + ): + raise ValueError( + "You must provide at least one of `input_ids`, `user_input_values`, `moshi_input_values`, `user_audio_codes`, `moshi_audio_codes` or `inputs_embeds`." + ) + + # in case inputs_embeds is passed, we might still need to create delay pattern masks + if inputs_embeds is None or apply_delay_pattern_mask: + if user_input_values is not None and user_audio_codes is None: + user_audio_codes = self.audio_encoder.encode(user_input_values, num_quantizers=self.num_codebooks)[0] + + if moshi_input_values is not None and moshi_audio_codes is None: + moshi_audio_codes = self.audio_encoder.encode(moshi_input_values, num_quantizers=self.num_codebooks)[0] + + if inputs_embeds is None and concat_unconditional_inputs: + unconditional_inputs = self.get_unconditional_inputs(num_samples=user_audio_codes.shape[0]) + moshi_audio_codes = torch.cat([unconditional_inputs.moshi_audio_codes, moshi_audio_codes], dim=2) + user_audio_codes = torch.cat([unconditional_inputs.user_audio_codes, user_audio_codes], dim=2) + input_ids = torch.cat([unconditional_inputs.input_ids, input_ids], dim=1) + if attention_mask is not None: + attention_mask = torch.cat([unconditional_inputs.attention_mask, attention_mask], dim=1) + + if inputs_embeds is None or apply_delay_pattern_mask: + if apply_delay_pattern_mask and user_audio_codes is not None: + user_audio_codes, user_delay_pattern_mask = self.build_delay_pattern_mask( + user_audio_codes, + bos_token_id=self.config.audio_vocab_size, + pad_token_id=self.config.audio_vocab_size, + max_length=generation_config.max_length, + ) + + if apply_delay_pattern_mask and moshi_audio_codes is not None: + moshi_audio_codes, moshi_delay_pattern_mask = self.build_delay_pattern_mask( + moshi_audio_codes, + bos_token_id=self.config.audio_vocab_size, + pad_token_id=self.config.audio_vocab_size, + max_length=generation_config.max_length, + ) + + # If inputs_embeds is provided, it has the priority over input_ids and audio_codes, which won't be used + if inputs_embeds is None: + audio_inputs_embeds = None + if user_audio_codes is not None and moshi_audio_codes is not None: + audio_codes = torch.cat([moshi_audio_codes, user_audio_codes], dim=1) + audio_inputs_embeds = sum( + [self.embed_tokens[codebook](audio_codes[:, codebook]) for codebook in range(audio_codes.shape[1])] + ) + elif moshi_audio_codes is not None: + audio_codes = moshi_audio_codes + audio_inputs_embeds = sum( + [self.embed_tokens[codebook](audio_codes[:, codebook]) for codebook in range(audio_codes.shape[1])] + ) + elif user_audio_codes is not None: + audio_codes = user_audio_codes + audio_inputs_embeds = sum( + [ + self.embed_tokens[codebook](audio_codes[:, codebook + self.num_codebooks]) + for codebook in range(audio_codes.shape[1]) + ] + ) + + if input_ids is not None: + inputs_embeds = self.decoder.model.embed_tokens(input_ids) + + if audio_inputs_embeds is not None: + inputs_embeds = ( + audio_inputs_embeds + if inputs_embeds is None + else audio_inputs_embeds + inputs_embeds.to(audio_inputs_embeds.device) + ) + + return ( + inputs_embeds, + input_ids, + user_audio_codes, + moshi_audio_codes, + user_delay_pattern_mask, + moshi_delay_pattern_mask, + attention_mask, + ) + + 
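To make the codebook-embedding step in `_prepare_inputs_embeds_for_generation` above easier to follow, here is a minimal, self-contained sketch of the per-codebook embedding summation (illustrative only, not part of the patch; the sizes below are made up rather than taken from a real Moshi checkpoint):

```python
import torch
import torch.nn as nn

# Illustrative sizes only; the real values come from the model config.
num_codebooks, audio_vocab_size, hidden_size, seq_len = 8, 2048, 16, 4

# One embedding table per codebook (moshi stream first, then user stream), as in the module above.
embed_tokens = nn.ModuleList(
    [nn.Embedding(audio_vocab_size + 1, hidden_size) for _ in range(2 * num_codebooks)]
)

# audio_codes: (batch, 2 * num_codebooks, seq_len)
audio_codes = torch.randint(0, audio_vocab_size, (1, 2 * num_codebooks, seq_len))

# Each codebook is embedded with its own table and the results are summed, yielding one
# (batch, seq_len, hidden_size) tensor that can then be added to the text token embeddings.
audio_inputs_embeds = sum(embed_tokens[cb](audio_codes[:, cb]) for cb in range(audio_codes.shape[1]))
print(audio_inputs_embeds.shape)  # torch.Size([1, 4, 16])
```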
@torch.no_grad() + def generate( + self, + input_ids: Optional[torch.LongTensor] = None, + user_input_values: Optional[torch.FloatTensor] = None, + user_audio_codes: Optional[torch.Tensor] = None, + moshi_input_values: Optional[torch.FloatTensor] = None, + moshi_audio_codes: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + return_audio_waveforms: Optional[bool] = True, + return_audio_codes: Optional[bool] = None, + concat_unconditional_inputs: Optional[bool] = True, + **kwargs, + ) -> torch.LongTensor: + """ + Generates sequences of text token ids and audio tokens ids. + + Parameters: + input_ids (`torch.Tensor `of shape `(batch_size, sequence_length), *optional*): + The sequence used as a text prompt for the generation. + user_input_values (`torch.Tensor `of shape `(batch_size, 1, audio_sequence_length), *optional*): + The audio waveforms used as audio user prompt for the generation. + user_audio_codes (`torch.Tensor `of shape `(batch_size, num_codebooks, sequence_length), *optional*): + The audio codes used as audio user prompt for the generation. Has priority over `user_input_values` and represents the audio "tokens" of `user_input_values` once passed through the audio encoder. + moshi_input_values (`torch.Tensor `of shape `(batch_size, 1, audio_sequence_length), *optional*): + The audio waveforms used as audio Moshi prompt for the generation. + moshi_audio_codes (`torch.Tensor `of shape `(batch_size, num_codebooks, sequence_length), *optional*): + The audio codes used as audio Moshi prompt for the generation. Has priority over `moshi_input_values` and represents the audio "tokens" of `moshi_input_values` once passed through the audio encoder. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` and the audio inputs you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert the inputs into associated vectors than the + model's internal embedding lookup matrix. + return_audio_waveforms (`bool`, *optional*, defaults to `True`): + If `False`, won't generate the audio waveforms. + return_audio_codes (`bool`, *optional*): + If `True`, will also returns the generated audio codes, i.e the intermediate audio "tokens" which transforms to `audio_sequences` once passed through the audio decoder. + concat_unconditional_inputs (`bool`, *optional*, defaults to `True`): + If `False`, won't concatenate initial audio and text tokens. + kwargs (`Dict[str, Any]`, *optional*): + Remaining dictionary of keyword arguments that are passed to the `generate` method. Refers to the + original [`generate` docstrings](https://huggingface.co/docs/transformers/main/en/main_classes/text_generation#transformers.GenerationMixin.generate) + for more information on how to use them. + Note that keywords with a *depth_* prefix will be input for the `generate` method of the + depth decoder. Otherwise, the latter will use its default generation config. 
+ Return: + [`MoshiConditionalGenerationGenerateOutput`] + """ + # multiple generate -> need to create/update device map + if hasattr(self, "hf_device_map") and not hasattr(self.depth_decoder, "hf_device_map"): + self.depth_decoder.hf_device_map = {} + if "" in self.hf_device_map: + self.depth_decoder.hf_device_map = self.hf_device_map + else: + main_device = [d for d in self.hf_device_map.values() if d not in ["cpu", "disk"]][0] + self.depth_decoder.hf_device_map = { + key[len("depth_decoder") :]: main_device if value in ["cpu", "disk"] else value + for key, value in self.hf_device_map.items() + if key.startswith("depth_decoder") + } + # need to remove depth_decoder from the top device_map so that we assign correctly the device for each layer idx in the cache + self.hf_device_map = { + key: value for key, value in self.hf_device_map.items() if not key.startswith("depth_decoder") + } + # retrieve depth decoder kwargs + depth_decoder_kwargs_keys = {argument for argument in kwargs if argument.startswith("depth_decoder_")} + kwargs_depth_decoder = { + argument[len("depth_decoder_") :]: kwargs.pop(argument) for argument in depth_decoder_kwargs_keys + } + + # needs to prepare generation config, even though it'll be done again in `generate` + generation_config, kwargs = self._prepare_generation_config(kwargs.pop("generation_config", None), **kwargs) + + input_ids, user_audio_codes, moshi_audio_codes, concat_unconditional_inputs = ( + self._check_and_maybe_initalize_inputs( + input_ids=input_ids, + user_input_values=user_input_values, + user_audio_codes=user_audio_codes, + moshi_input_values=moshi_input_values, + moshi_audio_codes=moshi_audio_codes, + inputs_embeds=inputs_embeds, + concat_unconditional_inputs=concat_unconditional_inputs, + ) + ) + + inputs = inputs_embeds if input_ids is None else input_ids + + input_ids_length = inputs.shape[-1] + 1 if concat_unconditional_inputs else inputs.shape[-1] + has_default_max_length = kwargs.get("max_length") is None and generation_config.max_length is not None + has_default_min_length = kwargs.get("min_length") is None and generation_config.min_length is not None + generation_config = self._prepare_generated_length( + generation_config=generation_config, + has_default_max_length=has_default_max_length, + has_default_min_length=has_default_min_length, + model_input_name="inputs_embeds" if input_ids is None else "input_ids", + inputs_tensor=inputs, + input_ids_length=input_ids_length, + ) + + # retrieve depth decoder generation config if it exists + if hasattr(generation_config, "depth_decoder_config"): + depth_decoder_generation_config = generation_config.depth_decoder_config + else: + # we need to control the number of tokens generated by the depth decoder + depth_decoder_generation_config = { + "min_length": self.num_codebooks + 1, + "max_length": self.num_codebooks + 1, + "cache_implementation": "sliding_window", + } + # update kwargs_depth_decoder: kwargs_depth_decoder have priority over depth_decoder_generation_config + depth_decoder_generation_config.update(kwargs_depth_decoder) + kwargs_depth_decoder = depth_decoder_generation_config + + attention_mask = kwargs.pop("attention_mask", None) + ( + inputs_embeds, + input_ids, + user_audio_codes, + moshi_audio_codes, + user_delay_pattern_mask, + moshi_delay_pattern_mask, + attention_mask, + ) = self._prepare_inputs_embeds_for_generation( + input_ids=input_ids, + user_input_values=user_input_values, + user_audio_codes=user_audio_codes, + moshi_input_values=moshi_input_values, + 
moshi_audio_codes=moshi_audio_codes, + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + generation_config=generation_config, + apply_delay_pattern_mask=True, + concat_unconditional_inputs=concat_unconditional_inputs, + ) + + # create blank user inputs - moshi needs a constant stream of user inputs + blank_input_values = torch.zeros( + (inputs_embeds.shape[0], 1, int(self.config.sampling_rate / self.config.audio_encoder_config.frame_rate)), + dtype=self.dtype, + device=self.device, + ) + blank_user_audio_codes = self.audio_encoder.encode(blank_input_values, num_quantizers=self.num_codebooks)[0] + + # set delay pattern mask for the rest of the generation + kwargs["user_delay_pattern_mask"] = ( + user_delay_pattern_mask if user_delay_pattern_mask is not None else kwargs.get("user_delay_pattern_mask") + ) + kwargs["moshi_delay_pattern_mask"] = ( + moshi_delay_pattern_mask + if moshi_delay_pattern_mask is not None + else kwargs.get("moshi_delay_pattern_mask") + ) + + self.generated_audio_codes = torch.repeat_interleave( + moshi_audio_codes, max(generation_config.num_beams, generation_config.num_return_sequences), dim=0 + ) + + return_dict_in_generate = generation_config.num_beams > 1 or generation_config.return_dict_in_generate + output_scores = generation_config.num_beams > 1 or generation_config.output_scores + outputs = super().generate( + inputs_embeds=inputs_embeds, + input_ids=input_ids, + generation_config=generation_config, + blank_user_audio_codes=blank_user_audio_codes, + kwargs_depth_decoder=kwargs_depth_decoder, + return_dict_in_generate=return_dict_in_generate, + output_scores=output_scores, + attention_mask=attention_mask, + **kwargs, + ) + + if not return_audio_waveforms and not return_audio_codes: + if return_dict_in_generate and not generation_config.return_dict_in_generate: + return outputs.sequences + return outputs + + # check if outputs is a dict or tokens + if not return_dict_in_generate: + output_text_ids = outputs + else: + output_text_ids = outputs.sequences + + if generation_config.num_return_sequences > 1: + moshi_delay_pattern_mask = torch.repeat_interleave( + moshi_delay_pattern_mask, generation_config.num_return_sequences, dim=0 + ) + + if generation_config.num_beams > 1: + # we need to reorganize self.last_hidden_states and generated audio codes according to the beam_indices + + # Beam indices are of shape `input_length + number_generated_tokens` but actually starts + # indexing indices at index 0 instead of index `input_length-1`. + # We thus discard the last `input_length` indices that are never used. 
+ beam_indices = outputs.beam_indices[:, : -moshi_audio_codes.shape[-1]] + + generated_audio_codes = self.generated_audio_codes[:, :, moshi_audio_codes.shape[-1] :] + + # we've generated audio tokens `number_generated_tokens-1` times, so we use the corresponding beam indices to + # retrieve the right audio tokens + expanded_beam_indices = beam_indices[:, :-1].unsqueeze(1).expand(-1, self.num_codebooks, -1) + generated_audio_codes = torch.gather(generated_audio_codes, dim=0, index=expanded_beam_indices) + + # now, rebuild generated audio codes, this time with the right beam tracking + moshi_audio_codes = torch.repeat_interleave( + moshi_audio_codes, generation_config.num_return_sequences, dim=0 + ) + self.generated_audio_codes = torch.cat((moshi_audio_codes, generated_audio_codes), dim=2) + + # use the last beam indice to retrieve the right self.last_hidden_state + self.last_hidden_state = torch.index_select(self.last_hidden_state, dim=0, index=beam_indices[:, -1]) + + # we need to make a last generation with the latest generated tokens + last_hidden_state = self.last_hidden_state.view(-1, 1, self.last_hidden_state.shape[-1]) + + last_generated_audio_codes = self.depth_decoder.generate( + last_hidden_state=last_hidden_state, + input_ids=output_text_ids[:, -1:].view(-1, 1), + **kwargs_depth_decoder, + ) + + last_generated_audio_codes = last_generated_audio_codes[:, 1:].unsqueeze(2) + + self.generated_audio_codes = torch.cat([self.generated_audio_codes, last_generated_audio_codes], dim=2) + + # apply the pattern mask to the final audio ids + output_audio_codes = self.apply_delay_pattern_mask(self.generated_audio_codes, moshi_delay_pattern_mask) + + # revert the pattern delay mask by filtering the pad token id and bos token ids + mask = moshi_delay_pattern_mask != self.config.audio_vocab_size + + output_audio_codes = output_audio_codes[mask].reshape(mask.shape[0], self.num_codebooks, -1) + + output_values = None + if return_audio_waveforms: + output_values = self.audio_encoder.decode( + output_audio_codes, + ).audio_values + + output_audio_codes = output_audio_codes if return_audio_codes else None + + if generation_config.return_dict_in_generate: + return MoshiConditionalGenerationGenerateOutput( + audio_sequences=output_values, audio_codes=output_audio_codes, **outputs + ) + + return MoshiConditionalGenerationGenerateOutput( + audio_sequences=output_values, sequences=output_text_ids, audio_codes=output_audio_codes + ) + + def prepare_inputs_for_generation( + self, + input_ids, + past_key_values=None, + attention_mask=None, + inputs_embeds=None, + cache_position=None, + position_ids=None, + use_cache=True, + num_logits_to_keep=None, + user_delay_pattern_mask=None, + moshi_delay_pattern_mask=None, + kwargs_depth_decoder=None, + blank_user_audio_codes: Optional[torch.FloatTensor] = None, + **kwargs, + ): + # 1. 
Do usual operations done on LLMs like Gemma - because we pre-processed inputs, the first pass always has inputs_embeds + + # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens + # Exception 1: when passing input_embeds, input_ids may be missing entries + # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here + if past_key_values is not None: + if inputs_embeds is not None: # Exception 1 + input_ids = input_ids[:, -cache_position.shape[0] :] + elif input_ids.shape[1] != cache_position.shape[0]: # Default case (the "else", a no op, is Exception 2) + input_ids = input_ids[:, cache_position] + + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + position_ids = position_ids[:, -input_ids.shape[1] :] + # This `clone` call is needed to avoid recapturing cuda graphs with `torch.compile`'s `mode="reduce-overhead`, as otherwise the input `position_ids` would have various stride during the decoding. Here, simply using `.contiguous()` is not sufficient as in the batch size = 1 case, `position_ids` is already contiguous but with varying stride which retriggers a capture. + position_ids = position_ids.clone(memory_format=torch.contiguous_format) + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and cache_position[0] == 0: + model_inputs = {"inputs_embeds": inputs_embeds, "input_ids": None} + else: + # The clone here is for the same reason as for `position_ids`. + model_inputs = {"input_ids": input_ids.clone(memory_format=torch.contiguous_format), "inputs_embeds": None} + + if isinstance(past_key_values, StaticCache) and attention_mask.ndim == 2: + if model_inputs["inputs_embeds"] is not None: + batch_size, sequence_length, _ = model_inputs["inputs_embeds"].shape + device = model_inputs["inputs_embeds"].device + else: + batch_size, sequence_length = model_inputs["input_ids"].shape + device = model_inputs["input_ids"].device + + dtype = self.decoder.dtype + + attention_mask = self.decoder.model._prepare_4d_causal_attention_mask_with_cache_position( + attention_mask, + sequence_length=sequence_length, + target_length=past_key_values.max_cache_len, + dtype=dtype, + device=device, + cache_position=cache_position, + batch_size=batch_size, + config=self.config, + past_key_values=past_key_values, + ) + + if num_logits_to_keep is not None: + model_inputs["num_logits_to_keep"] = num_logits_to_keep + + model_inputs.update( + { + "position_ids": position_ids, + "cache_position": cache_position, + "past_key_values": past_key_values, + "use_cache": use_cache, + "attention_mask": attention_mask, + } + ) + + # 2. 
Now that everything is prepared, generate audio_codes using the depth decoder + + # we want to do it after a first token has been generated + if model_inputs["input_ids"] is not None: + last_hidden_state = kwargs.get("last_hidden_state") + # (batch_size, sequence_length, dim) -> (batch_size * sequence_length, 1, dim) + last_hidden_state = last_hidden_state.view(-1, 1, last_hidden_state.shape[-1]) + + input_ids = model_inputs.pop("input_ids") + + generated_audio_codes = self.depth_decoder.generate( + last_hidden_state=last_hidden_state, + input_ids=input_ids.view(-1, 1), + **kwargs_depth_decoder, + ) + + # the first tokens are text tokens + generated_audio_codes = generated_audio_codes[:, 1:].unsqueeze(2) + + user_audio_codes = self.apply_delay_pattern_mask( + torch.cat( + [self.generated_audio_codes, blank_user_audio_codes.to(self.generated_audio_codes.device)], dim=2 + ), + user_delay_pattern_mask, + )[:, :, -1:] + self.generated_audio_codes = self.apply_delay_pattern_mask( + torch.cat([self.generated_audio_codes, generated_audio_codes], dim=2), moshi_delay_pattern_mask + ) + + inputs_embeds, _, _, _, _, _, _ = self._prepare_inputs_embeds_for_generation( + input_ids, moshi_audio_codes=self.generated_audio_codes[:, :, -1:], user_audio_codes=user_audio_codes + ) + + model_inputs["input_ids"] = None + model_inputs["inputs_embeds"] = inputs_embeds + + return model_inputs + + def _update_model_kwargs_for_generation( + self, + outputs: ModelOutput, + model_kwargs: Dict[str, Any], + is_encoder_decoder: bool = False, + num_new_tokens: int = 1, + ) -> Dict[str, Any]: + model_kwargs = super()._update_model_kwargs_for_generation( + outputs, model_kwargs, is_encoder_decoder, num_new_tokens + ) + + # update last_hidden_state that'll be used in the depth decoder + model_kwargs["last_hidden_state"] = outputs.get("last_hidden_state")[:, -1:] + + # dirty, but we need to make a last depth_decoder.generate + self.last_hidden_state = outputs.get("last_hidden_state")[:, -1:] + return model_kwargs + + def get_input_embeddings(self): + return self.decoder.get_input_embeddings() + + def set_input_embeddings(self, value): + self.decoder.set_input_embeddings(value) + + def get_output_embeddings(self): + return self.decoder.get_output_embeddings() + + def set_output_embeddings(self, new_embeddings): + self.decoder.set_output_embeddings(new_embeddings) + + def freeze_audio_encoder(self): + """ + Freeze the audio encoder weights. + """ + for param in self.audio_encoder.parameters(): + param.requires_grad = False + self.audio_encoder._requires_grad = False + + def freeze_depth_decoder(self): + """ + Freeze the depth encoder weights. + """ + for param in self.depth_decoder.parameters(): + param.requires_grad = False + self.depth_decoder._requires_grad = False + + @staticmethod + # Copied from transformers.models.musicgen.modeling_musicgen.MusicgenForCausalLM.apply_delay_pattern_mask + def apply_delay_pattern_mask(input_ids, decoder_pad_token_mask): + """Apply a delay pattern mask to the decoder input ids, only preserving predictions where + the mask is set to -1, and otherwise setting to the value detailed in the mask.""" + seq_len = input_ids.shape[-1] + decoder_pad_token_mask = decoder_pad_token_mask[..., :seq_len] + input_ids = torch.where(decoder_pad_token_mask == -1, input_ids, decoder_pad_token_mask) + return input_ids + + def build_delay_pattern_mask( + self, input_ids: torch.LongTensor, bos_token_id: int, pad_token_id: int, max_length: int = None + ): + """Build a delayed pattern mask to the input_ids. 
Each codebook, except the first one, is offset by
+        one, giving a delayed pattern mask at the start of sequence and end of sequence. Take the example where there
+        are 4 codebooks and a max sequence length of 6: the delayed pattern mask of shape `(codebooks, seq_len)` is
+        - [-1, -1, -1, -1, -1, P]
+        - [ B, -1, -1, -1, -1, -1]
+        - [ B, -1, -1, -1, -1, -1]
+        - [ B, -1, -1, -1, -1, -1]
+        where B is the beginning-of-sequence token, P is the special padding token id and -1 marks a position that is
+        still to be predicted. If we include a prompt (input ids), the positions it covers are set to the prompt
+        values, and the remaining -1 positions indicate where new tokens will be predicted:
+        - [ a0, a1, -1, -1, -1, P]
+        - [ B, b0, b1, -1, -1, -1]
+        - [ B, c0, c1, -1, -1, -1]
+        - [ B, d0, d1, -1, -1, -1]
+        where a-d indicate the codebook channel and 0/1 the time step. The generation loop then only overrides the -1
+        positions with its predictions.
+        """
+        bsz, num_codebooks, seq_len = input_ids.shape
+
+        max_length = max_length if max_length is not None else self.generation_config.max_length
+        input_ids_shifted = (
+            torch.ones((bsz, num_codebooks, max_length), dtype=torch.long, device=input_ids.device) * -1
+        )
+
+        # the first codebook channel is not shifted
+        seq_len_to_keep = min(seq_len, max_length - 1)
+        input_ids_shifted[:, 0, :seq_len_to_keep] = input_ids[:, 0, :seq_len_to_keep]
+
+        # fill the shifted ids with the prompt entries
+        input_ids_shifted[:, 1:, 1 : seq_len_to_keep + 1] = input_ids[:, 1:, :seq_len_to_keep]
+
+        # fill with BOS and PAD
+        input_ids_shifted[:, 1:, 0] = bos_token_id
+        input_ids_shifted[:, 0, -1] = pad_token_id
+
+        # construct a pattern mask that indicates the positions of BOS and PAD tokens for each codebook
+        pattern_mask = input_ids_shifted
+
+        input_ids = input_ids_shifted[..., :seq_len_to_keep]
+        return input_ids, pattern_mask
+
+    def get_unconditional_inputs(self, num_samples=1):
+        """
+        Helper function to get null inputs for unconditional generation, enabling the model to be used without the
+        feature extractor or tokenizer.
+
+        Args:
+            num_samples (int, *optional*):
+                Number of audio samples to unconditionally generate.
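+
+        Returns:
+            [`MoshiUnconditionalInput`] whose `input_ids`, `user_audio_codes` and `moshi_audio_codes` are filled with
+            placeholder ids (`config.vocab_size` for the text stream, `config.audio_vocab_size` for the audio
+            streams) and whose `attention_mask` is all ones.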
+ + Example: + ```python + >>> from transformers import MoshiForConditionalGeneration + + >>> model = MoshiForConditionalGeneration.from_pretrained("kmhf/hf-moshiko-pytorch-bf16") + + >>> # get the unconditional (or 'null') inputs for the model + >>> unconditional_inputs = model.get_unconditional_inputs(num_samples=1) + >>> audio_samples = model.generate(**unconditional_inputs, max_new_tokens=256) + ```""" + + input_ids = torch.ones((num_samples, 1), device=self.device, dtype=torch.int64) * self.config.vocab_size + user_audio_codes = ( + torch.ones((num_samples, self.num_codebooks, 1), device=self.device, dtype=torch.int64) + * self.config.audio_vocab_size + ) + moshi_audio_codes = ( + torch.ones((num_samples, self.num_codebooks, 1), device=self.device, dtype=torch.int64) + * self.config.audio_vocab_size + ) + attention_mask = torch.ones((num_samples, 1), device=self.device, dtype=torch.long) + + return MoshiUnconditionalInput( + input_ids=input_ids, + user_audio_codes=user_audio_codes, + moshi_audio_codes=moshi_audio_codes, + attention_mask=attention_mask, + ) + + def _check_and_maybe_initalize_inputs( + self, + input_ids=None, + user_input_values=None, + user_audio_codes=None, + moshi_input_values=None, + moshi_audio_codes=None, + inputs_embeds=None, + concat_unconditional_inputs=None, + ): + inputs = input_ids if inputs_embeds is None else inputs_embeds + user_input = user_audio_codes if user_input_values is None else user_input_values + moshi_input = moshi_audio_codes if moshi_input_values is None else moshi_input_values + + one_input_has_been_passed = (user_input is not None) or (moshi_input is not None) or (inputs is not None) + + # concat_unconditional_inputs will be False if inputs_embeds is used + concat_unconditional_inputs = concat_unconditional_inputs and not ( + inputs_embeds is not None and input_ids is None + ) + + # if one or two of the three required inputs have been passed, throws an error + if one_input_has_been_passed and (user_input is None): + raise ValueError( + "No user audio inputs have been passed alongside the other inputs. Make sure either `user_input_values` or `user_audio_codes` is passed or use `MoshiForConditionalGeneration.get_unconditional_inputs`. Check the `MoshiForConditionalGeneration` docstrings for more information." + ) + elif one_input_has_been_passed and (moshi_input is None): + raise ValueError( + "No Moshi audio inputs have been passed alongside the other inputs. Make sure either `moshi_input_values` or `moshi_audio_codes` is passed or use `MoshiForConditionalGeneration.get_unconditional_inputs`. Check the `MoshiForConditionalGeneration` docstrings for more information." + ) + elif one_input_has_been_passed and (inputs is None): + raise ValueError( + "No `input_ids` or `inputs_embeds` have been passed alongside the other inputs. Make sure `input_ids` is passed or use `MoshiForConditionalGeneration.get_unconditional_inputs`. Check the `MoshiForConditionalGeneration` docstrings for more information." 
+ ) + elif not one_input_has_been_passed: + # if no inputs have been passed, use default values + unconditional_inputs = self.get_unconditional_inputs() + input_ids = unconditional_inputs.input_ids + user_audio_codes = unconditional_inputs.user_audio_codes + moshi_audio_codes = unconditional_inputs.moshi_audio_codes + + # in that case, no need to concat unconditional inputs + concat_unconditional_inputs = False + else: + # check if same sequence length + user_seq_length = user_input.shape[-1] + moshi_seq_length = moshi_input.shape[-1] + tokens_seq_length = inputs.shape[1] + + ratio = self.config.audio_encoder_config.frame_rate / self.config.sampling_rate + moshi_seq_length = math.ceil(moshi_seq_length * ratio) if moshi_audio_codes is None else moshi_seq_length + user_seq_length = math.ceil(user_seq_length * ratio) if user_audio_codes is None else user_seq_length + + if tokens_seq_length != moshi_seq_length or tokens_seq_length != user_seq_length: + raise ValueError( + "At least one of the 3 inputs of `MoshiForConditionalGeneration` doesn't have the same sequence length as the others." + "Make sure that they all have the same sequence length. Check the `MoshiForConditionalGeneration` docstrings for more information." + ) + + return input_ids, user_audio_codes, moshi_audio_codes, concat_unconditional_inputs + + @staticmethod + def _reorder_cache( + past_key_values: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor + ) -> Tuple[Tuple[torch.Tensor]]: + """ + This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or + [`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct + beam_idx at every generation step. + """ + return tuple( + tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past) + for layer_past in past_key_values + ) + + +__all__ = ["MoshiForCausalLM", "MoshiForConditionalGeneration", "MoshiModel", "MoshiPreTrainedModel"] diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 4ca25bc7914a1c..d7570c57c62f36 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -6219,6 +6219,34 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +class MoshiForCausalLM(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class MoshiForConditionalGeneration(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class MoshiModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class MoshiPreTrainedModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + class MPNetForMaskedLM(metaclass=DummyObject): _backends = ["torch"] diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py index a1bc526566726f..5165e43c099416 100644 --- a/tests/generation/test_utils.py +++ b/tests/generation/test_utils.py @@ -1098,6 +1098,7 @@ def test_assisted_decoding_matches_greedy_search(self, assistant_type): "bigbirdpegasus", "led", "mega", + "moshi", "speech2text", "git", "prophetnet", @@ -1172,6 +1173,7 @@ def test_prompt_lookup_decoding_matches_greedy_search(self): "bigbirdpegasus", "led", "mega", + "moshi", "speech2text", "git", 
"prophetnet", @@ -1285,6 +1287,7 @@ def test_assisted_decoding_sample(self): "bigbirdpegasus", "led", "mega", + "moshi", "speech2text", "git", "prophetnet", diff --git a/tests/models/mimi/test_modeling_mimi.py b/tests/models/mimi/test_modeling_mimi.py index ab6184ce2bbed8..074dceae155214 100644 --- a/tests/models/mimi/test_modeling_mimi.py +++ b/tests/models/mimi/test_modeling_mimi.py @@ -790,6 +790,7 @@ def test_integration_using_cache_decode(self): } librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + model_id = "kyutai/mimi" model = MimiModel.from_pretrained(model_id, use_cache=True).to(torch_device) @@ -840,6 +841,7 @@ def test_integration(self): "32": 1803071, } librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + model_id = "kyutai/mimi" processor = AutoFeatureExtractor.from_pretrained(model_id) diff --git a/tests/models/moshi/__init__.py b/tests/models/moshi/__init__.py new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/tests/models/moshi/test_modeling_moshi.py b/tests/models/moshi/test_modeling_moshi.py new file mode 100644 index 00000000000000..b299b414d609b1 --- /dev/null +++ b/tests/models/moshi/test_modeling_moshi.py @@ -0,0 +1,1126 @@ +# coding=utf-8 +# Copyright 2024, The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Testing suite for the PyTorch Moshi model.""" + +import copy +import tempfile +import unittest + +import numpy as np +import pytest +from datasets import Audio, load_dataset +from parameterized import parameterized + +from transformers import ( + MoshiConfig, + PretrainedConfig, +) +from transformers.integrations.deepspeed import ( + is_deepspeed_available, + is_deepspeed_zero3_enabled, +) +from transformers.testing_utils import ( + is_flaky, + is_torch_available, + require_torch, + require_torch_fp16, + require_torch_sdpa, + slow, + torch_device, +) +from transformers.utils import cached_property + +from ...generation.test_utils import GenerationTesterMixin +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_deepspeed_available(): + import deepspeed + +if is_torch_available(): + import torch + + from transformers import ( + AutoFeatureExtractor, + AutoTokenizer, + MoshiForCausalLM, + MoshiForConditionalGeneration, + MoshiModel, + ) + + +def _config_zero_init(config): + configs_no_init = copy.deepcopy(config) + for key in configs_no_init.__dict__.keys(): + if "_range" in key or "_std" in key or "initializer_factor" in key or "layer_scale" in key: + setattr(configs_no_init, key, 1e-10) + if isinstance(getattr(configs_no_init, key, None), PretrainedConfig): + no_init_subconfig = _config_zero_init(getattr(configs_no_init, key)) + setattr(configs_no_init, key, no_init_subconfig) + return configs_no_init + + +class MoshiDecoderTester: + def __init__( + self, + parent, + batch_size=4, # need batch_size != num_hidden_layers + seq_length=7, + is_training=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=4, + hidden_act="silu", + rms_norm_eps=0.001, + ffn_dim=32, + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=100, + pad_token_id=25, + num_codebooks=4, + audio_encoder_type="mimi", + attn_implementation="eager", + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.rms_norm_eps = rms_norm_eps + self.ffn_dim = ffn_dim + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.pad_token_id = pad_token_id + self.num_codebooks = num_codebooks + self.audio_encoder_type = audio_encoder_type + self.attn_implementation = attn_implementation + + def prepare_config_and_inputs(self, batch_size=None): + batch_size = self.batch_size if batch_size is None else batch_size + input_ids = ids_tensor([batch_size, self.seq_length], self.vocab_size) + config = self.get_config() + + attention_mask = input_ids.ne(self.pad_token_id) + + inputs_dict = {"input_ids": input_ids, "attention_mask": attention_mask} + return config, inputs_dict + + def get_config(self): + config = MoshiConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + d_ff=self.intermediate_size, + num_codebooks=self.num_codebooks, + rms_norm_eps=self.rms_norm_eps, + 
tie_word_embeddings=False, + pad_token_id=self.pad_token_id, + ffn_dim=self.ffn_dim, + audio_encoder_config={"model_type": self.audio_encoder_type}, + attn_implementation=self.attn_implementation, + ) + return config + + def prepare_config_and_inputs_for_common(self, batch_size=None): + config, inputs_dict = self.prepare_config_and_inputs(batch_size) + return config, inputs_dict + + +@require_torch +class MoshiDecoderTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = (MoshiModel, MoshiForCausalLM) if is_torch_available() else () + all_generative_model_classes = ( + (MoshiForCausalLM,) if is_torch_available() else () + ) # we don't want to run all the generation tests, only a specific subset + test_pruning = False + test_resize_embeddings = True + test_head_masking = False + pipeline_model_mapping = ( + { + "feature-extraction": MoshiModel, + "text-generation": MoshiForCausalLM, + } + if is_torch_available() + else {} + ) + + def setUp(self): + self.model_tester = MoshiDecoderTester(self) + self.config_tester = ConfigTester( + self, + config_class=MoshiConfig, + hidden_size=16, + audio_encoder_config={"model_type": self.model_tester.audio_encoder_type}, + ) + + @unittest.skip(reason="The MoshiModel does not have support dynamic compile yet") + def test_sdpa_can_compile_dynamic(self): + pass + + def _get_input_ids_and_config(self, batch_size=1): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common(batch_size) + input_ids = inputs_dict.pop("input_ids").to(torch_device) + attention_mask = inputs_dict.pop("attention_mask").to(torch_device) + + return config, input_ids, attention_mask, inputs_dict + + def _get_logits_processor_kwargs(self, do_sample=False, config=None): + logits_processor_kwargs = {} + return logits_processor_kwargs + + @require_torch_sdpa + @slow + @parameterized.expand([("float16",), ("bfloat16",), ("float32",)]) + def test_eager_matches_sdpa_inference(self, torch_dtype: str): + self.skipTest(reason="Moshi has no strict equivalence between two modes, skipping this test.") + + # Copied from tests.test_modeling_common.ModelTesterMixin.test_resize_tokens_embeddings + def test_resize_tokens_embeddings(self): + if not self.test_resize_embeddings: + self.skipTest(reason="test_resize_embeddings is set to `False`") + + ( + original_config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + config = copy.deepcopy(original_config) + if is_deepspeed_zero3_enabled(): + with deepspeed.zero.Init(): + model = model_class(config) + else: + model = model_class(config) + model.to(torch_device) + + model_embed_pre_resize = model.get_input_embeddings() + type_model_embed_pre_resize = type(model_embed_pre_resize) + + if self.model_tester.is_training is False: + model.eval() + + model_vocab_size = config.get_text_config().vocab_size + # Retrieve the embeddings and clone theme + model_embed = model.resize_token_embeddings(model_vocab_size) + cloned_embeddings = model_embed.weight.clone() + + # Check that resizing the token embeddings with a larger vocab size increases the model's vocab size + model_embed = model.resize_token_embeddings(model_vocab_size + 10) + new_model_vocab_size = model.config.get_text_config().vocab_size + self.assertEqual(new_model_vocab_size, model_vocab_size + 10) + # Check that it actually resizes the embeddings matrix + self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] + 10) + # Check to make 
sure the type of embeddings returned post resizing is the same as the type of the input
+            type_model_embed_post_resize = type(model_embed)
+            self.assertEqual(type_model_embed_pre_resize, type_model_embed_post_resize)
+            # Check that added embeddings mean is close to the old embeddings mean
+            if is_deepspeed_zero3_enabled():
+                with deepspeed.zero.GatheredParameters(model_embed.weight, modifier_rank=None):
+                    old_embeddings_mean = torch.mean(model_embed.weight.data[:-10, :], axis=0)
+                    new_embeddings_mean = torch.mean(model_embed.weight.data[-10:, :], axis=0)
+            else:
+                old_embeddings_mean = torch.mean(model_embed.weight.data[:-10, :], axis=0)
+                new_embeddings_mean = torch.mean(model_embed.weight.data[-10:, :], axis=0)
+            torch.testing.assert_close(old_embeddings_mean, new_embeddings_mean, atol=1e-3, rtol=1e-1)
+
+            # Check that the model can still do a forward pass successfully (every parameter should be resized)
+            if not is_deepspeed_zero3_enabled():
+                # A distributed launcher is needed for the forward pass when deepspeed is enabled
+                model(**self._prepare_for_class(inputs_dict, model_class))
+
+            # Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size
+            model_embed = model.resize_token_embeddings(model_vocab_size - 15)
+            new_model_vocab_size = model.config.get_text_config().vocab_size
+            self.assertEqual(new_model_vocab_size, model_vocab_size - 15)
+            # Check that it actually resizes the embeddings matrix
+            self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] - 15)
+
+            # Check that the model can still do a forward pass successfully (every parameter should be resized)
+            # Input ids should be clamped to the maximum size of the vocabulary
+            inputs_dict["input_ids"].clamp_(max=model_vocab_size - 15 - 1)
+
+            # make sure that decoder_input_ids are resized as well
+            if not is_deepspeed_zero3_enabled():
+                # A distributed launcher is needed for the forward pass when deepspeed is enabled
+                if "decoder_input_ids" in inputs_dict:
+                    inputs_dict["decoder_input_ids"].clamp_(max=model_vocab_size - 15 - 1)
+                model(**self._prepare_for_class(inputs_dict, model_class))
+
+            # Check that adding and removing tokens has not modified the first part of the embedding matrix.
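+            # (element-wise comparison of the original rows; any mismatch marks the two sets of weights as different)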
+ models_equal = True + for p1, p2 in zip(cloned_embeddings, model_embed.weight): + if p1.data.ne(p2.data).sum() > 0: + models_equal = False + + self.assertTrue(models_equal) + + del model + if is_deepspeed_zero3_enabled(): + with deepspeed.zero.Init(): + model = model_class(config) + else: + model = model_class(config) + model.to(torch_device) + + model_vocab_size = config.get_text_config().vocab_size + model.resize_token_embeddings(model_vocab_size + 10, pad_to_multiple_of=1) + new_model_vocab_size = model.config.get_text_config().vocab_size + self.assertTrue(new_model_vocab_size + 10, model_vocab_size) + + model_embed = model.resize_token_embeddings(model_vocab_size, pad_to_multiple_of=64) + new_model_vocab_size = model.config.get_text_config().vocab_size + self.assertTrue(model_embed.weight.shape[0] // 64, 0) + + self.assertTrue(model_embed.weight.shape[0], new_model_vocab_size) + self.assertTrue(new_model_vocab_size, model.vocab_size) + + model_embed = model.resize_token_embeddings(model_vocab_size + 13, pad_to_multiple_of=64) + self.assertTrue(model_embed.weight.shape[0] // 64, 0) + + # Check that resizing a model to a multiple of pad_to_multiple leads to a model of exactly that size + target_dimension = 128 + model_embed = model.resize_token_embeddings(target_dimension, pad_to_multiple_of=64) + self.assertTrue(model_embed.weight.shape[0], target_dimension) + + with self.assertRaisesRegex( + ValueError, + "Asking to pad the embedding matrix to a multiple of `1.3`, which is not and integer. Please make sure to pass an integer", + ): + model.resize_token_embeddings(model_vocab_size, pad_to_multiple_of=1.3) + + # Test when `vocab_size` is smaller than `hidden_size`. + del model + config.vocab_size = 4 + config.pad_token_id = 4 # Ignore copy + if is_deepspeed_zero3_enabled(): + with deepspeed.zero.Init(): + model = model_class(config) + else: + model = model_class(config) + model.to(torch_device) + + model_vocab_size = config.get_text_config().vocab_size + # Retrieve the embeddings and clone theme + model_embed = model.resize_token_embeddings(model_vocab_size) + cloned_embeddings = model_embed.weight.clone() + + # Check that resizing the token embeddings with a larger vocab size increases the model's vocab size + model_embed = model.resize_token_embeddings(model_vocab_size + 10) + new_model_vocab_size = model.config.get_text_config().vocab_size + self.assertEqual(new_model_vocab_size, model_vocab_size + 10) + # Check that it actually resizes the embeddings matrix + self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] + 10) + # Check to make sure the type of embeddings returned post resizing is same as type of input + type_model_embed_post_resize = type(model_embed) + self.assertEqual(type_model_embed_pre_resize, type_model_embed_post_resize) + # Check that added embeddings mean is close to the old embeddings mean + if is_deepspeed_zero3_enabled(): + with deepspeed.zero.GatheredParameters(model_embed.weight, modifier_rank=None): + old_embeddings_mean = torch.mean(model_embed.weight.data[:-10, :], axis=0) + new_embeddings_mean = torch.mean(model_embed.weight.data[-10:, :], axis=0) + else: + old_embeddings_mean = torch.mean(model_embed.weight.data[:-10, :], axis=0) + new_embeddings_mean = torch.mean(model_embed.weight.data[-10:, :], axis=0) + torch.testing.assert_close(old_embeddings_mean, new_embeddings_mean, atol=1e-3, rtol=1e-1) + + @unittest.skip(reason="Some undefined behavior encountered with test versions of this model. 
Skip for now.") + def test_cpu_offload(self): + pass + + @unittest.skip(reason="Some undefined behavior encountered with test versions of this model. Skip for now.") + def test_disk_offload_bin(self): + pass + + @unittest.skip(reason="Some undefined behavior encountered with test versions of this model. Skip for now.") + def test_disk_offload_safetensors(self): + pass + + @is_flaky(max_attempts=5, description="flaky on some models.") + def test_save_load(self): + super().test_save_load() + + +class MoshiTester: + def __init__( + self, + parent, + batch_size=4, # need batch_size != num_hidden_layers + seq_length=7, + is_training=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=2, + num_attention_heads=8, + intermediate_size=4, + hidden_act="silu", + rms_norm_eps=0.001, + ffn_dim=32, + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=100, + pad_token_id=25, + bos_token_id=25, + num_codebooks=4, + audio_encoder_type="mimi", + attn_implementation="eager", + depth_hidden_size=16, + depth_num_hidden_layers=2, + depth_max_position_embeddings=5, + depth_num_attention_heads=8, + depth_ffn_dim=16, + depth_sliding_window=4, + mimi_intermediate_size=40, + mimi_hidden_size=32, + mimi_num_filters=8, + mimi_num_residual_layers=1, + mimi_upsampling_ratios=[8, 4], + mimi_codebook_size=64, + mimi_vector_quantization_hidden_dimension=64, + mimi_codebook_dim=64, + mimi_upsample_groups=32, + mimi_num_hidden_layers=2, + mimi_num_attention_heads=2, + mimi_num_key_value_heads=2, + mimi_sliding_window=3, + sampling_rate=800, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.rms_norm_eps = rms_norm_eps + self.ffn_dim = ffn_dim + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.pad_token_id = pad_token_id + self.bos_token_id = bos_token_id + self.num_codebooks = num_codebooks + self.attn_implementation = attn_implementation + self.depth_hidden_size = depth_hidden_size + self.depth_num_hidden_layers = depth_num_hidden_layers + self.depth_max_position_embeddings = depth_max_position_embeddings + self.depth_num_attention_heads = depth_num_attention_heads + self.depth_ffn_dim = depth_ffn_dim + self.depth_sliding_window = depth_sliding_window + + self.audio_encoder_type = audio_encoder_type + self.mimi_intermediate_size = mimi_intermediate_size + self.mimi_hidden_size = mimi_hidden_size + self.mimi_num_filters = mimi_num_filters + self.mimi_num_residual_layers = mimi_num_residual_layers + self.mimi_upsampling_ratios = mimi_upsampling_ratios + self.mimi_codebook_size = mimi_codebook_size + self.mimi_vector_quantization_hidden_dimension = mimi_vector_quantization_hidden_dimension + self.mimi_codebook_dim = mimi_codebook_dim + self.mimi_upsample_groups = mimi_upsample_groups + self.mimi_num_hidden_layers = mimi_num_hidden_layers + self.mimi_num_attention_heads = mimi_num_attention_heads + self.mimi_num_key_value_heads = mimi_num_key_value_heads + self.mimi_sliding_window = mimi_sliding_window + self.sampling_rate = sampling_rate + + self.num_hidden_states_types = 2 + + def prepare_config_and_inputs(self, batch_size=None): + batch_size = 
self.batch_size if batch_size is None else batch_size + + input_ids = ids_tensor([batch_size, self.seq_length], self.vocab_size) + + moshi_audio_codes = ids_tensor([batch_size, self.num_codebooks, self.seq_length], self.mimi_codebook_size) + user_audio_codes = ids_tensor([batch_size, self.num_codebooks, self.seq_length], self.mimi_codebook_size) + attention_mask = input_ids.ne(self.pad_token_id) + + config = self.get_config() + inputs_dict = { + "input_ids": input_ids, + "moshi_audio_codes": moshi_audio_codes, + "user_audio_codes": user_audio_codes, + "attention_mask": attention_mask, + } + return config, inputs_dict + + def get_config(self): + mimi_dict_config = { + "model_type": self.audio_encoder_type, + "audio_channels": 1, + "hidden_size": self.mimi_hidden_size, + "num_filters": self.mimi_num_filters, + "num_residual_layers": self.mimi_num_residual_layers, + "upsampling_ratios": self.mimi_upsampling_ratios, + "codebook_size": self.mimi_codebook_size, + "vector_quantization_hidden_dimension": self.mimi_vector_quantization_hidden_dimension, + "upsample_groups": self.mimi_upsample_groups, + "num_hidden_layers": self.mimi_num_hidden_layers, + "num_attention_heads": self.mimi_num_attention_heads, + "num_key_value_heads": self.mimi_num_key_value_heads, + "sliding_window": self.mimi_sliding_window, + "codebook_dim": self.mimi_codebook_dim, + "use_cache": False, + "sampling_rate": self.sampling_rate, + } + + depth_dict_config = { + "hidden_size": self.depth_hidden_size, + "num_hidden_layers": self.depth_num_hidden_layers, + "max_position_embeddings": self.depth_max_position_embeddings, + "num_attention_heads": self.depth_num_attention_heads, + "ffn_dim": self.depth_ffn_dim, + "sliding_window": self.depth_sliding_window, + } + + config = MoshiConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + d_ff=self.intermediate_size, + num_codebooks=self.num_codebooks, + rms_norm_eps=self.rms_norm_eps, + tie_word_embeddings=False, + pad_token_id=self.pad_token_id, + bos_token_id=self.bos_token_id, + ffn_dim=self.ffn_dim, + audio_encoder_config=mimi_dict_config, + depth_decoder_config=depth_dict_config, + attn_implementation=self.attn_implementation, + ) + return config + + def prepare_config_and_inputs_for_common(self, batch_size=None): + config, inputs_dict = self.prepare_config_and_inputs(batch_size) + return config, inputs_dict + + +@require_torch +class MoshiTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): + all_model_classes = (MoshiForConditionalGeneration,) if is_torch_available() else () + all_generative_model_classes = (MoshiForConditionalGeneration,) if is_torch_available() else () + test_pruning = False # training is not supported yet for Moshi + test_headmasking = False + test_resize_embeddings = False + test_torchscript = False + + def setUp(self): + self.model_tester = MoshiTester(self) + + # special case for labels + def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): + inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) + + if return_labels: + inputs_dict["text_labels"] = torch.zeros( + (self.model_tester.batch_size, self.model_tester.seq_length), + dtype=torch.long, + device=torch_device, + ) + return inputs_dict + + def _get_input_ids_and_config(self, batch_size=2): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common(batch_size) + input_ids = 
inputs_dict.pop("input_ids").to(torch_device) + attention_mask = inputs_dict.pop("attention_mask").to(torch_device) + + # Make sure we only return `input_ids`. + # Note that audio_codes will still be generated internally, so the ability to test audio codes is still there. + # There are further tests to test that audio waveforms and codes are well generated. + inputs_dict["return_audio_waveforms"] = False + inputs_dict["return_audio_codes"] = False + inputs_dict["concat_unconditional_inputs"] = False + + return config, input_ids, attention_mask, inputs_dict + + def prepare_config_and_inputs_for_generate(self, batch_size=2): + config, filtered_inputs_dict = super().prepare_config_and_inputs_for_generate() + + # Make sure we only return `input_ids`. + # Note that audio_codes will still be generated internally, so the ability to test audio codes is still there. + # There are further tests to test that audio waveforms and codes are well generated. + filtered_inputs_dict["return_audio_waveforms"] = False + filtered_inputs_dict["return_audio_codes"] = False + filtered_inputs_dict["concat_unconditional_inputs"] = False + + return config, filtered_inputs_dict + + def _check_hidden_states_for_generate( + self, batch_size, hidden_states, min_length, max_length, config, use_cache=False, num_beam_groups=1 + ): + # Overwrite because the generate method actually alway uses `inputs_embeds` so `use_cache` is always `True` + self.assertIsInstance(hidden_states, tuple) + self.assertListEqual( + [isinstance(iter_hidden_states, tuple) for iter_hidden_states in hidden_states], + [True] * len(hidden_states), + ) + self.assertEqual(len(hidden_states), (max_length - min_length) * num_beam_groups) + + for idx, iter_hidden_states in enumerate(hidden_states): + seq_len = min_length if idx == 0 else 1 + expected_shape = (batch_size * num_beam_groups, seq_len, config.hidden_size) + # check hidden size + self.assertListEqual( + [layer_hidden_states.shape for layer_hidden_states in iter_hidden_states], + [expected_shape] * len(iter_hidden_states), + ) + + def _check_outputs(self, output, input_ids, config, use_cache=False, num_return_sequences=1): + # Overwrite because the generate method actually alway uses `inputs_embeds` so `use_cache` is always `True` + super()._check_outputs(output, input_ids, config, use_cache=True, num_return_sequences=num_return_sequences) + + def _check_hidden_states_for_generate( + self, batch_size, hidden_states, min_length, max_length, config, use_cache=False, num_beam_groups=1 + ): + # Overwrite because the generate method actually alway uses `inputs_embeds` so `use_cache` is always `True` + self.assertIsInstance(hidden_states, tuple) + self.assertListEqual( + [isinstance(iter_hidden_states, tuple) for iter_hidden_states in hidden_states], + [True] * len(hidden_states), + ) + self.assertEqual(len(hidden_states), (max_length - min_length) * num_beam_groups) + + for idx, iter_hidden_states in enumerate(hidden_states): + seq_len = 1 + expected_shape = (batch_size * num_beam_groups, seq_len, config.hidden_size) + # check hidden size + self.assertListEqual( + [layer_hidden_states.shape for layer_hidden_states in iter_hidden_states], + [expected_shape] * len(iter_hidden_states), + ) + + def _check_attentions_for_generate( + self, batch_size, attentions, min_length, max_length, config, use_cache=False, num_beam_groups=1 + ): + # Overwrite because the generate method actually alway uses `inputs_embeds` so `use_cache` is always `True` + self.assertIsInstance(attentions, tuple) + self.assertListEqual( 
+ [isinstance(iter_attentions, tuple) for iter_attentions in attentions], [True] * len(attentions) + ) + self.assertEqual(len(attentions), (max_length - min_length) * num_beam_groups) + + for idx, iter_attentions in enumerate(attentions): + tgt_len = 1 + src_len = min_length + idx + + expected_shape = ( + batch_size * num_beam_groups, + config.num_attention_heads, + tgt_len, + src_len, + ) + # check attn size + self.assertListEqual( + [layer_attention.shape for layer_attention in iter_attentions], [expected_shape] * len(iter_attentions) + ) + + def test_initialization(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + configs_no_init = _config_zero_init(config) + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + for name, param in model.named_parameters(): + uniform_init_parms = ["conv", "input_proj", "output_proj"] + if param.requires_grad: + if any(x in name for x in uniform_init_parms): + self.assertTrue( + -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + + @pytest.mark.generate + def test_generate_from_inputs_embeds_decoder_only(self): + for model_class in self.all_generative_model_classes: + config, input_ids, _, inputs_dict = self._get_input_ids_and_config() + + model = model_class(config).to(torch_device).eval() + + # Traditional way of generating text + outputs_from_ids = model.generate( + input_ids, max_new_tokens=5, return_dict_in_generate=True, output_scores=True, **inputs_dict + ) + self.assertEqual(outputs_from_ids.sequences.shape, (input_ids.shape[0], input_ids.shape[1] + 5)) + + # Same thing, but from input embeddings (`input_ids` is passed so the prompt is present in the output) + inputs_embeds = model.get_input_embeddings()(input_ids) + outputs_from_embeds = model.generate( + input_ids, + inputs_embeds=inputs_embeds, + max_new_tokens=5, + return_dict_in_generate=True, + output_scores=True, + **inputs_dict, + ) + + # But if we pass different inputs_embeds, we should get different outputs (the output text may be the + # same, but the logits will almost surely be different) + random_embeds = torch.rand_like(inputs_embeds) + outputs_from_rand_embeds = model.generate( + input_ids, + inputs_embeds=random_embeds, + max_new_tokens=5, + return_dict_in_generate=True, + output_scores=True, + **inputs_dict, + ) + for i in range(len(outputs_from_rand_embeds.scores)): + self.assertFalse(torch.allclose(outputs_from_embeds.scores[i], outputs_from_rand_embeds.scores[i])) + + # input_ids is not a required input -- if we don't pass it, the newly generated tokens will be the same + outputs_from_embeds_wo_ids = model.generate( + inputs_embeds=inputs_embeds, + max_new_tokens=5, + return_dict_in_generate=True, + output_scores=True, + **inputs_dict, + ) + self.assertListEqual( + outputs_from_embeds.sequences[:, inputs_embeds.shape[1] :].tolist(), + outputs_from_embeds_wo_ids.sequences.tolist(), + ) + + @unittest.skip(reason="Continuing from past key values is not straightforward as we're dealing with 3 inputs") + def test_generate_continue_from_past_key_values(self): + pass + + @unittest.skip("Moshi doesn't support contrastive generation yet.") + def test_contrastive_generate(self): + pass + + @unittest.skip("Moshi doesn't support contrastive generation yet.") + def test_contrastive_generate_dict_outputs_use_cache(self): + pass + + @unittest.skip("Moshi doesn't support contrastive generation yet.") + def 
test_contrastive_generate_low_memory(self): + pass + + @unittest.skip("Adapting this test is costly. `test_eager_matches_sdpa_generate` tests this already.") + @parameterized.expand([("float16",), ("bfloat16",), ("float32",)]) + @require_torch_sdpa + @slow + def test_eager_matches_sdpa_inference(self, torch_dtype: str): + pass + + @unittest.skip(reason="The Moshi model does not have support dynamic compile yet") + def test_sdpa_can_compile_dynamic(self): + pass + + @pytest.mark.generate + def test_left_padding_compatibility(self): + # NOTE: left-padding results in small numerical differences. This is expected. + # See https://github.com/huggingface/transformers/issues/25420#issuecomment-1775317535 + + # Then, test left-padding + + for model_class in self.all_generative_model_classes: + config, input_ids, attention_mask, input_dict = self._get_input_ids_and_config() + model = model_class(config).to(torch_device).eval() + + # no cache as some models require special cache classes to be init outside forward + model.generation_config.use_cache = False + + # Without padding + next_logits_wo_padding = model(input_ids=input_ids, attention_mask=attention_mask, **input_dict).logits[ + :, -1, : + ] + + # With left-padding (length 32) + # can hardcode pad_token to be 0 as we'll do attn masking anyway + pad_token_id = ( + config.get_text_config().pad_token_id if config.get_text_config().pad_token_id is not None else 0 + ) + pad_size = (input_ids.shape[0], 32) + padding = torch.ones(pad_size, dtype=input_ids.dtype, device=torch_device) * pad_token_id + padded_input_ids = torch.cat((padding, input_ids), dim=1) + + padded_attention_mask = torch.cat((torch.zeros_like(padding), attention_mask), dim=1) + + padding = ( + torch.ones( + (pad_size[0], self.model_tester.num_codebooks, 32), dtype=input_ids.dtype, device=torch_device + ) + * config.audio_vocab_size + ) + padded_moshi_audio_codes = torch.cat((padding, input_dict["moshi_audio_codes"]), dim=2) + padded_user_audio_codes = torch.cat((padding, input_dict["user_audio_codes"]), dim=2) + + model_kwargs = { + "input_ids": padded_input_ids, + "attention_mask": padded_attention_mask, + "moshi_audio_codes": padded_moshi_audio_codes, + "user_audio_codes": padded_user_audio_codes, + } + + next_logits_with_padding = model(**model_kwargs).logits[:, -1, :] + + # They should result in very similar logits + self.assertTrue(torch.allclose(next_logits_wo_padding, next_logits_with_padding, atol=1e-5)) + + @require_torch_sdpa + @slow + @is_flaky(max_attempts=5, description="flaky on some models.") + def test_eager_matches_sdpa_generate(self): + if not self.has_attentions: + self.skipTest(reason="Model architecture does not support attentions") + + max_new_tokens = 5 + + if len(self.all_generative_model_classes) == 0: + self.skipTest(f"{self.__class__.__name__} tests a model that does support generate: skipping this test") + + for model_class in self.all_generative_model_classes: + if not model_class._supports_sdpa: + self.skipTest(f"{model_class.__name__} does not support SDPA") + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + dummy_input = inputs_dict[model_class.main_input_name] + if dummy_input.dtype in [torch.float32, torch.bfloat16]: + dummy_input = dummy_input.to(torch.float16) + + inputs_dict[model_class.main_input_name] = dummy_input + + # make sure that all models have enough positions for generation + if hasattr(config, "max_position_embeddings"): + config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1 + 
+ model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + + model_sdpa = model_class.from_pretrained( + tmpdirname, + torch_dtype=torch.float16, + low_cpu_mem_usage=True, + ).to(torch_device) + + self.assertTrue(model_sdpa.config._attn_implementation == "sdpa") + + model_eager = model_class.from_pretrained( + tmpdirname, + torch_dtype=torch.float16, + low_cpu_mem_usage=True, + attn_implementation="eager", + ).to(torch_device) + + self.assertTrue(model_eager.config._attn_implementation == "eager") + + for name, submodule in model_eager.named_modules(): + class_name = submodule.__class__.__name__ + if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name: + raise ValueError("The eager model should not have SDPA attention layers") + + has_sdpa = False + for name, submodule in model_sdpa.named_modules(): + class_name = submodule.__class__.__name__ + if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name: + has_sdpa = True + break + if not has_sdpa: + raise ValueError("The SDPA model should have SDPA attention layers") + + # Just test that a large cache works as expected + res_eager = model_eager.generate( + **inputs_dict, + max_new_tokens=max_new_tokens, + do_sample=False, + depth_decoder_do_sample=False, + ) + + res_sdpa = model_sdpa.generate( + **inputs_dict, + max_new_tokens=max_new_tokens, + do_sample=False, + depth_decoder_do_sample=False, + ) + + self.assertTrue(torch.allclose(res_eager.sequences, res_sdpa.sequences)) + self.assertTrue(torch.allclose(res_eager.audio_sequences, res_sdpa.audio_sequences)) + + @pytest.mark.generate + def test_generate_without_input_ids(self): + config, _, _, _ = self._get_input_ids_and_config() + + for model_class in self.all_generative_model_classes: + model = model_class(config).to(torch_device) + model.eval() + + output_ids_generate = model.generate( + do_sample=False, max_new_tokens=self.max_new_tokens, remove_invalid_values=True + ) + self.assertIsNotNone(output_ids_generate) + + @unittest.skip(reason="The audio encoder has no gradients.") + def test_training_gradient_checkpointing(self): + pass + + @unittest.skip(reason="The audio encoder has no gradients.") + def test_training_gradient_checkpointing_use_reentrant(self): + pass + + @unittest.skip(reason="The audio encoder has no gradients.") + def test_training_gradient_checkpointing_use_reentrant_false(self): + pass + + def test_generate_from_input_values(self): + for model_class in self.all_generative_model_classes: + config, input_ids, _, _ = self._get_input_ids_and_config() + + model = model_class(config).to(torch_device).eval() + + input_values_length = int( + self.model_tester.seq_length * config.sampling_rate / config.audio_encoder_config.frame_rate + ) + + user_input_values = floats_tensor((input_ids.shape[0], 1, input_values_length)) + moshi_input_values = floats_tensor((input_ids.shape[0], 1, input_values_length)) + + user_audio_codes = model.audio_encoder.encode(user_input_values, num_quantizers=model.num_codebooks)[0] + moshi_audio_codes = model.audio_encoder.encode(moshi_input_values, num_quantizers=model.num_codebooks)[0] + + outputs_from_audio_codes = model.generate( + input_ids, max_new_tokens=5, user_audio_codes=user_audio_codes, moshi_audio_codes=moshi_audio_codes + ) + + outputs_from_audio_values = model.generate( + input_ids, max_new_tokens=5, user_input_values=user_input_values, moshi_input_values=moshi_input_values + ) + + self.assertTrue((outputs_from_audio_values.sequences == 
outputs_from_audio_codes.sequences).all())
+            self.assertTrue(
+                torch.allclose(outputs_from_audio_codes.audio_sequences, outputs_from_audio_values.audio_sequences)
+            )
+
+    def test_generate_depth_decoder_kwargs(self):
+        # test sampling and beam search
+        for model_class in self.all_generative_model_classes:
+            config, input_ids, _, input_dict = self._get_input_ids_and_config()
+
+            model = model_class(config).to(torch_device).eval()
+
+            model.generate(input_ids, max_new_tokens=5, **input_dict, depth_decoder_do_sample=True)
+
+            model.generate(
+                input_ids, max_new_tokens=5, **input_dict, depth_decoder_do_sample=True, depth_decoder_num_beams=5
+            )
+
+    def test_generate_from_unconditional(self):
+        # test unconditional generation, with and without explicitly passing the unconditional inputs
+        for model_class in self.all_generative_model_classes:
+            config, input_ids, _, input_dict = self._get_input_ids_and_config()
+
+            model = model_class(config).to(torch_device).eval()
+
+            # check batch size > 1
+            model.generate(
+                **model.get_unconditional_inputs(num_samples=4), max_new_tokens=5, concat_unconditional_inputs=False
+            )
+
+            # check that unconditional inputs and no inputs at all give the same results
+            outputs_from_unconditional = model.generate(
+                **model.get_unconditional_inputs(num_samples=1), max_new_tokens=5, concat_unconditional_inputs=False
+            )
+            outputs_from_none = model.generate(max_new_tokens=5)
+
+            self.assertTrue((outputs_from_unconditional.sequences == outputs_from_none.sequences).all())
+            self.assertTrue(
+                torch.allclose(outputs_from_unconditional.audio_sequences, outputs_from_none.audio_sequences)
+            )
+
+    @unittest.skip(reason="Compile is not yet supported in Moshi models")
+    def test_sdpa_can_dispatch_on_flash(self):
+        pass
+
+    @unittest.skip(reason="Some undefined behavior encountered with test versions of this model. Skip for now.")
+    def test_cpu_offload(self):
+        pass
+
+    @unittest.skip(reason="Some undefined behavior encountered with test versions of this model. Skip for now.")
+    def test_disk_offload_bin(self):
+        pass
+
+    @unittest.skip(reason="Some undefined behavior encountered with test versions of this model.
Skip for now.") + def test_disk_offload_safetensors(self): + pass + + @is_flaky(max_attempts=5, description="flaky on some models.") + def test_save_load(self): + super().test_save_load() + + +def place_dict_on_device(dict_to_place, device): + for key in dict_to_place: + if dict_to_place[key] is not None and isinstance(dict_to_place[key], torch.Tensor): + dict_to_place[key] = dict_to_place[key].to(device) + return dict_to_place + + +@require_torch +class MoshiIntegrationTests(unittest.TestCase): + @cached_property + def feature_extractor(self): + return AutoFeatureExtractor.from_pretrained("kmhf/hf-moshiko") + + @cached_property + def tokenizer(self): + return AutoTokenizer.from_pretrained("kmhf/hf-moshiko") + + def _load_datasample(self): + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + dataset = ds.cast_column("audio", Audio(sampling_rate=self.feature_extractor.sampling_rate)) + # automatic decoding with librispeech + speech_sample = dataset.sort("id")[0]["audio"]["array"] + return speech_sample + + @slow + def test_moshika_conditional_greedy(self): + model = MoshiForConditionalGeneration.from_pretrained( + "kmhf/hf-moshika", torch_dtype=torch.float16, device_map="auto" + ) + inputs = self.feature_extractor(self._load_datasample(), return_tensors="pt").to( + device=torch_device, dtype=torch.float16 + ) + + user_audio_codes = model.audio_encoder.encode(**inputs, num_quantizers=8).audio_codes + + input_ids = self.tokenizer.encode(" Hello,", return_tensors="pt").to( + torch_device + ) + + # fmt: off + moshi_audio_codes = [[[1049, 127, 1880, 972, 972, 1156, 1913, 415, 1933], + [1700, 243, 91, 91, 91, 745, 1478, 638, 57], + [1626, 457, 457, 457, 457, 1839, 200, 2011, 1142], + [546, 290, 390, 390, 290, 1408, 1812, 1187, 1911], + [306, 306, 1314, 1314, 1314, 759, 796, 854, 1466], + [1443, 1443, 1030, 317, 347, 1178, 613, 1576, 2023], + [1871, 428, 1433, 1433, 1978, 1405, 1755, 820, 610], + [2008, 1744, 1511, 568, 1533, 550, 237, 1412, 1401]]] + # fmt: on + + moshi_audio_codes = torch.tensor(moshi_audio_codes, device=torch_device) + user_audio_codes = user_audio_codes[:, :, : moshi_audio_codes.shape[-1]] + + model_outputs = model.generate( + user_audio_codes=user_audio_codes, + moshi_audio_codes=moshi_audio_codes, + input_ids=input_ids, + do_sample=False, + depth_decoder_do_sample=False, + return_audio_codes=True, + max_new_tokens=2, + ) + + expected_text_token = 452 + expected_audio_tokens = [916, 1396, 1238, 579, 1105, 914, 1257, 810] # fmt: skip + + self.assertTrue(expected_text_token == model_outputs.sequences[0, -2].cpu().item()) + self.assertTrue(expected_audio_tokens == model_outputs.audio_codes[0, :, -1].cpu().tolist()) + + @slow + def test_moshiko_greedy_unconditional_fp16_eager(self): + model = MoshiForConditionalGeneration.from_pretrained( + "kmhf/hf-moshiko", torch_dtype=torch.float16, device_map="auto" + ) + some_expected_audio_tokens = [[1049, 127], [1700, 243], [1626, 457], [546, 290], [306, 306], [1443, 1443], [1871, 428], [2008, 1744]] # fmt: skip + + model_outputs = model.generate( + do_sample=False, depth_decoder_do_sample=False, return_audio_codes=True, max_new_tokens=10 + ) + + # eager equivalence is not as strict as sdpa. 
+ self.assertTrue(some_expected_audio_tokens == model_outputs.audio_codes[0, :, :2].cpu().tolist()) + + @slow + def test_moshiko_greedy_unconditional_fp32(self): + model = MoshiForConditionalGeneration.from_pretrained( + "kmhf/hf-moshiko", torch_dtype=torch.float32, device_map="auto" + ) + + expected_audio_codesum = 72065 + expected_text_tokens = [3, 3, 3, 0, 11725, 261, 3, 3, 3, 3] # fmt: skip + some_expected_audio_tokens = [[1049, 127], [1700, 243], [1626, 457], [546, 290], [306, 306], [1443, 1443], [1871, 428], [2008, 1744]] # fmt: skip + + model_outputs = model.generate( + do_sample=False, depth_decoder_do_sample=False, return_audio_codes=True, max_new_tokens=10 + ) + + # make sure audio encoded codes are correct + audio_code_sums = model_outputs.audio_codes.sum().item() + self.assertTrue(np.abs(audio_code_sums - expected_audio_codesum) <= (3e-3 * audio_code_sums)) + + self.assertTrue(expected_text_tokens == model_outputs.sequences[0, 1:].cpu().tolist()) + self.assertTrue(some_expected_audio_tokens == model_outputs.audio_codes[0, :, :2].cpu().tolist()) + + @slow + @require_torch_fp16 + def test_moshiko_greedy_unconditional_fp16(self): + model = MoshiForConditionalGeneration.from_pretrained( + "kmhf/hf-moshiko", torch_dtype=torch.float16, device_map="auto" + ) + + expected_audio_codesum = 72065 + expected_text_tokens = [3, 3, 3, 0, 11725, 261, 3, 3, 3, 3] # fmt: skip + some_expected_audio_tokens = [[1049, 127], [1700, 243], [1626, 457], [546, 290], [306, 306], [1443, 1443], [1871, 428], [2008, 1744]] # fmt: skip + + model_outputs = model.generate( + do_sample=False, depth_decoder_do_sample=False, return_audio_codes=True, max_new_tokens=10 + ) + + # make sure audio encoded codes are correct + audio_code_sums = model_outputs.audio_codes.sum().item() + self.assertTrue(np.abs(audio_code_sums - expected_audio_codesum) <= (3e-3 * audio_code_sums)) + + self.assertTrue(expected_text_tokens == model_outputs.sequences[0, 1:].cpu().tolist()) + self.assertTrue(some_expected_audio_tokens == model_outputs.audio_codes[0, :, :2].cpu().tolist()) + + @slow + @require_torch_fp16 + def test_moshika_greedy_unconditional_fp16(self): + model = MoshiForConditionalGeneration.from_pretrained( + "kmhf/hf-moshika", torch_dtype=torch.float16, device_map="auto" + ) + + expected_audio_codesum = 72932 + expected_text_tokens = [3, 3, 3, 0, 667, 263, 3, 3, 0, 705] # fmt: skip + some_expected_audio_tokens = [[1049, 127], [1700, 243], [1626, 457], [546, 290], [306, 306], [1443, 347], [1871, 428], [2008, 2008]] # fmt: skip + + model_outputs = model.generate( + do_sample=False, depth_decoder_do_sample=False, return_audio_codes=True, max_new_tokens=10 + ) + + # make sure audio encoded codes are correct + audio_code_sums = model_outputs.audio_codes.sum().item() + self.assertTrue(np.abs(audio_code_sums - expected_audio_codesum) <= 2048) + + self.assertTrue(expected_text_tokens == model_outputs.sequences[0, 1:].cpu().tolist()) + self.assertTrue(some_expected_audio_tokens == model_outputs.audio_codes[0, :, :2].cpu().tolist()) diff --git a/tests/models/moshi/test_tokenization_moshi.py b/tests/models/moshi/test_tokenization_moshi.py new file mode 100644 index 00000000000000..ad3a34a197f0e4 --- /dev/null +++ b/tests/models/moshi/test_tokenization_moshi.py @@ -0,0 +1,447 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +import pickle +import shutil +import tempfile +import unittest + +from transformers import ( + SPIECE_UNDERLINE, + AddedToken, + AutoTokenizer, + PreTrainedTokenizerFast, + SpecialTokensMixin, +) +from transformers.convert_slow_tokenizer import MoshiConverter +from transformers.testing_utils import ( + get_tests_dir, + nested_simplify, + require_sentencepiece, + require_tokenizers, + require_torch, +) + +from ...test_tokenization_common import SMALL_TRAINING_CORPUS, TokenizerTesterMixin + + +SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model") + + +@require_sentencepiece +@require_tokenizers +class MoshiTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = ["kmhf/hf-moshiko"] + rust_tokenizer_class = PreTrainedTokenizerFast + + test_slow_tokenizer = False + test_rust_tokenizer = True + from_pretrained_kwargs = {} + + def setUp(self): + super().setUp() + + # We have a SentencePiece fixture for testing + tokenizer = PreTrainedTokenizerFast( + tokenizer_object=MoshiConverter(vocab_file=SAMPLE_VOCAB).converted(), + bos_token="", + unk_token="", + eos_token="", + ) + tokenizer.pad_token = tokenizer.eos_token + tokenizer.save_pretrained(self.tmpdirname) + + def get_rust_tokenizer(self, **kwargs) -> PreTrainedTokenizerFast: + return self.rust_tokenizer_class.from_pretrained(self.tmpdirname, **kwargs) + + @unittest.skip(reason="No slow tokenizer") + def test_added_tokens_serialization(self): + pass + + @unittest.skip(reason="PreTrainedTokenizerFast doesn't have tokenizer_file in its signature") + def test_rust_tokenizer_signature(self): + pass + + @unittest.skip(reason="No slow tokenizer") + def test_encode_decode_with_spaces(self): + pass + + def test_full_tokenizer(self): + tokenizer = PreTrainedTokenizerFast( + tokenizer_object=MoshiConverter(vocab_file=SAMPLE_VOCAB).converted(), + bos_token="", + unk_token="", + eos_token="", + ) + + tokens = tokenizer.tokenize("This is a test") + self.assertListEqual(tokens, ["▁This", "▁is", "▁a", "▁t", "est"]) + + self.assertListEqual( + tokenizer.convert_tokens_to_ids(tokens), + [285, 46, 10, 170, 382], + ) + + tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.") + self.assertListEqual( + tokens, + [ + SPIECE_UNDERLINE + "I", + SPIECE_UNDERLINE + "was", + SPIECE_UNDERLINE + "b", + "or", + "n", + SPIECE_UNDERLINE + "in", + SPIECE_UNDERLINE + "", + "9", + "2", + "0", + "0", + "0", + ",", + SPIECE_UNDERLINE + "and", + SPIECE_UNDERLINE + "this", + SPIECE_UNDERLINE + "is", + SPIECE_UNDERLINE + "f", + "al", + "s", + "é", + ".", + ], + ) + ids = tokenizer.convert_tokens_to_ids(tokens) + self.assertListEqual( + ids, + [8, 21, 84, 55, 24, 19, 7, 0, 602, 347, 347, 347, 3, 12, 66, 46, 72, 80, 6, 0, 4], + ) + + back_tokens = tokenizer.convert_ids_to_tokens(ids) + self.assertListEqual( + back_tokens, + [ + SPIECE_UNDERLINE + "I", + SPIECE_UNDERLINE + "was", + SPIECE_UNDERLINE + "b", + "or", + "n", + SPIECE_UNDERLINE + "in", + SPIECE_UNDERLINE + "", + "", + "2", + "0", + "0", + "0", + ",", + SPIECE_UNDERLINE + "and", + SPIECE_UNDERLINE + "this", + SPIECE_UNDERLINE + "is", + 
SPIECE_UNDERLINE + "f", + "al", + "s", + "", + ".", + ], + ) + + def test_special_tokens_initialization(self): + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): + added_tokens = [AddedToken("", lstrip=True)] + + tokenizer_r = self.rust_tokenizer_class.from_pretrained( + pretrained_name, additional_special_tokens=added_tokens, **kwargs + ) + r_output = tokenizer_r.encode("Hey this is a token") + + special_token_id = tokenizer_r.encode("", add_special_tokens=False)[0] + + self.assertTrue(special_token_id in r_output) + + def test_picklable(self): + with tempfile.NamedTemporaryFile() as f: + shutil.copyfile(SAMPLE_VOCAB, f.name) + tokenizer = PreTrainedTokenizerFast( + tokenizer_object=MoshiConverter(vocab_file=f.name).converted(), + bos_token="", + unk_token="", + eos_token="", + ) + pickled_tokenizer = pickle.dumps(tokenizer) + pickle.loads(pickled_tokenizer) + + def test_training_new_tokenizer(self): + # This feature only exists for fast tokenizers + if not self.test_rust_tokenizer: + self.skipTest(reason="test_rust_tokenizer is set to False") + + tokenizer = self.get_rust_tokenizer() + new_tokenizer = tokenizer.train_new_from_iterator(SMALL_TRAINING_CORPUS, 100) + + # Test we can use the new tokenizer with something not seen during training + inputs = new_tokenizer(["This is the first sentence", "This sentence is different 🤗."]) + self.assertEqual(len(inputs["input_ids"]), 2) + decoded_input = new_tokenizer.decode(inputs["input_ids"][0], skip_special_tokens=True) + expected_result = "This is the first sentence" + + self.assertEqual(expected_result, decoded_input) + + # We check that the parameters of the tokenizer remained the same + # Check we have the same number of added_tokens for both pair and non-pair inputs. + self.assertEqual(tokenizer.num_special_tokens_to_add(False), new_tokenizer.num_special_tokens_to_add(False)) + self.assertEqual(tokenizer.num_special_tokens_to_add(True), new_tokenizer.num_special_tokens_to_add(True)) + + # Check we have the correct max_length for both pair and non-pair inputs. 
+ self.assertEqual(tokenizer.max_len_single_sentence, new_tokenizer.max_len_single_sentence) + self.assertEqual(tokenizer.max_len_sentences_pair, new_tokenizer.max_len_sentences_pair) + + # Assert the set of special tokens match as we didn't ask to change them + self.assertSequenceEqual( + tokenizer.all_special_tokens_extended, + new_tokenizer.all_special_tokens_extended, + ) + + self.assertDictEqual(tokenizer.special_tokens_map, new_tokenizer.special_tokens_map) + + def test_training_new_tokenizer_with_special_tokens_change(self): + # This feature only exists for fast tokenizers + if not self.test_rust_tokenizer: + self.skipTest(reason="test_rust_tokenizer is set to False") + + tokenizer = self.get_rust_tokenizer() + # Test with a special tokens map + class_signature = inspect.signature(tokenizer.__class__) + if "cls_token" in class_signature.parameters: + new_tokenizer = tokenizer.train_new_from_iterator( + SMALL_TRAINING_CORPUS, 100, special_tokens_map={tokenizer.cls_token: ""} + ) + cls_id = new_tokenizer.get_vocab()[""] + self.assertEqual(new_tokenizer.cls_token, "") + self.assertEqual(new_tokenizer.cls_token_id, cls_id) + + # Create a new mapping from the special tokens defined in the original tokenizer + special_tokens_list = SpecialTokensMixin.SPECIAL_TOKENS_ATTRIBUTES.copy() + special_tokens_list.remove("additional_special_tokens") + special_tokens_map = {} + for token in special_tokens_list: + # Get the private one to avoid unnecessary warnings. + if getattr(tokenizer, f"_{token}") is not None: + special_token = getattr(tokenizer, token) + special_tokens_map[special_token] = f"{special_token}a" + + # Train new tokenizer + new_tokenizer = tokenizer.train_new_from_iterator( + SMALL_TRAINING_CORPUS, 100, special_tokens_map=special_tokens_map + ) + + # Check the changes + for token in special_tokens_list: + # Get the private one to avoid unnecessary warnings. + if getattr(tokenizer, f"_{token}") is None: + continue + special_token = getattr(tokenizer, token) + if special_token in special_tokens_map: + new_special_token = getattr(new_tokenizer, token) + self.assertEqual(special_tokens_map[special_token], new_special_token) + + new_id = new_tokenizer.get_vocab()[new_special_token] + self.assertEqual(getattr(new_tokenizer, f"{token}_id"), new_id) + + # Check if the AddedToken / string format has been kept + for special_token in tokenizer.all_special_tokens_extended: + if isinstance(special_token, AddedToken) and special_token.content not in special_tokens_map: + # The special token must appear identically in the list of the new tokenizer. + self.assertTrue( + special_token in new_tokenizer.all_special_tokens_extended, + f"'{special_token}' should be in {new_tokenizer.all_special_tokens_extended}", + ) + elif isinstance(special_token, AddedToken): + # The special token must appear in the list of the new tokenizer as an object of type AddedToken with + # the same parameters as the old AddedToken except the content that the user has requested to change. 
+ special_token_str = special_token.content + new_special_token_str = special_tokens_map[special_token_str] + + find = False + for candidate in new_tokenizer.all_special_tokens_extended: + if ( + isinstance(candidate, AddedToken) + and candidate.content == new_special_token_str + and candidate.lstrip == special_token.lstrip + and candidate.rstrip == special_token.rstrip + and candidate.normalized == special_token.normalized + and candidate.single_word == special_token.single_word + ): + find = True + break + special_token.content = new_special_token_str + self.assertTrue( + find, + f"'{special_token.__repr__()}' should appear as an `AddedToken` in the all_special_tokens_extended = " + f"{[k for k in new_tokenizer.all_special_tokens_extended if str(k)==new_special_token_str]} but it is missing" + ", this means that the new tokenizers did not keep the `rstrip`, `lstrip`, `normalized` etc attributes.", + ) + elif special_token not in special_tokens_map: + # The special token must appear identically in the list of the new tokenizer. + self.assertTrue( + special_token in new_tokenizer.all_special_tokens_extended, + f"'{special_token.__repr__()}' should be in {new_tokenizer.all_special_tokens_extended}", + ) + + else: + # The special token must appear in the list of the new tokenizer as an object of type string. + self.assertTrue(special_tokens_map[special_token] in new_tokenizer.all_special_tokens_extended) + + # Test we can use the new tokenizer with something not seen during training + inputs = new_tokenizer(["This is the first sentence", "This sentence is different 🤗."]) + self.assertEqual(len(inputs["input_ids"]), 2) + decoded_input = new_tokenizer.decode(inputs["input_ids"][0], skip_special_tokens=True) + expected_result = "This is the first sentence" + + self.assertEqual(expected_result, decoded_input) + + def test_alignement_methods(self): + # TODO: @ArthurZucker - alignment is broken + pass + + def test_added_tokens_do_lower_case(self): + # TODO: @ArthurZucker + pass + + +@require_torch +@require_sentencepiece +@require_tokenizers +class MoshiIntegrationTest(unittest.TestCase): + @classmethod + def setUpClass(cls): + checkpoint_name = "kmhf/hf-moshiko" + cls.rust_tokenizer = AutoTokenizer.from_pretrained(checkpoint_name) + return cls + + @require_torch + def integration_tests(self): + inputs = self.tokenizer( + ["The following string should be properly encoded: Hello.", "But ird and ปี ird ด"], + return_tensors="pt", + ) + + long_attention_mask = [1] * 21 + + # fmt: off + self.assertEqual( + nested_simplify(inputs), + { + "input_ids": [ + [287, 547, 2359, 457, 297, 3708, 11488, 279, 11725, 263], + [588, 478, 1442, 267, 260, 228, 188, 159, 228, 188, 185, 260, 260, 478, 1442, 260, 260, 260, 228, 188, 152], + ], + "attention_mask": [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1], long_attention_mask], + }, + ) + # fmt: on + + def test_fast_special_tokens(self): + fast_tokenizer = self.rust_tokenizer + + fast_tokenizer.add_eos_token = False + fast = fast_tokenizer.encode("A sample test", add_special_tokens=True) + assert fast == [318, 1145, 694] + + fast_tokenizer.add_eos_token = True + fast = fast_tokenizer.encode("A sample test", add_special_tokens=True) + assert fast == [318, 1145, 694] + + self.rust_tokenizer.add_eos_token = False + + def test_simple_encode_decode(self): + rust_tokenizer = self.rust_tokenizer + + self.assertEqual(rust_tokenizer.encode("This is a test"), [353, 275, 272, 694]) + self.assertEqual(rust_tokenizer.decode([353, 275, 272, 694], skip_special_tokens=True), "This is a test") + + 
# bytefallback showcase + bytefallback_tokens = [260, 235, 152, 163, 234, 184, 191, 13340, 235, 160, 163, 236, 180, 159, 234, 156, 179] # fmt: skip + self.assertEqual(rust_tokenizer.encode("生活的真谛是"), bytefallback_tokens) + self.assertEqual( + rust_tokenizer.decode(bytefallback_tokens, skip_special_tokens=True), + "生活的真谛是", + ) + + # Inner spaces showcase + self.assertEqual(rust_tokenizer.encode("Hi Hello"), [2769, 260, 11725]) + self.assertEqual(rust_tokenizer.decode([2769, 260, 11725], skip_special_tokens=True), "Hi Hello") + + self.assertEqual(rust_tokenizer.encode("Hi Hello"), [2769, 260, 260, 11725]) + self.assertEqual(rust_tokenizer.decode([2769, 260, 260, 11725], skip_special_tokens=True), "Hi Hello") + + # TODO: @ArthurZucker + # self.assertEqual(rust_tokenizer.encode(""), []) + + # self.assertEqual(rust_tokenizer.encode(" "), [260, 260]) + + # self.assertEqual(rust_tokenizer.encode(" "), [260, 260, 260]) + + # self.assertEqual(rust_tokenizer.encode(" Hello"), [260, 11725]) + + # self.assertEqual(rust_tokenizer.encode(""), [607, 266, 578]) + + def test_no_differences_decode(self): + rust_tokenizer = self.rust_tokenizer + + self.assertEqual(rust_tokenizer.decode([869]), "levels") + + self.assertEqual(rust_tokenizer.decode([30112, 869]), "unanswered levels") + + +@require_sentencepiece +@require_tokenizers +class CommonSpmIntegrationTests(unittest.TestCase): + """ + A class that regroups important test to make sure that we properly handle the special tokens. + """ + + def test_edge_case_tabulation(self): + fast_tokenizer = AutoTokenizer.from_pretrained("kmhf/hf-moshiko") + input_text = "Hey. \t\t \n\nyou é @#😈 🤗! , 1234 15 5,61" + EXPECTED_IDS = [11510, 934, 4451, 266, 578, 263, 260, 13, 13, 260, 14, 14, 5209, 260, 260, 1202, 260, 527, 1322, 244, 163, 156, 140, 260, 260, 244, 163, 168, 155, 430, 1047, 261, 260, 265, 270, 278, 281, 260, 265, 280, 260, 280, 261, 285, 265] # fmt: skip + EXPECTED_TOKENS = ['▁Hey', '<', 'eo', 's', '>', '.', '▁', '<0x09>', '<0x09>', '▁', '<0x0A>', '<0x0A>', 'you', '▁', '▁', 'é', '▁', '▁@', '#', '<0xF0>', '<0x9F>', '<0x98>', '<0x88>', '▁', '▁', '<0xF0>', '<0x9F>', '<0xA4>', '<0x97>', '!', '▁▁▁▁▁▁▁', ',', '▁', '1', '2', '3', '4', '▁', '1', '5', '▁', '5', ',', '6', '1'] # fmt: skip + + tokens = fast_tokenizer.tokenize(input_text) + with self.subTest("test fast edge case fast"): + self.assertEqual(tokens, EXPECTED_TOKENS) + + input_ids = fast_tokenizer.encode(input_text) + with self.subTest("test fast edge case fast"): + self.assertEqual(input_ids, EXPECTED_IDS) + + text = fast_tokenizer.decode(EXPECTED_IDS) + with self.subTest("test fast edge case fast"): + self.assertEqual(text, "Hey. \t\t \n\nyou é @#😈 🤗! 
, 1234 15 5,61") + + input_text = "\t\t\t\t \n\n61" + EXPECTED_IDS = [260, 13, 13, 13, 13, 260, 14, 14, 285, 265] + EXPECTED_TOKENS = ["▁", "<0x09>", "<0x09>", "<0x09>", "<0x09>", "▁", "<0x0A>", "<0x0A>", "6", "1"] + + tokens = fast_tokenizer.tokenize(input_text) + with self.subTest("test fast edge case fast"): + self.assertEqual(tokens, EXPECTED_TOKENS) + + input_ids = fast_tokenizer.encode(input_text) + with self.subTest("test fast edge case fast"): + self.assertEqual(input_ids, EXPECTED_IDS) + + text = fast_tokenizer.decode(EXPECTED_IDS) + with self.subTest("test fast edge case fast"): + self.assertEqual(text, "\t\t\t\t \n\n61") diff --git a/utils/check_repo.py b/utils/check_repo.py index 98f96bcc78a339..6872dada3d9384 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -327,6 +327,7 @@ "SiglipVisionModel", "SiglipTextModel", "ChameleonVQVAE", # no autoclass for VQ-VAE models + "MoshiForConditionalGeneration", # no auto class for speech-to-speech ] # DO NOT edit this list! From 98bad9c6d6c9ac98b42164fd882f94d4b5bfa4d7 Mon Sep 17 00:00:00 2001 From: alpertunga-bile Date: Wed, 16 Oct 2024 15:22:55 +0300 Subject: [PATCH 030/385] [fix] fix token healing tests and usage errors (#33931) * auto-gptq requirement is removed & model is changed & tokenizer pad token is assigned * values func is changed with extensions & sequence key value bug is fixed * map key value check is added in ExtensionsTree * empty trimmed_ids bug is fixed * tail_id IndexError is fixed * empty trimmed_ids bug fix is updated for failed test * too much specific case for specific tokenizer is removed * input_ids check is updated * require auto-gptq import is removed * key error check is changed with empty list check * empty input_ids check is added * empty trimmed_ids fix is checked with numel function * usage change comments are added * test changes are commented * comment style and quality bugs are fixed * test comment style and quality bug is fixed --- src/transformers/generation/utils.py | 35 +++++++++++++++++++++----- src/transformers/tokenization_utils.py | 3 +++ tests/generation/test_utils.py | 18 ++++++------- tests/utils/test_tokenization_utils.py | 5 ++-- 4 files changed, 44 insertions(+), 17 deletions(-) diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py index 6d71b754d6f4e5..86ea702dd9f2fe 100644 --- a/src/transformers/generation/utils.py +++ b/src/transformers/generation/utils.py @@ -1419,7 +1419,7 @@ def _prepare_generated_length( input_ids_length, inputs_tensor, ): - """Prepared max and min length in generaion configs to avoid clashes between similar attributes""" + """Prepared max and min length in generation configs to avoid clashes between similar attributes""" if generation_config.max_new_tokens is not None: if not has_default_max_length and generation_config.max_length is not None: @@ -1662,7 +1662,7 @@ def _prepare_cache_for_generation( device: torch.device, ) -> bool: """ - Prepares the cache for generation (if applicable), given `generate`'s paramaterization. If a cache is + Prepares the cache for generation (if applicable), given `generate`'s parameterization. If a cache is instantiated, writes it to `model_kwargs`, under the name expected by the model. """ @@ -1925,7 +1925,7 @@ def generate( deadlocking if one GPU finishes generating before other GPUs. Otherwise, defaults to `False`. assistant_model (`PreTrainedModel`, *optional*): An assistant model that can be used to accelerate generation. The assistant model must have the exact - same tokenizer. 
The acceleration is achieved when forecasting candidate tokens with the assistent model + same tokenizer. The acceleration is achieved when forecasting candidate tokens with the assistant model is much faster than running generation with the model you're calling generate from. As such, the assistant model should be much smaller. streamer (`BaseStreamer`, *optional*): @@ -2442,7 +2442,15 @@ def heal_tokens( # replace bos with pad to not condition healing on it input_ids = torch.where(input_ids == bos_token_id, pad_token_id, input_ids) + """ + the latter code assumes the input_ids is not empty, + input_id has to be checked if contains elements + """ + if input_ids.numel() == 0: + return input_ids + tail_ids = input_ids[:, -1].tolist() + space_tok = tokenizer.convert_ids_to_tokens(tokenizer.convert_tokens_to_ids(" "))[0] # tail tokens are used for a prefix search, thus, whitespaces are replaced with # their tokenization (e.g. 'Ġ') to enable search for tokens prefixed with a whitespace @@ -2454,7 +2462,14 @@ def heal_tokens( continue # skip empty sequences (all pad ids) # apply bias for alternatives (extensions) to the tail token - seq_bias = {(alt_tok,): 10.0 for alt_tok in vocab_trie.values(prefix=tail_tok)} + """ + seq_bias key has to be tuple with int so have to use + tokenizer function to convert str to int + """ + seq_bias = { + (tokenizer.convert_tokens_to_ids(alt_tok),): 10.0 for alt_tok in vocab_trie.extensions(prefix=tail_tok) + } + if len(seq_bias) == 1: continue # skip if there are no token alternatives to heal with @@ -2463,6 +2478,14 @@ def heal_tokens( generation_config.update(sequence_bias=seq_bias) trimmed_ids = batch_ids[:-1] + + """ + the latter code assumes trimmed_ids is not empty + so have to check the its element count + """ + if trimmed_ids.numel() == 0: + continue + # if the prompt is a single (non-pad) token, regenerate from bos if len(batch_ids[batch_ids != pad_token_id]) == 1: trimmed_ids[-1] = bos_token_id @@ -2915,7 +2938,7 @@ def _contrastive_search( output_attentions=output_attentions, ) - # This is essential to avoid having a last reference to the big past K-V and double the necesary memory + # This is essential to avoid having a last reference to the big past K-V and double the necessary memory # in the next loop del next_model_inputs @@ -3658,7 +3681,7 @@ def _group_beam_search( ) # initialise score of first beam of each group with 0 and the rest with -1e9. This ensures that the beams in - # the same group don't produce same tokens everytime. + # the same group don't produce same tokens every time. 
beam_scores = torch.full((batch_size, num_beams), -1e9, dtype=torch.float, device=device) beam_scores[:, ::num_sub_beams] = 0 beam_scores = beam_scores.view((batch_size * num_beams,)) diff --git a/src/transformers/tokenization_utils.py b/src/transformers/tokenization_utils.py index df13a029a6c6ff..d2433868cf1897 100644 --- a/src/transformers/tokenization_utils.py +++ b/src/transformers/tokenization_utils.py @@ -316,6 +316,9 @@ def _get_node(self, token: str) -> dict: """ node = self.data for char in token: + if char not in node: + break + node = node[char] return node diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py index 5165e43c099416..6766fa22b9b8a0 100644 --- a/tests/generation/test_utils.py +++ b/tests/generation/test_utils.py @@ -28,7 +28,6 @@ from transformers.testing_utils import ( is_flaky, require_accelerate, - require_auto_gptq, require_optimum_quanto, require_torch, require_torch_gpu, @@ -3912,11 +3911,6 @@ def test_generate_compile_fullgraph_tiny(self): class TokenHealingTestCase(unittest.TestCase): @parameterized.expand( [ - ( - "square_bracket", - 'An example ["like this"] and another example [', - 'An example ["like this"] and another example ["', - ), ("url", 'The link is Date: Wed, 16 Oct 2024 17:43:41 +0330 Subject: [PATCH 031/385] Revert `accelerate` error caused by `46d09af` (#34197) Revert `accelerate` bug --- src/transformers/training_args.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py index a2d83b2915eba4..485610dd9baa28 100644 --- a/src/transformers/training_args.py +++ b/src/transformers/training_args.py @@ -1936,7 +1936,7 @@ def __post_init__(self): warnings.warn("`--xla_fsdp_grad_ckpt` is useful only when `--xla` is set to true.") # accelerate integration for FSDP - if len(self.fsdp) > 0 and is_accelerate_available("0.28.0"): + if len(self.fsdp) > 0 and not self.fsdp_config["xla"]: os.environ["ACCELERATE_USE_FSDP"] = "true" from accelerate.utils.constants import ( FSDP_AUTO_WRAP_POLICY, From bd5dc10fd25979037e3fb1e31d5a361425b554f0 Mon Sep 17 00:00:00 2001 From: Yoni Gozlan <74535834+yonigozlan@users.noreply.github.com> Date: Wed, 16 Oct 2024 16:48:52 +0200 Subject: [PATCH 032/385] Fix wrong name for llava onevision and qwen2_vl in tokenization auto (#34177) * nit fix wrong llava onevision name in tokenization auto * add qwen2_vl and fix style --- src/transformers/models/auto/tokenization_auto.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index 3a3428e0995147..63549202969ab9 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -258,9 +258,9 @@ ), ), ("llava", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), - ("llava-onevision", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), ("llava_next", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), ("llava_next_video", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), + ("llava_onevision", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), ("longformer", ("LongformerTokenizer", "LongformerTokenizerFast" if is_tokenizers_available() else None)), ( "longt5", @@ -413,6 +413,7 @@ "Qwen2TokenizerFast" if is_tokenizers_available() else None, ), ), + 
("qwen2_vl", ("Qwen2Tokenizer", "Qwen2TokenizerFast" if is_tokenizers_available() else None)), ("rag", ("RagTokenizer", None)), ("realm", ("RealmTokenizer", "RealmTokenizerFast" if is_tokenizers_available() else None)), ( From 3a10c6192b12b328685dce89004f9f9b9c042d30 Mon Sep 17 00:00:00 2001 From: Reza Rahemtola <49811529+RezaRahemtola@users.noreply.github.com> Date: Wed, 16 Oct 2024 17:01:18 +0200 Subject: [PATCH 033/385] Avoid using torch's Tensor or PIL's Image in chat template utils if not available (#34165) * fix(utils): Avoid using torch Tensor or PIL Image if not available * Trigger CI --------- Co-authored-by: Matt --- src/transformers/utils/chat_template_utils.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/transformers/utils/chat_template_utils.py b/src/transformers/utils/chat_template_utils.py index 74912ce30146c6..c64a2c4dcb3468 100644 --- a/src/transformers/utils/chat_template_utils.py +++ b/src/transformers/utils/chat_template_utils.py @@ -76,10 +76,12 @@ def _get_json_schema_type(param_type: str) -> Dict[str, str]: float: {"type": "number"}, str: {"type": "string"}, bool: {"type": "boolean"}, - Image: {"type": "image"}, - Tensor: {"type": "audio"}, Any: {}, } + if is_vision_available(): + type_mapping[Image] = {"type": "image"} + if is_torch_available(): + type_mapping[Tensor] = {"type": "audio"} return type_mapping.get(param_type, {"type": "object"}) From 3f06f95ebe617b192251ef756518690f5bc7ff76 Mon Sep 17 00:00:00 2001 From: Marc Sun <57196510+SunMarc@users.noreply.github.com> Date: Wed, 16 Oct 2024 21:25:18 +0200 Subject: [PATCH 034/385] Revert "Fix FSDP resume Initialization issue" (#34193) Revert "Fix FSDP resume Initialization issue (#34032)" This reverts commit 4de1bdbf637fe6411c104c62ab385f660bfb1064. --- src/transformers/trainer.py | 37 ----------------------------------- tests/trainer/test_trainer.py | 31 ----------------------------- 2 files changed, 68 deletions(-) diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 5131676c953dc1..20b9f6dad231d1 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -273,39 +273,6 @@ def _get_fsdp_ckpt_kwargs(): return {} -def _init_fsdp(model, accelerator, device): - """ - Initialize Fully Sharded Data Parallel (FSDP) for the model. - - This function is needed to properly initialize FSDP when resuming from a checkpoint. - It runs a forward pass with dummy inputs to ensure FSDP is fully initialized. - See https://github.com/huggingface/transformers/issues/31892 for more details. - - Args: - model: The model to initialize with FSDP. - accelerator: The Accelerator object. - device: The device to run the model on. - - Returns: - The initialized FSDP model. - """ - model = accelerator.prepare(model) - model.train() - with torch.no_grad(): - # Run a forward pass with dummy inputs to initialize FSDP - dummy_input = { - name: torch.ones( - (1, 512), - dtype=torch.long, - device=device, - ) - for name in model.forward.__code__.co_varnames - if name != "self" - } - _ = model(**dummy_input) - return model - - if TYPE_CHECKING: import optuna @@ -634,10 +601,6 @@ def __init__( " `Trainer`. Make sure the lines `import torch_xla.core.xla_model as xm` and" " `model.to(xm.xla_device())` is performed before the optimizer creation in your script." 
) - - if self.is_fsdp_enabled: - self.model = _init_fsdp(self.model, self.accelerator, self.args.device) - if (self.is_fsdp_xla_enabled or self.is_fsdp_enabled) and ( self.optimizer is not None or self.lr_scheduler is not None ): diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index 8feb5d92e89e43..cbc93faf50e7a3 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -4914,34 +4914,3 @@ def test_get_optimizer_group(self): param = next(model.parameters()) group = trainer.get_optimizer_group(param) self.assertIn(param, group["params"]) - - -@require_torch_gpu -@require_torch -@require_accelerate -class TestFSDPInitialization(unittest.TestCase): - def test_fsdp_initialization(self): - config = RegressionModelConfig(a=1, b=1, double_output=False) - model = RegressionPreTrainedModel(config) - - with tempfile.TemporaryDirectory() as tmp_dir: - training_args = TrainingArguments( - output_dir=tmp_dir, - fsdp=True, - fsdp_config={"min_num_params": 1}, - no_cuda=True, - ) - trainer = Trainer(model=model, args=training_args) - - # Check for FSDP enabled - self.assertTrue(trainer.is_fsdp_enabled) - - # Check if model is wrapped with FSDP - from torch.distributed.fsdp import FullyShardedDataParallel as FSDP - - self.assertTrue(trainer.model, FSDP) - - # Running a forward pass to ensure FSDP is initialized - dummy_input = torch.ones((1, 1), dtype=torch.float) - output = trainer.model(dummy_input) - self.assertTrue(output) From 6d2b2033393df0594a5d43ba2aaa9ec9bc46f4e0 Mon Sep 17 00:00:00 2001 From: larin92 Date: Thu, 17 Oct 2024 15:43:29 +0300 Subject: [PATCH 035/385] Update `trainer._get_eval_sampler()` to support `group_by_length` arg (#33514) Update 'trainer._get_eval_sampler()' to support 'group_by_length' argument Trainer didn't support grouping by length for evaluation, which made evaluation slow with 'eval_batch_size'>1. Updated 'trainer._get_eval_sampler()' method was based off of 'trainer._get_train_sampler()'. --- src/transformers/trainer.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 20b9f6dad231d1..7e4d1e5d267bb8 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -959,6 +959,10 @@ def get_train_dataloader(self) -> DataLoader: return self.accelerator.prepare(DataLoader(train_dataset, **dataloader_params)) def _get_eval_sampler(self, eval_dataset: Dataset) -> Optional[torch.utils.data.Sampler]: + if self.eval_dataset is None or not has_length(self.eval_dataset): + return None + # Build the sampler. + # Deprecated code if self.args.use_legacy_prediction_loop: if is_torch_xla_available(): @@ -975,6 +979,23 @@ def _get_eval_sampler(self, eval_dataset: Dataset) -> Optional[torch.utils.data. 
else: return SequentialSampler(eval_dataset) + if self.args.group_by_length: + if is_datasets_available() and isinstance(self.eval_dataset, datasets.Dataset): + lengths = ( + self.eval_dataset[self.args.length_column_name] + if self.args.length_column_name in self.eval_dataset.column_names + else None + ) + else: + lengths = None + model_input_name = self.tokenizer.model_input_names[0] if self.tokenizer is not None else None + return LengthGroupedSampler( + self.args.eval_batch_size, + dataset=self.eval_dataset, + lengths=lengths, + model_input_name=model_input_name, + ) + if self.args.world_size <= 1: return SequentialSampler(eval_dataset) else: From aa3e35ac67369d58c33bacc161cc9c64d5062c48 Mon Sep 17 00:00:00 2001 From: Amos You <91300605+amosyou@users.noreply.github.com> Date: Thu, 17 Oct 2024 06:11:33 -0700 Subject: [PATCH 036/385] Fix warning message for fp32_cpu_offloading in bitsandbytes configs (#34079) * change cpu offload warning for fp8 quantization * change cpu offload warning for fp4 quantization * change cpu offload variable name for fp8 and fp4 quantization --- src/transformers/quantizers/quantizer_bnb_4bit.py | 6 +++--- src/transformers/quantizers/quantizer_bnb_8bit.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/transformers/quantizers/quantizer_bnb_4bit.py b/src/transformers/quantizers/quantizer_bnb_4bit.py index eed45192e7ad9c..98d57e22524902 100644 --- a/src/transformers/quantizers/quantizer_bnb_4bit.py +++ b/src/transformers/quantizers/quantizer_bnb_4bit.py @@ -102,7 +102,7 @@ def validate_environment(self, *args, **kwargs): raise ValueError( "Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the " "quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules " - "in 32-bit, you need to set `load_in_8bit_fp32_cpu_offload=True` and pass a custom `device_map` to " + "in 32-bit, you need to set `llm_int8_enable_fp32_cpu_offload=True` and pass a custom `device_map` to " "`from_pretrained`. Check " "https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu " "for more details. " @@ -285,7 +285,7 @@ def _process_model_before_weight_loading( ): from ..integrations import get_keys_to_not_convert, replace_with_bnb_linear - load_in_8bit_fp32_cpu_offload = self.quantization_config.llm_int8_enable_fp32_cpu_offload + llm_int8_enable_fp32_cpu_offload = self.quantization_config.llm_int8_enable_fp32_cpu_offload # We keep some modules such as the lm_head in their original dtype for numerical stability reasons if self.quantization_config.llm_int8_skip_modules is None: @@ -302,7 +302,7 @@ def _process_model_before_weight_loading( if isinstance(device_map, dict) and len(device_map.keys()) > 1: keys_on_cpu = [key for key, value in device_map.items() if value in ["disk", "cpu"]] - if len(keys_on_cpu) > 0 and not load_in_8bit_fp32_cpu_offload: + if len(keys_on_cpu) > 0 and not llm_int8_enable_fp32_cpu_offload: raise ValueError( "If you want to offload some keys to `cpu` or `disk`, you need to set " "`llm_int8_enable_fp32_cpu_offload=True`. 
Note that these modules will not be " diff --git a/src/transformers/quantizers/quantizer_bnb_8bit.py b/src/transformers/quantizers/quantizer_bnb_8bit.py index 020ff7cc6214a7..093d612b914cef 100644 --- a/src/transformers/quantizers/quantizer_bnb_8bit.py +++ b/src/transformers/quantizers/quantizer_bnb_8bit.py @@ -101,7 +101,7 @@ def validate_environment(self, *args, **kwargs): raise ValueError( "Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the " "quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules " - "in 32-bit, you need to set `load_in_8bit_fp32_cpu_offload=True` and pass a custom `device_map` to " + "in 32-bit, you need to set `llm_int8_enable_fp32_cpu_offload=True` and pass a custom `device_map` to " "`from_pretrained`. Check " "https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu " "for more details. " @@ -250,7 +250,7 @@ def _process_model_before_weight_loading( ): from ..integrations import get_keys_to_not_convert, replace_with_bnb_linear - load_in_8bit_fp32_cpu_offload = self.quantization_config.llm_int8_enable_fp32_cpu_offload + llm_int8_enable_fp32_cpu_offload = self.quantization_config.llm_int8_enable_fp32_cpu_offload # We keep some modules such as the lm_head in their original dtype for numerical stability reasons if self.quantization_config.llm_int8_skip_modules is None: @@ -267,7 +267,7 @@ def _process_model_before_weight_loading( if isinstance(device_map, dict) and len(device_map.keys()) > 1: keys_on_cpu = [key for key, value in device_map.items() if value in ["disk", "cpu"]] - if len(keys_on_cpu) > 0 and not load_in_8bit_fp32_cpu_offload: + if len(keys_on_cpu) > 0 and not llm_int8_enable_fp32_cpu_offload: raise ValueError( "If you want to offload some keys to `cpu` or `disk`, you need to set " "`llm_int8_enable_fp32_cpu_offload=True`. Note that these modules will not be " From fce1fcfe717b0e8bee12e8a51944227b57f2f63a Mon Sep 17 00:00:00 2001 From: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Date: Thu, 17 Oct 2024 16:11:52 +0200 Subject: [PATCH 037/385] Ping team members for new failed tests in daily CI (#34171) * ping * fix * fix * fix * remove runner * update members --------- Co-authored-by: ydshieh --- .../workflows/check_failed_model_tests.yml | 129 ++++++++++++ .github/workflows/self-scheduled.yml | 10 + utils/check_bad_commit.py | 188 ++++++++++++++++++ utils/get_previous_daily_ci.py | 12 ++ utils/notification_service.py | 27 ++- utils/process_bad_commit_report.py | 77 +++++++ 6 files changed, 442 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/check_failed_model_tests.yml create mode 100644 utils/check_bad_commit.py create mode 100644 utils/process_bad_commit_report.py diff --git a/.github/workflows/check_failed_model_tests.yml b/.github/workflows/check_failed_model_tests.yml new file mode 100644 index 00000000000000..f229765994d585 --- /dev/null +++ b/.github/workflows/check_failed_model_tests.yml @@ -0,0 +1,129 @@ +name: Process failed tests + +on: + workflow_call: + inputs: + docker: + required: true + type: string + start_sha: + required: true + type: string + + +env: + HF_HOME: /mnt/cache + TRANSFORMERS_IS_CI: yes + OMP_NUM_THREADS: 8 + MKL_NUM_THREADS: 8 + RUN_SLOW: yes + # For gated repositories, we still need to agree to share information on the Hub repo. page in order to get access. + # This token is created under the bot `hf-transformers-bot`. 
+ HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }} + SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }} + TF_FORCE_GPU_ALLOW_GROWTH: true + RUN_PT_TF_CROSS_TESTS: 1 + CUDA_VISIBLE_DEVICES: 0,1 + + +jobs: + run_models_gpu: + name: " " + runs-on: + group: aws-g4dn-2xlarge-cache + container: + image: ${{ inputs.docker }} + options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + steps: + - uses: actions/download-artifact@v4 + with: + name: ci_results_run_models_gpu + path: /transformers/ci_results_run_models_gpu + + - name: Update clone + working-directory: /transformers + run: git fetch && git checkout ${{ github.sha }} + + - name: Get target commit + working-directory: /transformers/utils + run: | + echo "END_SHA=$(TOKEN=${{ secrets.ACCESS_REPO_INFO_TOKEN }} python3 -c 'import os; from get_previous_daily_ci import get_last_daily_ci_run_commit; commit=get_last_daily_ci_run_commit(token=os.environ["TOKEN"]); print(commit)')" >> $GITHUB_ENV + + - name: Checkout to `start_sha` + working-directory: /transformers + run: git fetch && git checkout ${{ inputs.start_sha }} + + - name: Reinstall transformers in edit mode (remove the one installed during docker image build) + working-directory: /transformers + run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . + + - name: NVIDIA-SMI + run: | + nvidia-smi + + - name: Environment + working-directory: /transformers + run: | + python3 utils/print_env.py + + - name: Show installed libraries and their versions + working-directory: /transformers + run: pip freeze + + - name: Check failed tests + working-directory: /transformers + run: python3 utils/check_bad_commit.py --start_commit ${{ inputs.start_sha }} --end_commit ${{ env.END_SHA }} --file ci_results_run_models_gpu/new_model_failures.json --output_file new_model_failures_with_bad_commit.json + + - name: Show results + working-directory: /transformers + run: | + ls -l new_model_failures_with_bad_commit.json + cat new_model_failures_with_bad_commit.json + + - name: Checkout back + working-directory: /transformers + run: | + git checkout ${{ inputs.start_sha }} + + - name: Process report + shell: bash + working-directory: /transformers + env: + TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN: ${{ secrets.TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN }} + run: | + python3 utils/process_bad_commit_report.py + + - name: Process report + shell: bash + working-directory: /transformers + env: + TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN: ${{ secrets.TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN }} + run: | + { + echo 'REPORT_TEXT<> "$GITHUB_ENV" + + - name: Send processed report + if: ${{ env.REPORT_TEXT != '' }} + uses: slackapi/slack-github-action@6c661ce58804a1a20f6dc5fbee7f0381b469e001 + with: + # Slack channel id, channel name, or user id to post message. 
+ # See also: https://api.slack.com/methods/chat.postMessage#channels + channel-id: '#transformers-ci-feedback-tests' + # For posting a rich message using Block Kit + payload: | + { + "blocks": [ + { + "type": "section", + "text": { + "type": "mrkdwn", + "text": "${{ env.REPORT_TEXT }}" + } + } + ] + } + env: + SLACK_BOT_TOKEN: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }} diff --git a/.github/workflows/self-scheduled.yml b/.github/workflows/self-scheduled.yml index 1a6f4a485430d4..353fb59843e4a5 100644 --- a/.github/workflows/self-scheduled.yml +++ b/.github/workflows/self-scheduled.yml @@ -562,3 +562,13 @@ jobs: ci_event: ${{ inputs.ci_event }} secrets: inherit + + check_new_model_failures: + if: ${{ always() && inputs.ci_event == 'Daily CI' && inputs.job == 'run_models_gpu' && needs.send_results.result == 'success' }} + name: Check new model failures + needs: send_results + uses: ./.github/workflows/check_failed_model_tests.yml + with: + docker: ${{ inputs.docker }} + start_sha: ${{ github.sha }} + secrets: inherit \ No newline at end of file diff --git a/utils/check_bad_commit.py b/utils/check_bad_commit.py new file mode 100644 index 00000000000000..091ed5c4a427f9 --- /dev/null +++ b/utils/check_bad_commit.py @@ -0,0 +1,188 @@ +#!/usr/bin/env python +# coding=utf-8 + +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse +import json +import os +import re +import subprocess + +import requests + + +def create_script(target_test): + """Create a python script to be run by `git bisect run` to determine if `target_test` passes or fails. + If a test is not found in a commit, the script with exit code `0` (i.e. `Success`). + + Args: + target_test (`str`): The test to check. + + Returns: + `str`: The script to be run by `git bisect run`. + """ + + script = f""" +import os +import subprocess + +result = subprocess.run( + ["python3", "-m", "pytest", "-v", f"{target_test}"], + capture_output = True, + text=True, +) +print(result.stdout) + +if len(result.stderr) > 0: + if "ERROR: not found: " in result.stderr: + print("test not found in this commit") + exit(0) + else: + print(f"pytest failed to run: {{result.stderr}}") + exit(-1) +elif f"{target_test} FAILED" in result.stdout: + print("test failed") + exit(2) + +exit(0) +""" + + with open("target_script.py", "w") as fp: + fp.write(script.strip()) + + +def find_bad_commit(target_test, start_commit, end_commit): + """Find (backward) the earliest commit between `start_commit` and `end_commit` at which `target_test` fails. + + Args: + target_test (`str`): The test to check. + start_commit (`str`): The latest commit. + end_commit (`str`): The earliest commit. + + Returns: + `str`: The earliest commit at which `target_test` fails. 
+ """ + + create_script(target_test=target_test) + + bash = f""" +git bisect reset +git bisect start {start_commit} {end_commit} +git bisect run python3 target_script.py +""" + + with open("run_git_bisect.sh", "w") as fp: + fp.write(bash.strip()) + + result = subprocess.run( + ["bash", "run_git_bisect.sh"], + capture_output=True, + text=True, + ) + print(result.stdout) + + if "error: bisect run failed" in result.stderr: + index = result.stderr.find("error: bisect run failed") + bash_error = result.stderr[index:] + + error_msg = f"Error when running git bisect:\nbash error: {bash_error}" + + pattern = "pytest failed to run: .+" + pytest_errors = re.findall(pattern, result.stdout) + if len(pytest_errors) > 0: + pytest_error = pytest_errors[0] + index = pytest_error.find("pytest failed to run: ") + index += len("pytest failed to run: ") + pytest_error = pytest_error[index:] + error_msg += f"pytest error: {pytest_error}" + + raise ValueError(error_msg) + + pattern = r"(.+) is the first bad commit" + commits = re.findall(pattern, result.stdout) + + bad_commit = None + if len(commits) > 0: + bad_commit = commits[0] + + print(f"Between `start_commit` {start_commit} and `end_commit` {end_commit}") + print(f"bad_commit: {bad_commit}\n") + + return bad_commit + + +def get_commit_info(commit): + """Get information for a commit via `api.github.com`.""" + pr_number = None + author = None + merged_author = None + + url = f"https://api.github.com/repos/huggingface/transformers/commits/{commit}/pulls" + pr_info_for_commit = requests.get(url).json() + + if len(pr_info_for_commit) > 0: + pr_number = pr_info_for_commit[0]["number"] + + url = f"https://api.github.com/repos/huggingface/transformers/pulls/{pr_number}" + pr_for_commit = requests.get(url).json() + author = pr_for_commit["user"]["login"] + merged_author = pr_for_commit["merged_by"]["login"] + + if author is None: + url = f"https://api.github.com/repos/huggingface/transformers/commits/{commit}" + commit_info = requests.get(url).json() + author = commit_info["author"]["login"] + + return {"commit": commit, "pr_number": pr_number, "author": author, "merged_by": merged_author} + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--start_commit", type=str, required=True, help="The latest commit hash to check.") + parser.add_argument("--end_commit", type=str, required=True, help="The earliest commit hash to check.") + parser.add_argument("--test", type=str, help="The test to check.") + parser.add_argument("--file", type=str, help="The report file.") + parser.add_argument("--output_file", type=str, required=True, help="The path of the output file.") + args = parser.parse_args() + + print(f"start_commit: {args.start_commit}") + print(f"end_commit: {args.end_commit}") + + if len({args.test is None, args.file is None}) != 2: + raise ValueError("Exactly one argument `test` or `file` must be specified.") + + if args.test is not None: + commit = find_bad_commit(target_test=args.test, start_commit=args.start_commit, end_commit=args.end_commit) + with open(args.output_file, "w", encoding="UTF-8") as fp: + fp.write(f"{args.test}\n{commit}") + elif os.path.isfile(args.file): + with open(args.file, "r", encoding="UTF-8") as fp: + reports = json.load(fp) + + for model in reports: + # TODO: make this script able to deal with both `single-gpu` and `multi-gpu` via a new argument. 
+ reports[model].pop("multi-gpu", None) + failed_tests = reports[model]["single-gpu"] + + failed_tests_with_bad_commits = [] + for test in failed_tests: + commit = find_bad_commit(target_test=test, start_commit=args.start_commit, end_commit=args.end_commit) + info = {"test": test, "commit": commit} + info.update(get_commit_info(commit)) + failed_tests_with_bad_commits.append(info) + reports[model]["single-gpu"] = failed_tests_with_bad_commits + + with open(args.output_file, "w", encoding="UTF-8") as fp: + json.dump(reports, fp, ensure_ascii=False, indent=4) diff --git a/utils/get_previous_daily_ci.py b/utils/get_previous_daily_ci.py index 975c6f33982013..efd7d24a752991 100644 --- a/utils/get_previous_daily_ci.py +++ b/utils/get_previous_daily_ci.py @@ -41,6 +41,18 @@ def get_last_daily_ci_runs(token): return workflow_run_id +def get_last_daily_ci_run_commit(token): + """Get the commit sha of the last completed scheduled daily CI workflow run.""" + workflow_runs = get_daily_ci_runs(token) + head_sha = None + for workflow_run in workflow_runs: + if workflow_run["status"] == "completed": + head_sha = workflow_run["head_sha"] + break + + return head_sha + + def get_last_daily_ci_artifacts(artifact_names, output_dir, token): """Get the artifacts of last completed workflow run id of the scheduled (daily) CI.""" workflow_run_id = get_last_daily_ci_runs(token) diff --git a/utils/notification_service.py b/utils/notification_service.py index 26eb2973213e78..629b793337889a 100644 --- a/utils/notification_service.py +++ b/utils/notification_service.py @@ -539,11 +539,36 @@ def payload(self) -> str: ) url = f"https://huggingface.co/datasets/hf-internal-testing/transformers_daily_ci/raw/{commit_info.oid}/{datetime.datetime.today().strftime('%Y-%m-%d')}/ci_results_{job_name}/new_model_failures.txt" + # extra processing to save to json format + new_failed_tests = {} + for line in failure_text.split(): + if "https://github.com/huggingface/transformers/actions/runs" in line: + pattern = r"<(https://github.com/huggingface/transformers/actions/runs/.+?/job/.+?)\|(.+?)>" + items = re.findall(pattern, line) + elif "tests/models/" in line: + model = line.split("/")[2] + new_failed_tests[model] = {"single-gpu": [], "multi-gpu": []} + for url, device in items: + new_failed_tests[model][f"{device}-gpu"].append(line) + file_path = os.path.join(os.getcwd(), f"ci_results_{job_name}/new_model_failures.json") + with open(file_path, "w", encoding="UTF-8") as fp: + json.dump(new_failed_tests, fp, ensure_ascii=False, indent=4) + + # upload results to Hub dataset + file_path = os.path.join(os.getcwd(), f"ci_results_{job_name}/new_model_failures.json") + _ = api.upload_file( + path_or_fileobj=file_path, + path_in_repo=f"{datetime.datetime.today().strftime('%Y-%m-%d')}/ci_results_{job_name}/new_model_failures.json", + repo_id="hf-internal-testing/transformers_daily_ci", + repo_type="dataset", + token=os.environ.get("TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN", None), + ) + block = { "type": "section", "text": { "type": "plain_text", - "text": "bonjour", + "text": " ", }, "accessory": { "type": "button", diff --git a/utils/process_bad_commit_report.py b/utils/process_bad_commit_report.py new file mode 100644 index 00000000000000..f61f1b106644aa --- /dev/null +++ b/utils/process_bad_commit_report.py @@ -0,0 +1,77 @@ +"""An internal script to process `new_model_failures_with_bad_commit.json` produced by `utils/check_bad_commit.py`. 
+ +This is used by `.github/workflows/check_failed_model_tests.yml` to produce a slack report of the following form + +``` +<{url}|New failed tests> +{ + "GH_ydshieh": { + "vit": 1 + } +} +``` +""" + +import datetime +import json +import os +from collections import Counter +from copy import deepcopy + +from huggingface_hub import HfApi + + +if __name__ == "__main__": + api = HfApi() + + with open("new_model_failures_with_bad_commit.json") as fp: + data = json.load(fp) + + # TODO: extend + team_members = ["ydshieh", "zucchini-nlp", "ArthurZucker", "gante", "LysandreJik", "molbap", "qubvel"] + + # Counting the number of failures grouped by authors + new_data = {} + for model, model_result in data.items(): + for device, failed_tests in model_result.items(): + for failed_test in failed_tests: + author = failed_test["author"] + + if author not in team_members: + author = failed_test["merged_by"] + + if author not in new_data: + new_data[author] = Counter() + new_data[author].update([model]) + for author in new_data: + new_data[author] = dict(new_data[author]) + + # Group by author + new_data_full = {author: deepcopy(data) for author in new_data} + for author, _data in new_data_full.items(): + for model, model_result in _data.items(): + for device, failed_tests in model_result.items(): + failed_tests = [x for x in failed_tests if x["author"] == author or x["merged_by"] == author] + model_result[device] = failed_tests + + # Upload to Hub and get the url + with open("new_model_failures_with_bad_commit_grouped_by_authors.json", "w") as fp: + json.dump(new_data_full, fp, ensure_ascii=False, indent=4) + commit_info = api.upload_file( + path_or_fileobj="new_model_failures_with_bad_commit_grouped_by_authors.json", + path_in_repo=f"{datetime.datetime.today().strftime('%Y-%m-%d')}/ci_results_run_models_gpu/new_model_failures_with_bad_commit_grouped_by_authors.json", + repo_id="hf-internal-testing/transformers_daily_ci", + repo_type="dataset", + token=os.environ.get("TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN", None), + ) + url = f"https://huggingface.co/datasets/hf-internal-testing/transformers_daily_ci/raw/{commit_info.oid}/{datetime.datetime.today().strftime('%Y-%m-%d')}/ci_results_run_models_gpu/new_model_failures_with_bad_commit_grouped_by_authors.json" + + # Add `GH_` prefix as keyword mention + output = {} + for author, item in new_data.items(): + author = f"GH_{author}" + output[author] = item + + report = f"<{url}|New failed tests>\\n\\n" + report += json.dumps(output, indent=4).replace('"', '\\"').replace("\n", "\\n") + print(report) From b57c7bce21799bcc964c0ab56002f27485be7a13 Mon Sep 17 00:00:00 2001 From: Christopher McGirr <7071833+chrsmcgrr@users.noreply.github.com> Date: Thu, 17 Oct 2024 16:41:55 +0200 Subject: [PATCH 038/385] fix(Wav2Vec2ForCTC): torch export (#34023) * fix(Wav2Vec2ForCTC): torch export Resolves the issue described in #34022 by implementing the masking of the hidden states using an elementwise multiplication rather than indexing with assignment. The torch.export functionality seems to mark the tensor as frozen even though the update is legal. This change is a workaround for now to allow the export of the model as a FxGraph. Further investigation is required to find the real solution in pytorch. 
* [run-slow] hubert, unispeech, unispeech_sat, wav2vec2 --- src/transformers/models/hubert/modeling_hubert.py | 2 +- src/transformers/models/unispeech/modeling_unispeech.py | 2 +- src/transformers/models/unispeech_sat/modeling_unispeech_sat.py | 2 +- src/transformers/models/wav2vec2/modeling_wav2vec2.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/hubert/modeling_hubert.py b/src/transformers/models/hubert/modeling_hubert.py index ad21d768e35c80..57f59cf9aab94f 100755 --- a/src/transformers/models/hubert/modeling_hubert.py +++ b/src/transformers/models/hubert/modeling_hubert.py @@ -1040,7 +1040,7 @@ def forward( if attention_mask is not None: # make sure padded tokens are not attended to expand_attention_mask = attention_mask.unsqueeze(-1).repeat(1, 1, hidden_states.shape[2]) - hidden_states[~expand_attention_mask] = 0 + hidden_states = hidden_states * expand_attention_mask.to(dtype=hidden_states.dtype) if self._use_flash_attention_2: # 2d mask is passed through the layers attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None diff --git a/src/transformers/models/unispeech/modeling_unispeech.py b/src/transformers/models/unispeech/modeling_unispeech.py index eab23032475c40..52ba08f5d4eda5 100755 --- a/src/transformers/models/unispeech/modeling_unispeech.py +++ b/src/transformers/models/unispeech/modeling_unispeech.py @@ -1076,7 +1076,7 @@ def forward( if attention_mask is not None: # make sure padded tokens are not attended to expand_attention_mask = attention_mask.unsqueeze(-1).repeat(1, 1, hidden_states.shape[2]) - hidden_states[~expand_attention_mask] = 0 + hidden_states = hidden_states * expand_attention_mask.to(dtype=hidden_states.dtype) if self._use_flash_attention_2: # 2d mask is passed through the layers attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None diff --git a/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py b/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py index 31d5071dbe24bf..52d82ea739426b 100755 --- a/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py +++ b/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py @@ -1093,7 +1093,7 @@ def forward( if attention_mask is not None: # make sure padded tokens are not attended to expand_attention_mask = attention_mask.unsqueeze(-1).repeat(1, 1, hidden_states.shape[2]) - hidden_states[~expand_attention_mask] = 0 + hidden_states = hidden_states * expand_attention_mask.to(dtype=hidden_states.dtype) if self._use_flash_attention_2: # 2d mask is passed through the layers attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None diff --git a/src/transformers/models/wav2vec2/modeling_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_wav2vec2.py index 2648722111d52e..bf1bb7746ce802 100755 --- a/src/transformers/models/wav2vec2/modeling_wav2vec2.py +++ b/src/transformers/models/wav2vec2/modeling_wav2vec2.py @@ -1109,7 +1109,7 @@ def forward( if attention_mask is not None: # make sure padded tokens are not attended to expand_attention_mask = attention_mask.unsqueeze(-1).repeat(1, 1, hidden_states.shape[2]) - hidden_states[~expand_attention_mask] = 0 + hidden_states = hidden_states * expand_attention_mask.to(dtype=hidden_states.dtype) if self._use_flash_attention_2: # 2d mask is passed through the layers attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None From 
f2846ad2b73e1c6b17671b9111276a2ae88d1791 Mon Sep 17 00:00:00 2001 From: Sebastian Schoennenbeck Date: Thu, 17 Oct 2024 16:45:07 +0200 Subject: [PATCH 039/385] Fix for tokenizer.apply_chat_template with continue_final_message=True (#34214) * Strip final message * Do full strip instead of rstrip * Retrigger CI --------- Co-authored-by: Matt --- src/transformers/tokenization_utils_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index 438ef1c8a4a5e2..b52a93ae94841b 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -1874,7 +1874,7 @@ def apply_chat_template( **template_kwargs, ) if continue_final_message: - final_message = chat[-1]["content"] + final_message = chat[-1]["content"].strip() rendered_chat = rendered_chat[: rendered_chat.rindex(final_message) + len(final_message)].rstrip() rendered.append(rendered_chat) From 7f5088503fb440cb3bb2d610f892e2ee547982b3 Mon Sep 17 00:00:00 2001 From: Name Date: Thu, 17 Oct 2024 17:27:34 +0200 Subject: [PATCH 040/385] removes decord (#33987) * removes decord dependency optimize np Revert "optimize" This reverts commit faa136b51ec4ec5858e5b0ae40eb7ef89a88b475. helpers as documentation pydoc missing keys * make fixup * require_av --------- Co-authored-by: ad --- docker/transformers-all-latest-gpu/Dockerfile | 2 +- setup.py | 3 +- src/transformers/__init__.py | 2 - src/transformers/dependency_versions_table.py | 1 - .../models/git/convert_git_to_pytorch.py | 43 ++++++++++++++----- src/transformers/testing_utils.py | 8 ---- src/transformers/utils/__init__.py | 1 - src/transformers/utils/import_utils.py | 10 ----- tests/test_pipeline_mixin.py | 6 +-- 9 files changed, 37 insertions(+), 39 deletions(-) diff --git a/docker/transformers-all-latest-gpu/Dockerfile b/docker/transformers-all-latest-gpu/Dockerfile index 9c5e3c91415745..08e37ea6e1292f 100644 --- a/docker/transformers-all-latest-gpu/Dockerfile +++ b/docker/transformers-all-latest-gpu/Dockerfile @@ -43,7 +43,7 @@ RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/pef RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/optimum@main#egg=optimum # For video model testing -RUN python3 -m pip install --no-cache-dir decord av==9.2.0 +RUN python3 -m pip install --no-cache-dir av==9.2.0 # Some slow tests require bnb RUN python3 -m pip install --no-cache-dir bitsandbytes diff --git a/setup.py b/setup.py index b1ffd0af638448..1846f7bf97b5d4 100644 --- a/setup.py +++ b/setup.py @@ -104,7 +104,6 @@ "cookiecutter==1.7.3", "dataclasses", "datasets!=2.5.0", - "decord==0.6.0", "deepspeed>=0.9.3", "diffusers", "dill<0.3.5", @@ -313,7 +312,7 @@ def run(self): extras["torch-vision"] = deps_list("torchvision") + extras["vision"] extras["natten"] = deps_list("natten") extras["codecarbon"] = deps_list("codecarbon") -extras["video"] = deps_list("decord", "av") +extras["video"] = deps_list("av") extras["sentencepiece"] = deps_list("sentencepiece", "protobuf") extras["tiktoken"] = deps_list("tiktoken", "blobfile") diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 236333fb1cbd37..50400ed6c4e944 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -939,7 +939,6 @@ "is_av_available", "is_bitsandbytes_available", "is_datasets_available", - "is_decord_available", "is_faiss_available", "is_flax_available", "is_keras_nlp_available", @@ -5855,7 +5854,6 @@ 
is_av_available, is_bitsandbytes_available, is_datasets_available, - is_decord_available, is_faiss_available, is_flax_available, is_keras_nlp_available, diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py index 6564e079033634..5ce23f4b7647d5 100644 --- a/src/transformers/dependency_versions_table.py +++ b/src/transformers/dependency_versions_table.py @@ -11,7 +11,6 @@ "cookiecutter": "cookiecutter==1.7.3", "dataclasses": "dataclasses", "datasets": "datasets!=2.5.0", - "decord": "decord==0.6.0", "deepspeed": "deepspeed>=0.9.3", "diffusers": "diffusers", "dill": "dill<0.3.5", diff --git a/src/transformers/models/git/convert_git_to_pytorch.py b/src/transformers/models/git/convert_git_to_pytorch.py index 238b8124a0cff6..2f93a6b03a65d9 100644 --- a/src/transformers/models/git/convert_git_to_pytorch.py +++ b/src/transformers/models/git/convert_git_to_pytorch.py @@ -19,6 +19,7 @@ import argparse from pathlib import Path +import av import numpy as np import requests import torch @@ -193,10 +194,27 @@ def prepare_img(model_name): def prepare_video(): - from decord import VideoReader, cpu + def read_video_pyav(container, indices): + """ + Decode the video with PyAV decoder. - # set seed for reproducability - np.random.seed(0) + Args: + container (`av.container.input.InputContainer`): PyAV container. + indices (`List[int]`): List of frame indices to decode. + + Returns: + result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3). + """ + frames = [] + container.seek(0) + start_index = indices[0] + end_index = indices[-1] + for i, frame in enumerate(container.decode(video=0)): + if i > end_index: + break + if i >= start_index and i in indices: + frames.append(frame) + return np.stack([x.to_ndarray(format="rgb24") for x in frames]) def sample_frame_indices(clip_len, frame_sample_rate, seg_len): """ @@ -217,16 +235,19 @@ def sample_frame_indices(clip_len, frame_sample_rate, seg_len): indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64) return indices - # video clip consists of 300 frames (10 seconds at 30 FPS) - file_path = hf_hub_download(repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset") - videoreader = VideoReader(file_path, num_threads=1, ctx=cpu(0)) + # set seed for reproducibility + np.random.seed(0) - # sample 6 frames - videoreader.seek(0) - indices = sample_frame_indices(clip_len=6, frame_sample_rate=4, seg_len=len(videoreader)) - video = videoreader.get_batch(indices).asnumpy() + file_path = hf_hub_download(repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset") + with av.open(file_path) as container: + # sample 6 frames + num_frames = 6 + indices = sample_frame_indices( + clip_len=num_frames, frame_sample_rate=4, seg_len=container.streams.video[0].frames + ) + frames = read_video_pyav(container, indices) - return video + return frames @torch.no_grad() diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py index 8eda45bd40efb4..2fc22551d37f1b 100644 --- a/src/transformers/testing_utils.py +++ b/src/transformers/testing_utils.py @@ -67,7 +67,6 @@ is_compressed_tensors_available, is_cv2_available, is_cython_available, - is_decord_available, is_detectron2_available, is_eetq_available, is_essentia_available, @@ -758,13 +757,6 @@ def require_spacy(test_case): return unittest.skipUnless(is_spacy_available(), "test requires spacy")(test_case) -def require_decord(test_case): - """ - Decorator marking a test that 
requires decord. These tests are skipped when decord isn't installed. - """ - return unittest.skipUnless(is_decord_available(), "test requires decord")(test_case) - - def require_torch_multi_gpu(test_case): """ Decorator marking a test that requires a multi-GPU setup (in PyTorch). These tests are skipped on a machine without diff --git a/src/transformers/utils/__init__.py b/src/transformers/utils/__init__.py index 3b33127be4ba53..2876eef9ea02df 100755 --- a/src/transformers/utils/__init__.py +++ b/src/transformers/utils/__init__.py @@ -128,7 +128,6 @@ is_cv2_available, is_cython_available, is_datasets_available, - is_decord_available, is_detectron2_available, is_eetq_available, is_essentia_available, diff --git a/src/transformers/utils/import_utils.py b/src/transformers/utils/import_utils.py index fbc248824a4b45..2f0cfe1d6dcec8 100755 --- a/src/transformers/utils/import_utils.py +++ b/src/transformers/utils/import_utils.py @@ -112,7 +112,6 @@ def _is_package_available(pkg_name: str, return_version: bool = False) -> Union[ # `importlib.metadata.util` doesn't work with `opencv-python-headless`. _cv2_available = importlib.util.find_spec("cv2") is not None _datasets_available = _is_package_available("datasets") -_decord_available = importlib.util.find_spec("decord") is not None _detectron2_available = _is_package_available("detectron2") # We need to check both `faiss` and `faiss-cpu`. _faiss_available = importlib.util.find_spec("faiss") is not None @@ -1173,10 +1172,6 @@ def is_ccl_available(): return _is_ccl_available -def is_decord_available(): - return _decord_available - - def is_sudachi_available(): return _sudachipy_available @@ -1547,10 +1542,6 @@ def is_liger_kernel_available(): Please note that you may need to restart your runtime after installation. """ -DECORD_IMPORT_ERROR = """ -{0} requires the decord library but it was not found in your environment. You can install it with pip: `pip install -decord`. Please note that you may need to restart your runtime after installation. -""" CYTHON_IMPORT_ERROR = """ {0} requires the Cython library but it was not found in your environment. 
You can install it with pip: `pip install @@ -1612,7 +1603,6 @@ def is_liger_kernel_available(): ("scipy", (is_scipy_available, SCIPY_IMPORT_ERROR)), ("accelerate", (is_accelerate_available, ACCELERATE_IMPORT_ERROR)), ("oneccl_bind_pt", (is_ccl_available, CCL_IMPORT_ERROR)), - ("decord", (is_decord_available, DECORD_IMPORT_ERROR)), ("cython", (is_cython_available, CYTHON_IMPORT_ERROR)), ("jieba", (is_jieba_available, JIEBA_IMPORT_ERROR)), ("peft", (is_peft_available, PEFT_IMPORT_ERROR)), diff --git a/tests/test_pipeline_mixin.py b/tests/test_pipeline_mixin.py index cae285f5f15727..74bc1b8669a702 100644 --- a/tests/test_pipeline_mixin.py +++ b/tests/test_pipeline_mixin.py @@ -51,7 +51,7 @@ ) from transformers.testing_utils import ( is_pipeline_test, - require_decord, + require_av, require_pytesseract, require_timm, require_torch, @@ -722,14 +722,14 @@ def test_pipeline_translation_fp16(self): @is_pipeline_test @require_torch_or_tf @require_vision - @require_decord + @require_av def test_pipeline_video_classification(self): self.run_task_tests(task="video-classification") @is_pipeline_test @require_vision - @require_decord @require_torch + @require_av def test_pipeline_video_classification_fp16(self): self.run_task_tests(task="video-classification", torch_dtype="float16") From 9470c00042d0ff37e52a6e442970547b42d29b6c Mon Sep 17 00:00:00 2001 From: Guang Yang <42389959+guangy10@users.noreply.github.com> Date: Thu, 17 Oct 2024 08:33:19 -0700 Subject: [PATCH 041/385] Llama3 and Llama2 are ExecuTorch compatible (#34101) Llama3_1b and Llama2_7b are ExecuTorch compatible Co-authored-by: Guang Yang --- tests/models/llama/test_modeling_llama.py | 69 +++++++++++++++++++++++ 1 file changed, 69 insertions(+) diff --git a/tests/models/llama/test_modeling_llama.py b/tests/models/llama/test_modeling_llama.py index d43a0fb13f367b..fe521ea410913c 100644 --- a/tests/models/llama/test_modeling_llama.py +++ b/tests/models/llama/test_modeling_llama.py @@ -23,6 +23,7 @@ from parameterized import parameterized from transformers import AutoTokenizer, LlamaConfig, StaticCache, is_torch_available, set_seed +from transformers.generation.configuration_utils import GenerationConfig from transformers.testing_utils import ( backend_empty_cache, require_bitsandbytes, @@ -916,6 +917,74 @@ def test_compile_static_cache(self): static_compiled_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) self.assertEqual(EXPECTED_TEXT_COMPLETION, static_compiled_text) + @slow + @require_read_token + def test_export_static_cache(self): + if version.parse(torch.__version__) < version.parse("2.4.0"): + self.skipTest(reason="This test requires torch >= 2.4 to run.") + + from transformers.integrations.executorch import ( + TorchExportableModuleWithStaticCache, + convert_and_export_with_cache, + ) + + llama_models = { + "meta-llama/Llama-3.2-1B": [ + "Simply put, the theory of relativity states that 1) the speed of light is the same for all " + "observers, regardless of their location, and 2) the laws of physics are the same for all observers" + ], + "meta-llama/Llama-3.2-3B": [ + "Simply put, the theory of relativity states that 1. the speed of light is constant, and 2. 
" + "the speed of light is the fastest speed possible" + ], + "meta-llama/Llama-2-7b-hf": [ + "Simply put, the theory of relativity states that 1) the speed of light is a constant, and 2) " + "the laws of physics are the same for all", + ], + } + + for llama_model_ckp, EXPECTED_TEXT_COMPLETION in llama_models.items(): + # Load tokenizer + tokenizer = AutoTokenizer.from_pretrained(llama_model_ckp, pad_token="", padding_side="right") + max_generation_length = tokenizer(EXPECTED_TEXT_COMPLETION, return_tensors="pt", padding=True)[ + "input_ids" + ].shape[-1] + + # Load model + device = "cpu" + dtype = torch.bfloat16 + cache_implementation = "static" + attn_implementation = "sdpa" + batch_size = 1 + model = LlamaForCausalLM.from_pretrained( + llama_model_ckp, + device_map=device, + torch_dtype=dtype, + attn_implementation=attn_implementation, + generation_config=GenerationConfig( + use_cache=True, + cache_implementation=cache_implementation, + max_length=max_generation_length, + cache_config={ + "batch_size": batch_size, + "max_cache_len": max_generation_length, + }, + ), + ) + + prompts = ["Simply put, the theory of relativity states that "] + prompt_tokens = tokenizer(prompts, return_tensors="pt", padding=True).to(model.device) + prompt_token_ids = prompt_tokens["input_ids"] + max_new_tokens = max_generation_length - prompt_token_ids.shape[-1] + + # Static Cache + export + exported_program = convert_and_export_with_cache(model) + ep_generated_ids = TorchExportableModuleWithStaticCache.generate( + exported_program=exported_program, prompt_token_ids=prompt_token_ids, max_new_tokens=max_new_tokens + ) + ep_generated_text = tokenizer.batch_decode(ep_generated_ids, skip_special_tokens=True) + self.assertEqual(EXPECTED_TEXT_COMPLETION, ep_generated_text) + @slow @require_torch_accelerator From 1d2c29f0b32c084cf2e21890106681a4634ee41f Mon Sep 17 00:00:00 2001 From: David Chanin Date: Thu, 17 Oct 2024 09:39:04 -0600 Subject: [PATCH 042/385] Fix bus error when using GPT2 on M1 macs (#34031) There's a bug on M1 macs with transformer >= 4.43.0 and torch >= 2.1.0, where if a model has tied embeddings, then the fast loading from #31771 causes a bus error when the model is actually run. This can be solved by disabling `_supports_param_buffer_assignment` for these models. 
More info in comments in #33357 --- src/transformers/models/gpt2/modeling_gpt2.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/transformers/models/gpt2/modeling_gpt2.py b/src/transformers/models/gpt2/modeling_gpt2.py index b0c0f2c378b4ff..3e0c52e89a93d5 100644 --- a/src/transformers/models/gpt2/modeling_gpt2.py +++ b/src/transformers/models/gpt2/modeling_gpt2.py @@ -890,6 +890,8 @@ class GPT2DoubleHeadsModelOutput(ModelOutput): GPT2_START_DOCSTRING, ) class GPT2Model(GPT2PreTrainedModel): + _supports_param_buffer_assignment = False + def __init__(self, config): super().__init__(config) From f51ac9e059a78049362803c1d606a2c6a8160ee4 Mon Sep 17 00:00:00 2001 From: Joao Gante Date: Thu, 17 Oct 2024 16:53:48 +0100 Subject: [PATCH 043/385] Generate: visit non-llm `prepare_inputs_for_generation` (#34199) * tmp * all visited * test all * Update src/transformers/models/moshi/modeling_moshi.py Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> * delete another one :D --------- Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> --- src/transformers/generation/utils.py | 30 ++-- src/transformers/models/bark/modeling_bark.py | 1 + .../modeling_bigbird_pegasus.py | 17 -- .../models/bloom/modeling_bloom.py | 1 + .../models/chameleon/modeling_chameleon.py | 3 + .../models/codegen/modeling_codegen.py | 1 + .../models/cohere/modeling_cohere.py | 1 + src/transformers/models/dbrx/modeling_dbrx.py | 1 + .../modeling_encoder_decoder.py | 17 +- .../models/falcon/modeling_falcon.py | 1 + src/transformers/models/fuyu/modeling_fuyu.py | 2 + .../models/gemma/modeling_gemma.py | 1 + .../models/gemma2/modeling_gemma2.py | 1 + src/transformers/models/git/modeling_git.py | 2 + .../models/gpt_neo/modeling_gpt_neo.py | 1 + .../models/gpt_neox/modeling_gpt_neox.py | 1 + .../modeling_gpt_neox_japanese.py | 1 + src/transformers/models/gptj/modeling_gptj.py | 1 + .../models/granite/modeling_granite.py | 1 + .../models/granitemoe/modeling_granitemoe.py | 1 + .../models/idefics/modeling_idefics.py | 3 + .../models/idefics2/modeling_idefics2.py | 3 + .../models/idefics3/modeling_idefics3.py | 3 + .../models/jetmoe/modeling_jetmoe.py | 1 + .../models/kosmos2/modeling_kosmos2.py | 2 + .../models/llama/modeling_llama.py | 1 + .../models/llava/modeling_llava.py | 2 + .../models/llava_next/modeling_llava_next.py | 2 + .../modeling_llava_next_video.py | 2 + .../modular_llava_next_video.py | 2 + .../modeling_llava_onevision.py | 2 + .../models/mistral/modeling_mistral.py | 72 -------- .../models/mixtral/modeling_mixtral.py | 70 -------- .../models/mllama/modeling_mllama.py | 3 + .../models/moshi/modeling_moshi.py | 154 ++---------------- .../models/musicgen/modeling_musicgen.py | 2 + .../modeling_musicgen_melody.py | 2 + .../models/nemotron/modeling_nemotron.py | 1 + src/transformers/models/olmo/modeling_olmo.py | 1 + .../models/olmoe/modeling_olmoe.py | 1 + .../models/paligemma/modeling_paligemma.py | 27 +-- .../models/persimmon/modeling_persimmon.py | 1 + src/transformers/models/phi/modeling_phi.py | 1 + src/transformers/models/phi3/modeling_phi3.py | 67 ++------ .../models/phimoe/modeling_phimoe.py | 67 ++------ .../models/pix2struct/modeling_pix2struct.py | 43 ----- .../models/pop2piano/modeling_pop2piano.py | 27 --- .../models/qwen2/modeling_qwen2.py | 73 --------- .../qwen2_audio/modeling_qwen2_audio.py | 7 +- .../models/qwen2_moe/modeling_qwen2_moe.py | 73 --------- .../models/qwen2_vl/modeling_qwen2_vl.py | 2 + .../seamless_m4t/modeling_seamless_m4t.py | 132 --------------- 
.../modeling_seamless_m4t_v2.py | 114 ------------- .../modeling_speech_encoder_decoder.py | 17 +- .../speech_to_text/modeling_speech_to_text.py | 27 --- .../models/speecht5/modeling_speecht5.py | 2 + .../models/stablelm/modeling_stablelm.py | 1 + .../models/starcoder2/modeling_starcoder2.py | 73 --------- src/transformers/models/udop/modeling_udop.py | 30 ---- .../video_llava/modeling_video_llava.py | 2 + .../models/vipllava/modeling_vipllava.py | 2 + .../modeling_vision_encoder_decoder.py | 17 +- .../models/whisper/modeling_whisper.py | 45 +---- utils/check_copies.py | 10 +- 64 files changed, 140 insertions(+), 1134 deletions(-) diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py index 86ea702dd9f2fe..9ede527ecb7b80 100644 --- a/src/transformers/generation/utils.py +++ b/src/transformers/generation/utils.py @@ -390,13 +390,16 @@ def prepare_inputs_for_generation( # 3. Prepare base model inputs input_ids_key = "decoder_input_ids" if self.config.is_encoder_decoder else "input_ids" # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and not self.config.is_encoder_decoder and cache_position[0] == 0: - model_inputs[input_ids_key] = None - model_inputs["inputs_embeds"] = inputs_embeds + if not self.config.is_encoder_decoder: + if inputs_embeds is not None and cache_position[0] == 0: + model_inputs[input_ids_key] = None + model_inputs["inputs_embeds"] = inputs_embeds + else: + # `clone` calls in this function ensure a consistent stride. See #32227 + model_inputs[input_ids_key] = input_ids.clone(memory_format=torch.contiguous_format) + model_inputs["inputs_embeds"] = None else: - # `clone` calls in this function ensure a consistent stride. See #32227 model_inputs[input_ids_key] = input_ids.clone(memory_format=torch.contiguous_format) - model_inputs["inputs_embeds"] = None # 4. Create missing `position_ids` on the fly if ( @@ -428,10 +431,15 @@ def prepare_inputs_for_generation( # Create the causal mask with fixed shape in advance, to reduce recompilations. If the function to create # the 4D causal mask exists, it should be present in the base model (XXXModel class). 
- base_model = getattr(self, self.base_model_prefix) - causal_mask_creation_function = getattr( - base_model, "_prepare_4d_causal_attention_mask_with_cache_position", None - ) + base_model = getattr(self, self.base_model_prefix, None) + if base_model is None: + causal_mask_creation_function = getattr( + self, "_prepare_4d_causal_attention_mask_with_cache_position", None + ) + else: + causal_mask_creation_function = getattr( + base_model, "_prepare_4d_causal_attention_mask_with_cache_position", None + ) if causal_mask_creation_function is None: logger.warning_once( f"{self.__class__.__name__} has no `_prepare_4d_causal_attention_mask_with_cache_position` method " @@ -444,10 +452,12 @@ def prepare_inputs_for_generation( attention_mask, sequence_length=sequence_length, target_length=past_key_values.get_max_cache_shape(), - dtype=self.get_output_embeddings().weight.dtype, + dtype=self.dtype, device=device, cache_position=cache_position, batch_size=batch_size, + config=self.config, + past_key_values=past_key_values, ) if attention_mask is not None: model_inputs["attention_mask"] = attention_mask diff --git a/src/transformers/models/bark/modeling_bark.py b/src/transformers/models/bark/modeling_bark.py index ea420482379d7a..f1c77367e5beb7 100644 --- a/src/transformers/models/bark/modeling_bark.py +++ b/src/transformers/models/bark/modeling_bark.py @@ -578,6 +578,7 @@ def set_input_embeddings(self, new_embeddings): self.input_embeds_layer = new_embeddings def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwargs): + # Overwritten -- bark has a model-specific hack input_embeds = kwargs.get("input_embeds", None) attention_mask = kwargs.get("attention_mask", None) diff --git a/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py b/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py index 19540a7498f5bd..520e7dab1f119d 100755 --- a/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +++ b/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py @@ -3020,23 +3020,6 @@ def forward( cross_attentions=outputs.cross_attentions, ) - def prepare_inputs_for_generation( - self, input_ids, past_key_values=None, attention_mask=None, use_cache=None, **kwargs - ): - # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly - if attention_mask is None: - attention_mask = input_ids.new_ones(input_ids.shape) - - if past_key_values: - input_ids = input_ids[:, -1:] - # first step, decoder_cached_states are empty - return { - "input_ids": input_ids, # encoder_outputs is defined. 
input_ids not needed - "attention_mask": attention_mask, - "past_key_values": past_key_values, - "use_cache": use_cache, - } - @staticmethod def _reorder_cache(past_key_values, beam_idx): reordered_past = () diff --git a/src/transformers/models/bloom/modeling_bloom.py b/src/transformers/models/bloom/modeling_bloom.py index 75f8e5830f44bd..b0e9a4bbcb91bd 100644 --- a/src/transformers/models/bloom/modeling_bloom.py +++ b/src/transformers/models/bloom/modeling_bloom.py @@ -806,6 +806,7 @@ def _prepare_4d_causal_attention_mask_with_cache_position( device: torch.device, cache_position: torch.Tensor, batch_size: int, + **kwargs, ): """ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape diff --git a/src/transformers/models/chameleon/modeling_chameleon.py b/src/transformers/models/chameleon/modeling_chameleon.py index 20dbfc317e133d..d0b964a7a6f484 100644 --- a/src/transformers/models/chameleon/modeling_chameleon.py +++ b/src/transformers/models/chameleon/modeling_chameleon.py @@ -1451,6 +1451,7 @@ def _prepare_4d_causal_attention_mask_with_cache_position( device: torch.device, cache_position: torch.Tensor, batch_size: int, + **kwargs, ): """ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape @@ -1644,6 +1645,8 @@ def prepare_inputs_for_generation( use_cache=True, **kwargs, ): + # Overwritten -- in specific circumstances we don't want to forward image inputs to the model + # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens # Exception 1: when passing input_embeds, input_ids may be missing entries # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here diff --git a/src/transformers/models/codegen/modeling_codegen.py b/src/transformers/models/codegen/modeling_codegen.py index 478745b2c59ea4..616c93a46e4f4a 100644 --- a/src/transformers/models/codegen/modeling_codegen.py +++ b/src/transformers/models/codegen/modeling_codegen.py @@ -649,6 +649,7 @@ def _prepare_4d_causal_attention_mask_with_cache_position( device: torch.device, cache_position: torch.Tensor, batch_size: int, + **kwargs, ): """ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape diff --git a/src/transformers/models/cohere/modeling_cohere.py b/src/transformers/models/cohere/modeling_cohere.py index a5d3721f5bdb03..3c14a6d28dee54 100644 --- a/src/transformers/models/cohere/modeling_cohere.py +++ b/src/transformers/models/cohere/modeling_cohere.py @@ -1017,6 +1017,7 @@ def _prepare_4d_causal_attention_mask_with_cache_position( device: torch.device, cache_position: torch.Tensor, batch_size: int, + **kwargs, ): """ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape diff --git a/src/transformers/models/dbrx/modeling_dbrx.py b/src/transformers/models/dbrx/modeling_dbrx.py index ef81e43d0294f0..659fa154ecf776 100644 --- a/src/transformers/models/dbrx/modeling_dbrx.py +++ b/src/transformers/models/dbrx/modeling_dbrx.py @@ -1179,6 +1179,7 @@ def _prepare_4d_causal_attention_mask_with_cache_position( device: torch.device, cache_position: torch.Tensor, batch_size: int, + **kwargs, ): """ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape diff --git a/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py 
b/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py index 359a4eabcf7b3d..d1029160dd0cc2 100644 --- a/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py +++ b/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py @@ -26,6 +26,7 @@ from torch.nn import CrossEntropyLoss from ...configuration_utils import PretrainedConfig +from ...generation import GenerationMixin from ...modeling_outputs import BaseModelOutput, Seq2SeqLMOutput from ...modeling_utils import PreTrainedModel from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings @@ -166,7 +167,7 @@ def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start @add_start_docstrings(ENCODER_DECODER_START_DOCSTRING) -class EncoderDecoderModel(PreTrainedModel): +class EncoderDecoderModel(PreTrainedModel, GenerationMixin): r""" [`EncoderDecoderModel`] is a generic model class that will be instantiated as a transformer architecture with one of the base model classes of the library as encoder and another one as decoder when created with the @@ -666,20 +667,6 @@ def forward( def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor): return shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id) - def prepare_inputs_for_generation( - self, input_ids, past_key_values=None, attention_mask=None, use_cache=None, encoder_outputs=None, **kwargs - ): - decoder_inputs = self.decoder.prepare_inputs_for_generation(input_ids, past_key_values=past_key_values) - input_dict = { - "attention_mask": attention_mask, - "decoder_attention_mask": decoder_inputs.get("attention_mask"), - "decoder_input_ids": decoder_inputs["input_ids"], - "encoder_outputs": encoder_outputs, - "past_key_values": decoder_inputs.get("past_key_values"), - "use_cache": use_cache, - } - return input_dict - def resize_token_embeddings(self, *args, **kwargs): raise NotImplementedError( "Resizing the embedding layers via the EncoderDecoderModel directly is not supported. 
Please use the" diff --git a/src/transformers/models/falcon/modeling_falcon.py b/src/transformers/models/falcon/modeling_falcon.py index f48accab44bfc2..504dcf10b206c3 100644 --- a/src/transformers/models/falcon/modeling_falcon.py +++ b/src/transformers/models/falcon/modeling_falcon.py @@ -1188,6 +1188,7 @@ def _prepare_4d_causal_attention_mask_with_cache_position( device: torch.device, cache_position: torch.Tensor, batch_size: int, + **kwargs, ): """ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape diff --git a/src/transformers/models/fuyu/modeling_fuyu.py b/src/transformers/models/fuyu/modeling_fuyu.py index d2fd197073c032..c8c758e6888a59 100644 --- a/src/transformers/models/fuyu/modeling_fuyu.py +++ b/src/transformers/models/fuyu/modeling_fuyu.py @@ -344,6 +344,8 @@ def prepare_inputs_for_generation( image_patches_indices=None, **kwargs, ): + # Overwritten -- in specific circumstances we don't want to forward image inputs to the model + if past_key_values: input_ids = input_ids[:, -1:] diff --git a/src/transformers/models/gemma/modeling_gemma.py b/src/transformers/models/gemma/modeling_gemma.py index ff206a470bc3fa..f164c4add1fb5e 100644 --- a/src/transformers/models/gemma/modeling_gemma.py +++ b/src/transformers/models/gemma/modeling_gemma.py @@ -933,6 +933,7 @@ def _prepare_4d_causal_attention_mask_with_cache_position( device: torch.device, cache_position: torch.Tensor, batch_size: int, + **kwargs, ): """ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape diff --git a/src/transformers/models/gemma2/modeling_gemma2.py b/src/transformers/models/gemma2/modeling_gemma2.py index 0b99aa59c65b41..8f7e7364b54c95 100644 --- a/src/transformers/models/gemma2/modeling_gemma2.py +++ b/src/transformers/models/gemma2/modeling_gemma2.py @@ -905,6 +905,7 @@ def _prepare_4d_causal_attention_mask_with_cache_position( device: torch.device, cache_position: torch.Tensor, batch_size: int, + **kwargs, ): """ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape diff --git a/src/transformers/models/git/modeling_git.py b/src/transformers/models/git/modeling_git.py index c7f9ceafe19452..0b86a41378fe0f 100644 --- a/src/transformers/models/git/modeling_git.py +++ b/src/transformers/models/git/modeling_git.py @@ -1609,6 +1609,8 @@ def forward( def prepare_inputs_for_generation( self, input_ids, past_key_values=None, attention_mask=None, use_cache=None, **kwargs ): + # Overwritten -- `git` has special cache handling and doesn't support generating from `inputs_embeds` atm + # cut decoder_input_ids if past_key_values is used if past_key_values is not None: past_length = past_key_values.get_seq_length() diff --git a/src/transformers/models/gpt_neo/modeling_gpt_neo.py b/src/transformers/models/gpt_neo/modeling_gpt_neo.py index 7bba7608e6c187..28bfbabc1fd8e0 100755 --- a/src/transformers/models/gpt_neo/modeling_gpt_neo.py +++ b/src/transformers/models/gpt_neo/modeling_gpt_neo.py @@ -863,6 +863,7 @@ def _prepare_4d_causal_attention_mask_with_cache_position( device: torch.device, cache_position: torch.Tensor, batch_size: int, + **kwargs, ): """ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape diff --git a/src/transformers/models/gpt_neox/modeling_gpt_neox.py b/src/transformers/models/gpt_neox/modeling_gpt_neox.py index f4636db0a97b44..359996983eed74 100755 --- 
a/src/transformers/models/gpt_neox/modeling_gpt_neox.py +++ b/src/transformers/models/gpt_neox/modeling_gpt_neox.py @@ -1060,6 +1060,7 @@ def _prepare_4d_causal_attention_mask_with_cache_position( device: torch.device, cache_position: torch.Tensor, batch_size: int, + **kwargs, ): """ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape diff --git a/src/transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py b/src/transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py index b618f531e52f66..6c3f3313f57faf 100755 --- a/src/transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +++ b/src/transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py @@ -764,6 +764,7 @@ def _prepare_4d_causal_attention_mask_with_cache_position( device: torch.device, cache_position: torch.Tensor, batch_size: int, + **kwargs, ): """ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape diff --git a/src/transformers/models/gptj/modeling_gptj.py b/src/transformers/models/gptj/modeling_gptj.py index 5c80485823c10b..1cc9cf369d1887 100644 --- a/src/transformers/models/gptj/modeling_gptj.py +++ b/src/transformers/models/gptj/modeling_gptj.py @@ -958,6 +958,7 @@ def _prepare_4d_causal_attention_mask_with_cache_position( device: torch.device, cache_position: torch.Tensor, batch_size: int, + **kwargs, ): """ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape diff --git a/src/transformers/models/granite/modeling_granite.py b/src/transformers/models/granite/modeling_granite.py index 0eb27d452f08d2..bb8c157df30c89 100644 --- a/src/transformers/models/granite/modeling_granite.py +++ b/src/transformers/models/granite/modeling_granite.py @@ -958,6 +958,7 @@ def _prepare_4d_causal_attention_mask_with_cache_position( device: torch.device, cache_position: torch.Tensor, batch_size: int, + **kwargs, ): """ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape diff --git a/src/transformers/models/granitemoe/modeling_granitemoe.py b/src/transformers/models/granitemoe/modeling_granitemoe.py index ebdea826fa0450..f3e2d67734a703 100644 --- a/src/transformers/models/granitemoe/modeling_granitemoe.py +++ b/src/transformers/models/granitemoe/modeling_granitemoe.py @@ -1194,6 +1194,7 @@ def _prepare_4d_causal_attention_mask_with_cache_position( device: torch.device, cache_position: torch.Tensor, batch_size: int, + **kwargs, ): """ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape diff --git a/src/transformers/models/idefics/modeling_idefics.py b/src/transformers/models/idefics/modeling_idefics.py index 4dd5f36a93e166..bc983744559fc9 100644 --- a/src/transformers/models/idefics/modeling_idefics.py +++ b/src/transformers/models/idefics/modeling_idefics.py @@ -1441,6 +1441,7 @@ def _prepare_4d_causal_attention_mask_with_cache_position( device: torch.device, cache_position: torch.Tensor, batch_size: int, + **kwargs, ): """ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape @@ -1674,6 +1675,8 @@ def prepare_inputs_for_generation( use_cache=None, **kwargs, ): + # Overwritten -- custom processing based on `config.use_resampler` + model_inputs = {} if image_hidden_states is not None: if self.config.use_resampler: diff --git a/src/transformers/models/idefics2/modeling_idefics2.py 
b/src/transformers/models/idefics2/modeling_idefics2.py index d34e0acde4c814..daa8bfb055b561 100644 --- a/src/transformers/models/idefics2/modeling_idefics2.py +++ b/src/transformers/models/idefics2/modeling_idefics2.py @@ -1665,6 +1665,9 @@ def prepare_inputs_for_generation( num_logits_to_keep=None, **kwargs, ): + # Overwritten -- there are mutually exclusive inputs (if the logic to make `image_hidden_states` take + # precedence is moved to the model, we can remove this fn) + # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens if past_key_values is not None: if inputs_embeds is not None: # Exception 1 diff --git a/src/transformers/models/idefics3/modeling_idefics3.py b/src/transformers/models/idefics3/modeling_idefics3.py index e653fd3d2a6ba2..fb9f0a7c58fa5a 100644 --- a/src/transformers/models/idefics3/modeling_idefics3.py +++ b/src/transformers/models/idefics3/modeling_idefics3.py @@ -1256,6 +1256,9 @@ def prepare_inputs_for_generation( num_logits_to_keep=None, **kwargs, ): + # Overwritten -- there are mutually exclusive inputs (if the logic to make `image_hidden_states` take + # precedence is moved to the model, we can remove this fn) + # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens if past_key_values is not None: if inputs_embeds is not None: # Exception 1 diff --git a/src/transformers/models/jetmoe/modeling_jetmoe.py b/src/transformers/models/jetmoe/modeling_jetmoe.py index bbc70b26d1f8a9..805c82be3881bc 100644 --- a/src/transformers/models/jetmoe/modeling_jetmoe.py +++ b/src/transformers/models/jetmoe/modeling_jetmoe.py @@ -1160,6 +1160,7 @@ def _prepare_4d_causal_attention_mask_with_cache_position( device: torch.device, cache_position: torch.Tensor, batch_size: int, + **kwargs, ): """ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape diff --git a/src/transformers/models/kosmos2/modeling_kosmos2.py b/src/transformers/models/kosmos2/modeling_kosmos2.py index 7674e29db6b915..ffd8277f0268a3 100644 --- a/src/transformers/models/kosmos2/modeling_kosmos2.py +++ b/src/transformers/models/kosmos2/modeling_kosmos2.py @@ -1696,6 +1696,8 @@ def prepare_inputs_for_generation( use_cache=None, **model_kwargs, ): + # Overwritten -- in specific circumstances we don't want to forward image inputs to the model + input_shape = input_ids.shape # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly if attention_mask is None: diff --git a/src/transformers/models/llama/modeling_llama.py b/src/transformers/models/llama/modeling_llama.py index dde017bbb92797..40db21aeaea7d1 100644 --- a/src/transformers/models/llama/modeling_llama.py +++ b/src/transformers/models/llama/modeling_llama.py @@ -1053,6 +1053,7 @@ def _prepare_4d_causal_attention_mask_with_cache_position( device: torch.device, cache_position: torch.Tensor, batch_size: int, + **kwargs, ): """ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape diff --git a/src/transformers/models/llava/modeling_llava.py b/src/transformers/models/llava/modeling_llava.py index 411b96f5c57a50..31593bc62d098c 100644 --- a/src/transformers/models/llava/modeling_llava.py +++ b/src/transformers/models/llava/modeling_llava.py @@ -590,6 +590,8 @@ def prepare_inputs_for_generation( num_logits_to_keep=None, **kwargs, ): + # Overwritten -- in specific circumstances we don't want to forward image inputs 
to the model + # Trigger the new behavior if we have more than image embeddings seq length tokens for images legacy_processing = ( input_ids is not None diff --git a/src/transformers/models/llava_next/modeling_llava_next.py b/src/transformers/models/llava_next/modeling_llava_next.py index 75dfcf5393ea15..03ab28dfff9cb1 100644 --- a/src/transformers/models/llava_next/modeling_llava_next.py +++ b/src/transformers/models/llava_next/modeling_llava_next.py @@ -968,6 +968,8 @@ def prepare_inputs_for_generation( num_logits_to_keep=None, **kwargs, ): + # Overwritten -- in specific circumstances we don't want to forward image inputs to the model + legacy_processing = ( input_ids is not None and (input_ids == self.config.image_token_index).sum(1).max() < self.config.image_seq_length diff --git a/src/transformers/models/llava_next_video/modeling_llava_next_video.py b/src/transformers/models/llava_next_video/modeling_llava_next_video.py index 30257b84397814..3fd6bb47fc7661 100644 --- a/src/transformers/models/llava_next_video/modeling_llava_next_video.py +++ b/src/transformers/models/llava_next_video/modeling_llava_next_video.py @@ -1057,6 +1057,8 @@ def prepare_inputs_for_generation( num_logits_to_keep=None, **kwargs, ): + # Overwritten -- extra custom processing + if input_ids is not None: img_token_not_enough = (input_ids == self.config.image_token_index).sum( 1 diff --git a/src/transformers/models/llava_next_video/modular_llava_next_video.py b/src/transformers/models/llava_next_video/modular_llava_next_video.py index e7de66de444af7..ec5a05733ec878 100644 --- a/src/transformers/models/llava_next_video/modular_llava_next_video.py +++ b/src/transformers/models/llava_next_video/modular_llava_next_video.py @@ -572,6 +572,8 @@ def prepare_inputs_for_generation( num_logits_to_keep=None, **kwargs, ): + # Overwritten -- extra custom processing + if input_ids is not None: img_token_not_enough = (input_ids == self.config.image_token_index).sum( 1 diff --git a/src/transformers/models/llava_onevision/modeling_llava_onevision.py b/src/transformers/models/llava_onevision/modeling_llava_onevision.py index 3eefb517b16d9f..7bacd2a54fc97f 100644 --- a/src/transformers/models/llava_onevision/modeling_llava_onevision.py +++ b/src/transformers/models/llava_onevision/modeling_llava_onevision.py @@ -728,6 +728,8 @@ def prepare_inputs_for_generation( num_logits_to_keep=None, **kwargs, ): + # Overwritten -- in specific circumstances we don't want to forward image inputs to the model + model_inputs = self.language_model.prepare_inputs_for_generation( input_ids, past_key_values=past_key_values, diff --git a/src/transformers/models/mistral/modeling_mistral.py b/src/transformers/models/mistral/modeling_mistral.py index 1bb7a3f109ef9b..3f26b5fe03d9f1 100644 --- a/src/transformers/models/mistral/modeling_mistral.py +++ b/src/transformers/models/mistral/modeling_mistral.py @@ -1129,78 +1129,6 @@ def forward( attentions=outputs.attentions, ) - def prepare_inputs_for_generation( - self, - input_ids, - past_key_values=None, - attention_mask=None, - inputs_embeds=None, - cache_position=None, - position_ids=None, - use_cache=True, - num_logits_to_keep=None, - **kwargs, - ): - # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens - # Exception 1: when passing input_embeds, input_ids may be missing entries - # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here - if past_key_values is not None: - if inputs_embeds is not None: # Exception 
1 - input_ids = input_ids[:, -cache_position.shape[0] :] - elif input_ids.shape[1] != cache_position.shape[0]: # Default case (the "else", a no op, is Exception 2) - input_ids = input_ids[:, cache_position] - - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - position_ids = position_ids[:, -input_ids.shape[1] :] - - # This `clone` call is needed to avoid recapturing cuda graphs with `torch.compile`'s `mode="reduce-overhead`, as otherwise the input `position_ids` would have various stride during the decoding. Here, simply using `.contiguous()` is not sufficient as in the batch size = 1 case, `position_ids` is already contiguous but with varying stride which retriggers a capture. - position_ids = position_ids.clone(memory_format=torch.contiguous_format) - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and cache_position[0] == 0: - model_inputs = {"inputs_embeds": inputs_embeds, "input_ids": None} - else: - # `contiguous()` needed for compilation use cases - model_inputs = {"input_ids": input_ids.contiguous(), "inputs_embeds": None} - - if isinstance(past_key_values, StaticCache) and attention_mask.ndim == 2: - if model_inputs["inputs_embeds"] is not None: - batch_size, sequence_length, _ = model_inputs["inputs_embeds"].shape - device = model_inputs["inputs_embeds"].device - else: - batch_size, sequence_length = model_inputs["input_ids"].shape - device = model_inputs["input_ids"].device - - attention_mask = self.model._prepare_4d_causal_attention_mask_with_cache_position( - attention_mask, - sequence_length=sequence_length, - target_length=past_key_values.get_max_cache_shape(), - dtype=self.lm_head.weight.dtype, - device=device, - cache_position=cache_position, - batch_size=batch_size, - config=self.config, - past_key_values=past_key_values, - ) - - if num_logits_to_keep is not None: - model_inputs["num_logits_to_keep"] = num_logits_to_keep - - model_inputs.update( - { - "position_ids": position_ids, - "cache_position": cache_position, - "past_key_values": past_key_values, - "use_cache": use_cache, - "attention_mask": attention_mask, - } - ) - return model_inputs - @add_start_docstrings( """ diff --git a/src/transformers/models/mixtral/modeling_mixtral.py b/src/transformers/models/mixtral/modeling_mixtral.py index 9bb0654f030ba7..9248ad2187c38a 100644 --- a/src/transformers/models/mixtral/modeling_mixtral.py +++ b/src/transformers/models/mixtral/modeling_mixtral.py @@ -1367,76 +1367,6 @@ def forward( router_logits=outputs.router_logits, ) - def prepare_inputs_for_generation( - self, - input_ids, - past_key_values=None, - attention_mask=None, - inputs_embeds=None, - cache_position=None, - output_router_logits=False, - position_ids=None, - use_cache=True, - num_logits_to_keep=None, - **kwargs, - ): - # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens - # Exception 1: when passing input_embeds, input_ids may be missing entries - # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here - if past_key_values is not None: - if inputs_embeds is not None: # Exception 1 - input_ids = input_ids[:, -cache_position.shape[0] :] - elif input_ids.shape[1] != cache_position.shape[0]: # Default case (the "else", a no op, is Exception 2) - input_ids = 
input_ids[:, cache_position] - - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - position_ids = position_ids[:, -input_ids.shape[1] :] - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and cache_position[0] == 0: - model_inputs = {"inputs_embeds": inputs_embeds} - else: - model_inputs = {"input_ids": input_ids.contiguous()} # `contiguous()` needed for compilation use cases - - if isinstance(past_key_values, StaticCache) and attention_mask.ndim == 2: - if model_inputs["inputs_embeds"] is not None: - batch_size, sequence_length, _ = model_inputs["inputs_embeds"].shape - device = model_inputs["inputs_embeds"].device - else: - batch_size, sequence_length = model_inputs["input_ids"].shape - device = model_inputs["input_ids"].device - - attention_mask = self.model._prepare_4d_causal_attention_mask_with_cache_position( - attention_mask, - sequence_length=sequence_length, - target_length=past_key_values.get_max_cache_shape(), - dtype=self.lm_head.weight.dtype, - device=device, - cache_position=cache_position, - batch_size=batch_size, - config=self.config, - past_key_values=past_key_values, - ) - - if num_logits_to_keep is not None: - model_inputs["num_logits_to_keep"] = num_logits_to_keep - - model_inputs.update( - { - "position_ids": position_ids, - "cache_position": cache_position, - "past_key_values": past_key_values, - "use_cache": use_cache, - "attention_mask": attention_mask, - "output_router_logits": output_router_logits, - } - ) - return model_inputs - @add_start_docstrings( """ diff --git a/src/transformers/models/mllama/modeling_mllama.py b/src/transformers/models/mllama/modeling_mllama.py index e486e149e3e660..b8d2879612aad2 100644 --- a/src/transformers/models/mllama/modeling_mllama.py +++ b/src/transformers/models/mllama/modeling_mllama.py @@ -1785,6 +1785,7 @@ def _prepare_4d_causal_attention_mask_with_cache_position( device: torch.device, cache_position: torch.Tensor, batch_size: int, + **kwargs, ): """ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape @@ -2171,6 +2172,8 @@ def prepare_inputs_for_generation( num_logits_to_keep=None, **kwargs, ): + # Overwritten -- in specific circumstances we don't want to forward image inputs to the model + # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens # Exception 1: when passing input_embeds, input_ids may be missing entries # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here diff --git a/src/transformers/models/moshi/modeling_moshi.py b/src/transformers/models/moshi/modeling_moshi.py index 5746a5934bd31f..97200b7d042e61 100644 --- a/src/transformers/models/moshi/modeling_moshi.py +++ b/src/transformers/models/moshi/modeling_moshi.py @@ -1400,90 +1400,6 @@ def _prepare_4d_causal_attention_mask_with_cache_position( ) return causal_mask - def prepare_inputs_for_generation( - self, - input_ids: torch.LongTensor, - past_key_values: Optional[Cache] = None, - attention_mask: Optional[torch.LongTensor] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - cache_position: Optional[torch.LongTensor] = None, - position_ids: Optional[torch.LongTensor] = None, - use_cache: bool = True, - num_logits_to_keep: Optional[int] = 
None, - **kwargs, - ): - """ - Prepare the model inputs for generation. In includes operations like computing the 4D attention mask or - slicing inputs given the existing cache. - See the documentation in the used model for the arguments (different models might have different requirements - for e.g. `past_key_values`). Should work as is for most LLMs. - """ - # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens - # Exception 1: when passing input_embeds, input_ids may be missing entries - # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here - if past_key_values is not None: - if inputs_embeds is not None: # Exception 1 - input_ids = input_ids[:, -cache_position.shape[0] :] - elif input_ids.shape[1] != cache_position.shape[0]: # Default case (the "else", a no op, is Exception 2) - input_ids = input_ids[:, cache_position] - - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - position_ids = position_ids[:, -input_ids.shape[1] :] - - # This `clone` call is needed to avoid recapturing cuda graphs with `torch.compile`'s - # `mode="reduce-overhead`, as otherwise the input `position_ids` would have various stride during the - # decoding. Here, simply using `.contiguous()` is not sufficient as in the batch size = 1 case, - # `position_ids` is already contiguous but with varying stride which retriggers a capture. - position_ids = position_ids.clone(memory_format=torch.contiguous_format) - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and cache_position[0] == 0: - model_inputs = {"inputs_embeds": inputs_embeds, "input_ids": None} - else: - # The clone here is for the same reason as for `position_ids`. - model_inputs = {"input_ids": input_ids.clone(memory_format=torch.contiguous_format), "inputs_embeds": None} - - if isinstance(past_key_values, StaticCache) and attention_mask.ndim == 2: - if model_inputs["inputs_embeds"] is not None: - batch_size, sequence_length, _ = model_inputs["inputs_embeds"].shape - device = model_inputs["inputs_embeds"].device - else: - batch_size, sequence_length = model_inputs["input_ids"].shape - device = model_inputs["input_ids"].device - - # Create the causal mask with fixed shape in advance, to reduce recompilations. If the function to create - # the 4D causal mask exists, it should be present in the base model (XXXModel class). 
- attention_mask = self._prepare_4d_causal_attention_mask_with_cache_position( - attention_mask, - sequence_length=sequence_length, - target_length=past_key_values.max_cache_len, - dtype=self.text_embed_tokens.weight.dtype, - device=device, - cache_position=cache_position, - batch_size=batch_size, - config=self.config, - past_key_values=past_key_values, - ) - - if num_logits_to_keep is not None: - model_inputs["num_logits_to_keep"] = num_logits_to_keep - - model_inputs.update( - { - "position_ids": position_ids, - "cache_position": cache_position, - "past_key_values": past_key_values, - "use_cache": use_cache, - "attention_mask": attention_mask, - "last_hidden_state": kwargs.get("last_hidden_state"), - } - ) - return model_inputs - @add_start_docstrings( "The bare Moshi Model outputting raw hidden-states without any specific head on top.", @@ -2492,66 +2408,18 @@ def prepare_inputs_for_generation( blank_user_audio_codes: Optional[torch.FloatTensor] = None, **kwargs, ): + # Overwritten -- Moshi has custom post-processing # 1. Do usual operations done on LLMs like Gemma - because we pre-processed inputs, the first pass always has inputs_embeds - - # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens - # Exception 1: when passing input_embeds, input_ids may be missing entries - # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here - if past_key_values is not None: - if inputs_embeds is not None: # Exception 1 - input_ids = input_ids[:, -cache_position.shape[0] :] - elif input_ids.shape[1] != cache_position.shape[0]: # Default case (the "else", a no op, is Exception 2) - input_ids = input_ids[:, cache_position] - - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - position_ids = position_ids[:, -input_ids.shape[1] :] - # This `clone` call is needed to avoid recapturing cuda graphs with `torch.compile`'s `mode="reduce-overhead`, as otherwise the input `position_ids` would have various stride during the decoding. Here, simply using `.contiguous()` is not sufficient as in the batch size = 1 case, `position_ids` is already contiguous but with varying stride which retriggers a capture. - position_ids = position_ids.clone(memory_format=torch.contiguous_format) - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and cache_position[0] == 0: - model_inputs = {"inputs_embeds": inputs_embeds, "input_ids": None} - else: - # The clone here is for the same reason as for `position_ids`. 
- model_inputs = {"input_ids": input_ids.clone(memory_format=torch.contiguous_format), "inputs_embeds": None} - - if isinstance(past_key_values, StaticCache) and attention_mask.ndim == 2: - if model_inputs["inputs_embeds"] is not None: - batch_size, sequence_length, _ = model_inputs["inputs_embeds"].shape - device = model_inputs["inputs_embeds"].device - else: - batch_size, sequence_length = model_inputs["input_ids"].shape - device = model_inputs["input_ids"].device - - dtype = self.decoder.dtype - - attention_mask = self.decoder.model._prepare_4d_causal_attention_mask_with_cache_position( - attention_mask, - sequence_length=sequence_length, - target_length=past_key_values.max_cache_len, - dtype=dtype, - device=device, - cache_position=cache_position, - batch_size=batch_size, - config=self.config, - past_key_values=past_key_values, - ) - - if num_logits_to_keep is not None: - model_inputs["num_logits_to_keep"] = num_logits_to_keep - - model_inputs.update( - { - "position_ids": position_ids, - "cache_position": cache_position, - "past_key_values": past_key_values, - "use_cache": use_cache, - "attention_mask": attention_mask, - } + model_inputs = super().prepare_inputs_for_generation( + input_ids=input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + cache_position=cache_position, + position_ids=position_ids, + use_cache=use_cache, + num_logits_to_keep=num_logits_to_keep, + **kwargs, ) # 2. Now that everything is prepared, generate audio_codes using the depth decoder diff --git a/src/transformers/models/musicgen/modeling_musicgen.py b/src/transformers/models/musicgen/modeling_musicgen.py index 8d7f6ad3c7c682..626097f5c7cbcc 100644 --- a/src/transformers/models/musicgen/modeling_musicgen.py +++ b/src/transformers/models/musicgen/modeling_musicgen.py @@ -1345,6 +1345,7 @@ def prepare_inputs_for_generation( guidance_scale=None, **kwargs, ): + # Overwritten -- MusicGen has custom processing if delay_pattern_mask is None: input_ids, delay_pattern_mask = self.build_delay_pattern_mask( input_ids, @@ -2180,6 +2181,7 @@ def prepare_inputs_for_generation( guidance_scale=None, **kwargs, ): + # Overwritten -- MusicGen has custom processing if decoder_delay_pattern_mask is None: decoder_input_ids, decoder_delay_pattern_mask = self.decoder.build_delay_pattern_mask( decoder_input_ids, diff --git a/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py b/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py index 96b8d29db83da1..166623796d65d0 100644 --- a/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py +++ b/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py @@ -1254,6 +1254,7 @@ def prepare_inputs_for_generation( guidance_scale=None, **kwargs, ): + # Overwritten -- MusicGen has custom processing if delay_pattern_mask is None: input_ids, delay_pattern_mask = self.build_delay_pattern_mask( input_ids, @@ -2058,6 +2059,7 @@ def prepare_inputs_for_generation( guidance_scale=None, **kwargs, ): + # Overwritten -- MusicGen has custom processing if decoder_delay_pattern_mask is None: decoder_input_ids, decoder_delay_pattern_mask = self.decoder.build_delay_pattern_mask( decoder_input_ids, diff --git a/src/transformers/models/nemotron/modeling_nemotron.py b/src/transformers/models/nemotron/modeling_nemotron.py index 7d0390adc3c06f..aa2fd93fbe916a 100644 --- a/src/transformers/models/nemotron/modeling_nemotron.py +++ b/src/transformers/models/nemotron/modeling_nemotron.py @@ -931,6 +931,7 @@ def 
_prepare_4d_causal_attention_mask_with_cache_position( device: torch.device, cache_position: torch.Tensor, batch_size: int, + **kwargs, ): """ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape diff --git a/src/transformers/models/olmo/modeling_olmo.py b/src/transformers/models/olmo/modeling_olmo.py index 7ab54146c9740b..ff45fffb6e7396 100644 --- a/src/transformers/models/olmo/modeling_olmo.py +++ b/src/transformers/models/olmo/modeling_olmo.py @@ -973,6 +973,7 @@ def _prepare_4d_causal_attention_mask_with_cache_position( device: torch.device, cache_position: torch.Tensor, batch_size: int, + **kwargs, ): """ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape diff --git a/src/transformers/models/olmoe/modeling_olmoe.py b/src/transformers/models/olmoe/modeling_olmoe.py index 8c29f89ff3e7ea..d9d4a9771cd79d 100644 --- a/src/transformers/models/olmoe/modeling_olmoe.py +++ b/src/transformers/models/olmoe/modeling_olmoe.py @@ -1131,6 +1131,7 @@ def _prepare_4d_causal_attention_mask_with_cache_position( device: torch.device, cache_position: torch.Tensor, batch_size: int, + **kwargs, ): """ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape diff --git a/src/transformers/models/paligemma/modeling_paligemma.py b/src/transformers/models/paligemma/modeling_paligemma.py index d75a05bda0e1ec..0eb2d50e0ad4c4 100644 --- a/src/transformers/models/paligemma/modeling_paligemma.py +++ b/src/transformers/models/paligemma/modeling_paligemma.py @@ -59,6 +59,7 @@ def _prepare_4d_causal_attention_mask_with_cache_position( batch_size: int, is_training: bool = False, token_type_ids: torch.Tensor = None, + **kwargs, ): """ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape @@ -572,6 +573,7 @@ def prepare_inputs_for_generation( num_logits_to_keep=None, **kwargs, ): + # Overwritten -- custom `position_ids` and `pixel_values` handling model_inputs = self.language_model.prepare_inputs_for_generation( input_ids, past_key_values=past_key_values, @@ -581,33 +583,10 @@ def prepare_inputs_for_generation( cache_position=cache_position, use_cache=use_cache, num_logits_to_keep=num_logits_to_keep, + token_type_ids=token_type_ids, **kwargs, ) - if isinstance(past_key_values, StaticCache) and attention_mask.ndim == 2: - if model_inputs["inputs_embeds"] is not None: - batch_size, sequence_length, _ = model_inputs["inputs_embeds"].shape - device = model_inputs["inputs_embeds"].device - else: - batch_size, sequence_length = model_inputs["input_ids"].shape - device = model_inputs["input_ids"].device - - dtype = self.get_output_embeddings().weight.dtype - min_dtype = torch.finfo(dtype).min - - model_inputs["attention_mask"] = _prepare_4d_causal_attention_mask_with_cache_position( - attention_mask, - sequence_length=sequence_length, - target_length=past_key_values.get_max_length(), - dtype=dtype, - device=device, - min_dtype=min_dtype, - cache_position=cache_position, - batch_size=batch_size, - ) - - model_inputs["token_type_ids"] = token_type_ids - # position_ids in Paligemma are 1-indexed if model_inputs.get("position_ids") is not None: model_inputs["position_ids"] += 1 diff --git a/src/transformers/models/persimmon/modeling_persimmon.py b/src/transformers/models/persimmon/modeling_persimmon.py index 7ae3469a4c9399..61d8b1002f3c15 100644 --- a/src/transformers/models/persimmon/modeling_persimmon.py +++ 
b/src/transformers/models/persimmon/modeling_persimmon.py @@ -800,6 +800,7 @@ def _prepare_4d_causal_attention_mask_with_cache_position( device: torch.device, cache_position: torch.Tensor, batch_size: int, + **kwargs, ): """ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape diff --git a/src/transformers/models/phi/modeling_phi.py b/src/transformers/models/phi/modeling_phi.py index 3f770c9ec00b9b..807b3fef4f44a0 100644 --- a/src/transformers/models/phi/modeling_phi.py +++ b/src/transformers/models/phi/modeling_phi.py @@ -1091,6 +1091,7 @@ def _prepare_4d_causal_attention_mask_with_cache_position( device: torch.device, cache_position: torch.Tensor, batch_size: int, + **kwargs, ): """ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape diff --git a/src/transformers/models/phi3/modeling_phi3.py b/src/transformers/models/phi3/modeling_phi3.py index 0380c6cd49d6ea..a0b4b2ec378e32 100644 --- a/src/transformers/models/phi3/modeling_phi3.py +++ b/src/transformers/models/phi3/modeling_phi3.py @@ -1351,63 +1351,16 @@ def prepare_inputs_for_generation( if past_length <= self.config.original_max_position_embeddings: past_key_values = None - # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens - # Exception 1: when passing input_embeds, input_ids may be missing entries - # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here - if past_key_values is not None: - if inputs_embeds is not None: # Exception 1 - input_ids = input_ids[:, -cache_position.shape[0] :] - elif input_ids.shape[1] != cache_position.shape[0]: # Default case (the "else", a no op, is Exception 2) - input_ids = input_ids[:, cache_position] - - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - position_ids = position_ids[:, -input_ids.shape[1] :] - - # This `clone` call is needed to avoid recapturing cuda graphs with `torch.compile`'s `mode="reduce-overhead`, as otherwise the input `position_ids` would have various stride during the decoding. Here, simply using `.contiguous()` is not sufficient as in the batch size = 1 case, `position_ids` is already contiguous but with varying stride which retriggers a capture. - position_ids = position_ids.clone(memory_format=torch.contiguous_format) - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and cache_position[0] == 0: - model_inputs = {"inputs_embeds": inputs_embeds, "input_ids": None} - else: - # The clone here is for the same reason as for `position_ids`. 
- model_inputs = {"input_ids": input_ids.clone(memory_format=torch.contiguous_format), "inputs_embeds": None} - - if isinstance(past_key_values, StaticCache) and attention_mask.ndim == 2: - if model_inputs["inputs_embeds"] is not None: - batch_size, sequence_length, _ = model_inputs["inputs_embeds"].shape - device = model_inputs["inputs_embeds"].device - else: - batch_size, sequence_length = model_inputs["input_ids"].shape - device = model_inputs["input_ids"].device - - attention_mask = self.model._prepare_4d_causal_attention_mask_with_cache_position( - attention_mask, - sequence_length=sequence_length, - target_length=past_key_values.get_max_cache_shape(), - dtype=self.lm_head.weight.dtype, - device=device, - cache_position=cache_position, - batch_size=batch_size, - config=self.config, - past_key_values=past_key_values, - ) - - if num_logits_to_keep is not None: - model_inputs["num_logits_to_keep"] = num_logits_to_keep - - model_inputs.update( - { - "position_ids": position_ids, - "cache_position": cache_position, - "past_key_values": past_key_values, - "use_cache": use_cache, - "attention_mask": attention_mask, - } + model_inputs = super().prepare_inputs_for_generation( + input_ids=input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + cache_position=cache_position, + position_ids=position_ids, + use_cache=use_cache, + num_logits_to_keep=num_logits_to_keep, + **kwargs, ) return model_inputs diff --git a/src/transformers/models/phimoe/modeling_phimoe.py b/src/transformers/models/phimoe/modeling_phimoe.py index d1705f04ddb7bb..1da65d7d39be4b 100644 --- a/src/transformers/models/phimoe/modeling_phimoe.py +++ b/src/transformers/models/phimoe/modeling_phimoe.py @@ -1541,63 +1541,16 @@ def prepare_inputs_for_generation( if past_length <= self.config.original_max_position_embeddings: past_key_values = None - # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens - # Exception 1: when passing input_embeds, input_ids may be missing entries - # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here - if past_key_values is not None: - if inputs_embeds is not None: # Exception 1 - input_ids = input_ids[:, -cache_position.shape[0] :] - elif input_ids.shape[1] != cache_position.shape[0]: # Default case (the "else", a no op, is Exception 2) - input_ids = input_ids[:, cache_position] - - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - position_ids = position_ids[:, -input_ids.shape[1] :] - - # This `clone` call is needed to avoid recapturing cuda graphs with `torch.compile`'s `mode="reduce-overhead`, as otherwise the input `position_ids` would have various stride during the decoding. Here, simply using `.contiguous()` is not sufficient as in the batch size = 1 case, `position_ids` is already contiguous but with varying stride which retriggers a capture. - position_ids = position_ids.clone(memory_format=torch.contiguous_format) - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and cache_position[0] == 0: - model_inputs = {"inputs_embeds": inputs_embeds, "input_ids": None} - else: - # The clone here is for the same reason as for `position_ids`. 
- model_inputs = {"input_ids": input_ids.clone(memory_format=torch.contiguous_format), "inputs_embeds": None} - - if isinstance(past_key_values, StaticCache) and attention_mask.ndim == 2: - if model_inputs["inputs_embeds"] is not None: - batch_size, sequence_length, _ = model_inputs["inputs_embeds"].shape - device = model_inputs["inputs_embeds"].device - else: - batch_size, sequence_length = model_inputs["input_ids"].shape - device = model_inputs["input_ids"].device - - attention_mask = self.model._prepare_4d_causal_attention_mask_with_cache_position( - attention_mask, - sequence_length=sequence_length, - target_length=past_key_values.get_max_cache_shape(), - dtype=self.lm_head.weight.dtype, - device=device, - cache_position=cache_position, - batch_size=batch_size, - config=self.config, - past_key_values=past_key_values, - ) - - if num_logits_to_keep is not None: - model_inputs["num_logits_to_keep"] = num_logits_to_keep - - model_inputs.update( - { - "position_ids": position_ids, - "cache_position": cache_position, - "past_key_values": past_key_values, - "use_cache": use_cache, - "attention_mask": attention_mask, - } + model_inputs = super().prepare_inputs_for_generation( + input_ids=input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + cache_position=cache_position, + position_ids=position_ids, + use_cache=use_cache, + num_logits_to_keep=num_logits_to_keep, + **kwargs, ) return model_inputs diff --git a/src/transformers/models/pix2struct/modeling_pix2struct.py b/src/transformers/models/pix2struct/modeling_pix2struct.py index f209d7d8828785..37090677a6e254 100644 --- a/src/transformers/models/pix2struct/modeling_pix2struct.py +++ b/src/transformers/models/pix2struct/modeling_pix2struct.py @@ -1739,46 +1739,3 @@ def forward( encoder_hidden_states=encoder_outputs.hidden_states, encoder_attentions=encoder_outputs.attentions, ) - - def prepare_inputs_for_generation( - self, - input_ids, - flattened_patches: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, - decoder_attention_mask: Optional[torch.BoolTensor] = None, - past_key_values=None, - head_mask=None, - decoder_head_mask=None, - cross_attn_head_mask=None, - use_cache=None, - encoder_outputs=None, - **kwargs, - ): - if decoder_attention_mask is None: - decoder_attention_mask = torch.ones_like(input_ids).to(input_ids.device) - - # cut decoder_input_ids if past_key_values is used - if past_key_values is not None: - past_length = past_key_values[0][0].shape[2] - - # Some generation methods already pass only the last input ID - if input_ids.shape[1] > past_length: - remove_prefix_length = past_length - else: - # Default to old behavior: keep only final ID - remove_prefix_length = input_ids.shape[1] - 1 - - input_ids = input_ids[:, remove_prefix_length:] - - return { - "flattened_patches": flattened_patches, - "decoder_input_ids": input_ids, - "past_key_values": past_key_values, - "encoder_outputs": encoder_outputs, - "attention_mask": attention_mask, - "decoder_attention_mask": decoder_attention_mask, - "head_mask": head_mask, - "decoder_head_mask": decoder_head_mask, - "cross_attn_head_mask": cross_attn_head_mask, - "use_cache": use_cache, - } diff --git a/src/transformers/models/pop2piano/modeling_pop2piano.py b/src/transformers/models/pop2piano/modeling_pop2piano.py index e6488898e8a938..d6f92e9fe03495 100644 --- a/src/transformers/models/pop2piano/modeling_pop2piano.py +++ b/src/transformers/models/pop2piano/modeling_pop2piano.py @@ -1299,33 
+1299,6 @@ def generate( **kwargs, ) - def prepare_inputs_for_generation( - self, - input_ids, - past_key_values=None, - attention_mask=None, - head_mask=None, - decoder_head_mask=None, - cross_attn_head_mask=None, - use_cache=None, - encoder_outputs=None, - **kwargs, - ): - # cut decoder_input_ids if past is used - if past_key_values is not None: - input_ids = input_ids[:, -1:] - - return { - "decoder_input_ids": input_ids, - "past_key_values": past_key_values, - "encoder_outputs": encoder_outputs, - "attention_mask": attention_mask, - "head_mask": head_mask, - "decoder_head_mask": decoder_head_mask, - "cross_attn_head_mask": cross_attn_head_mask, - "use_cache": use_cache, - } - def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor): return self._shift_right(labels) diff --git a/src/transformers/models/qwen2/modeling_qwen2.py b/src/transformers/models/qwen2/modeling_qwen2.py index 2585352fc9594d..2e59ebd5eb98d1 100644 --- a/src/transformers/models/qwen2/modeling_qwen2.py +++ b/src/transformers/models/qwen2/modeling_qwen2.py @@ -1229,79 +1229,6 @@ def forward( attentions=outputs.attentions, ) - # Copied from transformers.models.mistral.modeling_mistral.MistralForCausalLM.prepare_inputs_for_generation - def prepare_inputs_for_generation( - self, - input_ids, - past_key_values=None, - attention_mask=None, - inputs_embeds=None, - cache_position=None, - position_ids=None, - use_cache=True, - num_logits_to_keep=None, - **kwargs, - ): - # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens - # Exception 1: when passing input_embeds, input_ids may be missing entries - # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here - if past_key_values is not None: - if inputs_embeds is not None: # Exception 1 - input_ids = input_ids[:, -cache_position.shape[0] :] - elif input_ids.shape[1] != cache_position.shape[0]: # Default case (the "else", a no op, is Exception 2) - input_ids = input_ids[:, cache_position] - - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - position_ids = position_ids[:, -input_ids.shape[1] :] - - # This `clone` call is needed to avoid recapturing cuda graphs with `torch.compile`'s `mode="reduce-overhead`, as otherwise the input `position_ids` would have various stride during the decoding. Here, simply using `.contiguous()` is not sufficient as in the batch size = 1 case, `position_ids` is already contiguous but with varying stride which retriggers a capture. 
- position_ids = position_ids.clone(memory_format=torch.contiguous_format) - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and cache_position[0] == 0: - model_inputs = {"inputs_embeds": inputs_embeds, "input_ids": None} - else: - # `contiguous()` needed for compilation use cases - model_inputs = {"input_ids": input_ids.contiguous(), "inputs_embeds": None} - - if isinstance(past_key_values, StaticCache) and attention_mask.ndim == 2: - if model_inputs["inputs_embeds"] is not None: - batch_size, sequence_length, _ = model_inputs["inputs_embeds"].shape - device = model_inputs["inputs_embeds"].device - else: - batch_size, sequence_length = model_inputs["input_ids"].shape - device = model_inputs["input_ids"].device - - attention_mask = self.model._prepare_4d_causal_attention_mask_with_cache_position( - attention_mask, - sequence_length=sequence_length, - target_length=past_key_values.get_max_cache_shape(), - dtype=self.lm_head.weight.dtype, - device=device, - cache_position=cache_position, - batch_size=batch_size, - config=self.config, - past_key_values=past_key_values, - ) - - if num_logits_to_keep is not None: - model_inputs["num_logits_to_keep"] = num_logits_to_keep - - model_inputs.update( - { - "position_ids": position_ids, - "cache_position": cache_position, - "past_key_values": past_key_values, - "use_cache": use_cache, - "attention_mask": attention_mask, - } - ) - return model_inputs - @add_start_docstrings( """ diff --git a/src/transformers/models/qwen2_audio/modeling_qwen2_audio.py b/src/transformers/models/qwen2_audio/modeling_qwen2_audio.py index 6422baac5feb5e..e923e535da8e34 100644 --- a/src/transformers/models/qwen2_audio/modeling_qwen2_audio.py +++ b/src/transformers/models/qwen2_audio/modeling_qwen2_audio.py @@ -1253,16 +1253,17 @@ def forward( attention_mask=attention_mask, ) - # Copied from transformers.models.llava_next.modeling_llava_next.LlavaNextForConditionalGeneration.prepare_inputs_for_generation with image->audio def prepare_inputs_for_generation( self, input_ids, past_key_values=None, inputs_embeds=None, - input_features=None, # Ignore copy + input_features=None, attention_mask=None, **kwargs, ): + # Overwritten -- custom processing (note: might not be needed, but there are no generation tests running atm) + if past_key_values is not None: if isinstance(past_key_values, Cache): cache_length = past_key_values.get_seq_length() @@ -1270,7 +1271,6 @@ def prepare_inputs_for_generation( else: cache_length = past_length = past_key_values[0][0].shape[2] - # Ignore copy # Here, we get the attention_mask, which was previously stored in the state after _merge_input_ids_with_audio_features. 
if input_features is not None and kwargs.get("attention_mask") is not None: attention_mask = kwargs["attention_mask"] @@ -1310,7 +1310,6 @@ def prepare_inputs_for_generation( else: model_inputs = {"input_ids": input_ids} - # Ignore copy feature_attention_mask = kwargs.get("feature_attention_mask", None) model_inputs.update( { diff --git a/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py b/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py index 1a5f6e2ff2fbdc..1e741b4a9e3e57 100644 --- a/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py +++ b/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py @@ -1433,79 +1433,6 @@ def forward( router_logits=outputs.router_logits, ) - # Copied from transformers.models.mistral.modeling_mistral.MistralForCausalLM.prepare_inputs_for_generation - def prepare_inputs_for_generation( - self, - input_ids, - past_key_values=None, - attention_mask=None, - inputs_embeds=None, - cache_position=None, - position_ids=None, - use_cache=True, - num_logits_to_keep=None, - **kwargs, - ): - # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens - # Exception 1: when passing input_embeds, input_ids may be missing entries - # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here - if past_key_values is not None: - if inputs_embeds is not None: # Exception 1 - input_ids = input_ids[:, -cache_position.shape[0] :] - elif input_ids.shape[1] != cache_position.shape[0]: # Default case (the "else", a no op, is Exception 2) - input_ids = input_ids[:, cache_position] - - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - position_ids = position_ids[:, -input_ids.shape[1] :] - - # This `clone` call is needed to avoid recapturing cuda graphs with `torch.compile`'s `mode="reduce-overhead`, as otherwise the input `position_ids` would have various stride during the decoding. Here, simply using `.contiguous()` is not sufficient as in the batch size = 1 case, `position_ids` is already contiguous but with varying stride which retriggers a capture. 
- position_ids = position_ids.clone(memory_format=torch.contiguous_format) - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and cache_position[0] == 0: - model_inputs = {"inputs_embeds": inputs_embeds, "input_ids": None} - else: - # `contiguous()` needed for compilation use cases - model_inputs = {"input_ids": input_ids.contiguous(), "inputs_embeds": None} - - if isinstance(past_key_values, StaticCache) and attention_mask.ndim == 2: - if model_inputs["inputs_embeds"] is not None: - batch_size, sequence_length, _ = model_inputs["inputs_embeds"].shape - device = model_inputs["inputs_embeds"].device - else: - batch_size, sequence_length = model_inputs["input_ids"].shape - device = model_inputs["input_ids"].device - - attention_mask = self.model._prepare_4d_causal_attention_mask_with_cache_position( - attention_mask, - sequence_length=sequence_length, - target_length=past_key_values.get_max_cache_shape(), - dtype=self.lm_head.weight.dtype, - device=device, - cache_position=cache_position, - batch_size=batch_size, - config=self.config, - past_key_values=past_key_values, - ) - - if num_logits_to_keep is not None: - model_inputs["num_logits_to_keep"] = num_logits_to_keep - - model_inputs.update( - { - "position_ids": position_ids, - "cache_position": cache_position, - "past_key_values": past_key_values, - "use_cache": use_cache, - "attention_mask": attention_mask, - } - ) - return model_inputs - @add_start_docstrings( """ diff --git a/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py b/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py index e014a6da6bb3bc..5464b40546498a 100644 --- a/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py +++ b/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py @@ -1803,6 +1803,8 @@ def prepare_inputs_for_generation( video_grid_thw=None, **kwargs, ): + # Overwritten -- in specific circumstances we don't want to forward image inputs to the model + # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens # Exception 1: when passing input_embeds, input_ids may be missing entries # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here diff --git a/src/transformers/models/seamless_m4t/modeling_seamless_m4t.py b/src/transformers/models/seamless_m4t/modeling_seamless_m4t.py index adc01ec40f9599..c5c3b202846705 100755 --- a/src/transformers/models/seamless_m4t/modeling_seamless_m4t.py +++ b/src/transformers/models/seamless_m4t/modeling_seamless_m4t.py @@ -2264,28 +2264,6 @@ def forward( encoder_attentions=outputs.encoder_attentions, ) - def prepare_inputs_for_generation( - self, - decoder_input_ids, - past_key_values=None, - attention_mask=None, - use_cache=None, - encoder_outputs=None, - **kwargs, - ): - # cut decoder_input_ids if past is used - if past_key_values is not None: - decoder_input_ids = decoder_input_ids[:, -1:] - - return { - "input_ids": None, # encoder_outputs is defined. 
input_ids not needed - "encoder_outputs": encoder_outputs, - "past_key_values": past_key_values, - "decoder_input_ids": decoder_input_ids, - "attention_mask": attention_mask, - "use_cache": use_cache, - } - def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor): return shift_tokens_right(labels, self.config.t2u_pad_token_id, self.config.t2u_decoder_start_token_id) @@ -2917,28 +2895,6 @@ def generate( **kwargs, ) - def prepare_inputs_for_generation( - self, - decoder_input_ids, - past_key_values=None, - attention_mask=None, - use_cache=None, - encoder_outputs=None, - **kwargs, - ): - # cut decoder_input_ids if past is used - if past_key_values is not None: - decoder_input_ids = decoder_input_ids[:, -1:] - - return { - "input_ids": None, # encoder_outputs is defined. input_ids not needed - "encoder_outputs": encoder_outputs, - "past_key_values": past_key_values, - "decoder_input_ids": decoder_input_ids, - "attention_mask": attention_mask, - "use_cache": use_cache, - } - @staticmethod def _reorder_cache(past_key_values, beam_idx): reordered_past = () @@ -3209,28 +3165,6 @@ def generate( **kwargs, ) - def prepare_inputs_for_generation( - self, - decoder_input_ids, - past_key_values=None, - attention_mask=None, - use_cache=None, - encoder_outputs=None, - **kwargs, - ): - # cut decoder_input_ids if past is used - if past_key_values is not None: - decoder_input_ids = decoder_input_ids[:, -1:] - - return { - "input_ids": None, # encoder_outputs is defined. input_ids not needed - "encoder_outputs": encoder_outputs, - "past_key_values": past_key_values, - "decoder_input_ids": decoder_input_ids, - "attention_mask": attention_mask, - "use_cache": use_cache, - } - @staticmethod def _reorder_cache(past_key_values, beam_idx): reordered_past = () @@ -3560,28 +3494,6 @@ def generate( return waveform, waveform_lengths - def prepare_inputs_for_generation( - self, - decoder_input_ids, - past_key_values=None, - attention_mask=None, - use_cache=None, - encoder_outputs=None, - **kwargs, - ): - # cut decoder_input_ids if past is used - if past_key_values is not None: - decoder_input_ids = decoder_input_ids[:, -1:] - - return { - "input_ids": None, # encoder_outputs is defined. input_ids not needed - "encoder_outputs": encoder_outputs, - "past_key_values": past_key_values, - "decoder_input_ids": decoder_input_ids, - "attention_mask": attention_mask, - "use_cache": use_cache, - } - @staticmethod def _reorder_cache(past_key_values, beam_idx): reordered_past = () @@ -3931,28 +3843,6 @@ def _reorder_cache(past_key_values, beam_idx): ) return reordered_past - def prepare_inputs_for_generation( - self, - decoder_input_ids, - past_key_values=None, - attention_mask=None, - use_cache=None, - encoder_outputs=None, - **kwargs, - ): - # cut decoder_input_ids if past is used - if past_key_values is not None: - decoder_input_ids = decoder_input_ids[:, -1:] - - return { - "input_ids": None, # encoder_outputs is defined. 
input_ids not needed - "encoder_outputs": encoder_outputs, - "past_key_values": past_key_values, - "decoder_input_ids": decoder_input_ids, - "attention_mask": attention_mask, - "use_cache": use_cache, - } - @add_start_docstrings( "The original SeamlessM4T Model transformer which can be used for every tasks available (S2ST, S2TT, T2TT, T2ST).", @@ -4385,28 +4275,6 @@ def generate( return waveform, waveform_lengths - def prepare_inputs_for_generation( - self, - decoder_input_ids, - past_key_values=None, - attention_mask=None, - use_cache=None, - encoder_outputs=None, - **kwargs, - ): - # cut decoder_input_ids if past is used - if past_key_values is not None: - decoder_input_ids = decoder_input_ids[:, -1:] - - return { - "input_ids": None, # encoder_outputs is defined. input_ids not needed - "encoder_outputs": encoder_outputs, - "past_key_values": past_key_values, - "decoder_input_ids": decoder_input_ids, - "attention_mask": attention_mask, - "use_cache": use_cache, - } - @staticmethod def _reorder_cache(past_key_values, beam_idx): reordered_past = () diff --git a/src/transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py b/src/transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py index 21265faa225127..a8068eb0ad01ea 100644 --- a/src/transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +++ b/src/transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py @@ -3175,28 +3175,6 @@ def generate( **kwargs, ) - def prepare_inputs_for_generation( - self, - decoder_input_ids, - past_key_values=None, - attention_mask=None, - use_cache=None, - encoder_outputs=None, - **kwargs, - ): - # cut decoder_input_ids if past is used - if past_key_values is not None: - decoder_input_ids = decoder_input_ids[:, -1:] - - return { - "input_ids": None, # encoder_outputs is defined. input_ids not needed - "encoder_outputs": encoder_outputs, - "past_key_values": past_key_values, - "decoder_input_ids": decoder_input_ids, - "attention_mask": attention_mask, - "use_cache": use_cache, - } - @staticmethod def _reorder_cache(past_key_values, beam_idx): reordered_past = () @@ -3477,29 +3455,6 @@ def generate( **kwargs, ) - # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForSpeechToText.prepare_inputs_for_generation - def prepare_inputs_for_generation( - self, - decoder_input_ids, - past_key_values=None, - attention_mask=None, - use_cache=None, - encoder_outputs=None, - **kwargs, - ): - # cut decoder_input_ids if past is used - if past_key_values is not None: - decoder_input_ids = decoder_input_ids[:, -1:] - - return { - "input_ids": None, # encoder_outputs is defined. 
input_ids not needed - "encoder_outputs": encoder_outputs, - "past_key_values": past_key_values, - "decoder_input_ids": decoder_input_ids, - "attention_mask": attention_mask, - "use_cache": use_cache, - } - @staticmethod # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForSpeechToText._reorder_cache def _reorder_cache(past_key_values, beam_idx): @@ -3871,29 +3826,6 @@ def generate( return waveform, waveform_lengths - # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForTextToSpeech.prepare_inputs_for_generation - def prepare_inputs_for_generation( - self, - decoder_input_ids, - past_key_values=None, - attention_mask=None, - use_cache=None, - encoder_outputs=None, - **kwargs, - ): - # cut decoder_input_ids if past is used - if past_key_values is not None: - decoder_input_ids = decoder_input_ids[:, -1:] - - return { - "input_ids": None, # encoder_outputs is defined. input_ids not needed - "encoder_outputs": encoder_outputs, - "past_key_values": past_key_values, - "decoder_input_ids": decoder_input_ids, - "attention_mask": attention_mask, - "use_cache": use_cache, - } - @staticmethod # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForTextToSpeech._reorder_cache def _reorder_cache(past_key_values, beam_idx): @@ -4285,29 +4217,6 @@ def _reorder_cache(past_key_values, beam_idx): ) return reordered_past - # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForSpeechToSpeech.prepare_inputs_for_generation - def prepare_inputs_for_generation( - self, - decoder_input_ids, - past_key_values=None, - attention_mask=None, - use_cache=None, - encoder_outputs=None, - **kwargs, - ): - # cut decoder_input_ids if past is used - if past_key_values is not None: - decoder_input_ids = decoder_input_ids[:, -1:] - - return { - "input_ids": None, # encoder_outputs is defined. input_ids not needed - "encoder_outputs": encoder_outputs, - "past_key_values": past_key_values, - "decoder_input_ids": decoder_input_ids, - "attention_mask": attention_mask, - "use_cache": use_cache, - } - @add_start_docstrings( "The original SeamlessM4Tv2 Model transformer which can be used for every tasks available (S2ST, S2TT, T2TT, T2ST).", @@ -4786,29 +4695,6 @@ def generate( return waveform, waveform_lengths - # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TModel.prepare_inputs_for_generation - def prepare_inputs_for_generation( - self, - decoder_input_ids, - past_key_values=None, - attention_mask=None, - use_cache=None, - encoder_outputs=None, - **kwargs, - ): - # cut decoder_input_ids if past is used - if past_key_values is not None: - decoder_input_ids = decoder_input_ids[:, -1:] - - return { - "input_ids": None, # encoder_outputs is defined. 
input_ids not needed - "encoder_outputs": encoder_outputs, - "past_key_values": past_key_values, - "decoder_input_ids": decoder_input_ids, - "attention_mask": attention_mask, - "use_cache": use_cache, - } - @staticmethod # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TModel._reorder_cache def _reorder_cache(past_key_values, beam_idx): diff --git a/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py b/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py index ef84a4fa5fbd2a..a1caa7cf6da2f7 100644 --- a/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py +++ b/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py @@ -21,6 +21,7 @@ from torch.nn import CrossEntropyLoss from ...configuration_utils import PretrainedConfig +from ...generation import GenerationMixin from ...modeling_outputs import BaseModelOutput, Seq2SeqLMOutput from ...modeling_utils import PreTrainedModel from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings @@ -169,7 +170,7 @@ def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start @add_start_docstrings(SPEECH_ENCODER_DECODER_START_DOCSTRING) -class SpeechEncoderDecoderModel(PreTrainedModel): +class SpeechEncoderDecoderModel(PreTrainedModel, GenerationMixin): r""" [`SpeechEncoderDecoderModel`] is a generic model class that will be instantiated as a transformer architecture with one of the base model classes of the library as encoder and another one as decoder when created with the @@ -574,20 +575,6 @@ def forward( def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor): return shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id) - def prepare_inputs_for_generation( - self, input_ids, past_key_values=None, attention_mask=None, use_cache=None, encoder_outputs=None, **kwargs - ): - decoder_inputs = self.decoder.prepare_inputs_for_generation(input_ids, past_key_values=past_key_values) - input_dict = { - "attention_mask": attention_mask, - "decoder_attention_mask": decoder_inputs.get("attention_mask"), - "decoder_input_ids": decoder_inputs["input_ids"], - "encoder_outputs": encoder_outputs, - "past_key_values": decoder_inputs.get("past_key_values"), - "use_cache": use_cache, - } - return input_dict - def resize_token_embeddings(self, *args, **kwargs): raise NotImplementedError( "Resizing the embedding layers via the SpeechEncoderDecoderModel directly is not supported. 
Please use the" diff --git a/src/transformers/models/speech_to_text/modeling_speech_to_text.py b/src/transformers/models/speech_to_text/modeling_speech_to_text.py index bdd532fa25e82a..aadc1da500ea64 100755 --- a/src/transformers/models/speech_to_text/modeling_speech_to_text.py +++ b/src/transformers/models/speech_to_text/modeling_speech_to_text.py @@ -1331,33 +1331,6 @@ def forward( encoder_attentions=outputs.encoder_attentions, ) - def prepare_inputs_for_generation( - self, - decoder_input_ids, - past_key_values=None, - attention_mask=None, - head_mask=None, - decoder_head_mask=None, - cross_attn_head_mask=None, - use_cache=None, - encoder_outputs=None, - **kwargs, - ): - # cut decoder_input_ids if past is used - if past_key_values is not None: - decoder_input_ids = decoder_input_ids[:, -1:] - - return { - "encoder_outputs": encoder_outputs, - "past_key_values": past_key_values, - "decoder_input_ids": decoder_input_ids, - "attention_mask": attention_mask, - "head_mask": head_mask, - "decoder_head_mask": decoder_head_mask, - "cross_attn_head_mask": cross_attn_head_mask, - "use_cache": use_cache, # change this to avoid caching (presumably for debugging) - } - @staticmethod def _reorder_cache(past_key_values, beam_idx): reordered_past = () diff --git a/src/transformers/models/speecht5/modeling_speecht5.py b/src/transformers/models/speecht5/modeling_speecht5.py index dbe57c01d9839e..63b536d185a379 100644 --- a/src/transformers/models/speecht5/modeling_speecht5.py +++ b/src/transformers/models/speecht5/modeling_speecht5.py @@ -2425,6 +2425,8 @@ def prepare_inputs_for_generation( encoder_outputs=None, **kwargs, ): + # Note that this model doesn't inherit from the generation mixin, has unique generate function + # cut decoder_input_ids if past is used if past_key_values is not None: past_length = past_key_values[0][0].shape[2] diff --git a/src/transformers/models/stablelm/modeling_stablelm.py b/src/transformers/models/stablelm/modeling_stablelm.py index fe3ad6498172a9..9b445596f6578f 100755 --- a/src/transformers/models/stablelm/modeling_stablelm.py +++ b/src/transformers/models/stablelm/modeling_stablelm.py @@ -1075,6 +1075,7 @@ def _prepare_4d_causal_attention_mask_with_cache_position( device: torch.device, cache_position: torch.Tensor, batch_size: int, + **kwargs, ): """ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape diff --git a/src/transformers/models/starcoder2/modeling_starcoder2.py b/src/transformers/models/starcoder2/modeling_starcoder2.py index e0fdbef1a3baf5..66d36d6db7ce7b 100644 --- a/src/transformers/models/starcoder2/modeling_starcoder2.py +++ b/src/transformers/models/starcoder2/modeling_starcoder2.py @@ -1204,79 +1204,6 @@ def forward( attentions=outputs.attentions, ) - # Copied from transformers.models.mistral.modeling_mistral.MistralForCausalLM.prepare_inputs_for_generation - def prepare_inputs_for_generation( - self, - input_ids, - past_key_values=None, - attention_mask=None, - inputs_embeds=None, - cache_position=None, - position_ids=None, - use_cache=True, - num_logits_to_keep=None, - **kwargs, - ): - # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens - # Exception 1: when passing input_embeds, input_ids may be missing entries - # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here - if past_key_values is not None: - if inputs_embeds is not None: # Exception 1 - input_ids = input_ids[:, -cache_position.shape[0] 
:] - elif input_ids.shape[1] != cache_position.shape[0]: # Default case (the "else", a no op, is Exception 2) - input_ids = input_ids[:, cache_position] - - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - position_ids = position_ids[:, -input_ids.shape[1] :] - - # This `clone` call is needed to avoid recapturing cuda graphs with `torch.compile`'s `mode="reduce-overhead`, as otherwise the input `position_ids` would have various stride during the decoding. Here, simply using `.contiguous()` is not sufficient as in the batch size = 1 case, `position_ids` is already contiguous but with varying stride which retriggers a capture. - position_ids = position_ids.clone(memory_format=torch.contiguous_format) - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and cache_position[0] == 0: - model_inputs = {"inputs_embeds": inputs_embeds, "input_ids": None} - else: - # `contiguous()` needed for compilation use cases - model_inputs = {"input_ids": input_ids.contiguous(), "inputs_embeds": None} - - if isinstance(past_key_values, StaticCache) and attention_mask.ndim == 2: - if model_inputs["inputs_embeds"] is not None: - batch_size, sequence_length, _ = model_inputs["inputs_embeds"].shape - device = model_inputs["inputs_embeds"].device - else: - batch_size, sequence_length = model_inputs["input_ids"].shape - device = model_inputs["input_ids"].device - - attention_mask = self.model._prepare_4d_causal_attention_mask_with_cache_position( - attention_mask, - sequence_length=sequence_length, - target_length=past_key_values.get_max_cache_shape(), - dtype=self.lm_head.weight.dtype, - device=device, - cache_position=cache_position, - batch_size=batch_size, - config=self.config, - past_key_values=past_key_values, - ) - - if num_logits_to_keep is not None: - model_inputs["num_logits_to_keep"] = num_logits_to_keep - - model_inputs.update( - { - "position_ids": position_ids, - "cache_position": cache_position, - "past_key_values": past_key_values, - "use_cache": use_cache, - "attention_mask": attention_mask, - } - ) - return model_inputs - @add_start_docstrings( """ diff --git a/src/transformers/models/udop/modeling_udop.py b/src/transformers/models/udop/modeling_udop.py index c621b742323db2..6be8752d5b63b0 100644 --- a/src/transformers/models/udop/modeling_udop.py +++ b/src/transformers/models/udop/modeling_udop.py @@ -1869,36 +1869,6 @@ def forward( encoder_attentions=encoder_outputs.attentions, ) - def prepare_inputs_for_generation( - self, - input_ids, - past_key_values=None, - attention_mask=None, - head_mask=None, - decoder_head_mask=None, - cross_attn_head_mask=None, - use_cache=None, - encoder_outputs=None, - **kwargs, - ): - # cut decoder_input_ids if past is used - if past_key_values is not None: - input_ids = input_ids[:, -1:] - - return { - "decoder_input_ids": input_ids, - "past_key_values": past_key_values, - "encoder_outputs": encoder_outputs, - "attention_mask": attention_mask, - "head_mask": head_mask, - "decoder_head_mask": decoder_head_mask, - "cross_attn_head_mask": cross_attn_head_mask, - "use_cache": use_cache, - "bbox": kwargs.get("bbox", None), - "pixel_values": kwargs.get("pixel_values", None), - "visual_bbox": kwargs.get("visual_bbox", None), - } - # Copied from 
transformers.models.t5.modeling_t5.T5ForConditionalGeneration._reorder_cache def _reorder_cache(self, past_key_values, beam_idx): # if decoder past is not included in output diff --git a/src/transformers/models/video_llava/modeling_video_llava.py b/src/transformers/models/video_llava/modeling_video_llava.py index 20fa0166b80c9c..c9703d263e7d20 100644 --- a/src/transformers/models/video_llava/modeling_video_llava.py +++ b/src/transformers/models/video_llava/modeling_video_llava.py @@ -707,6 +707,8 @@ def prepare_inputs_for_generation( num_logits_to_keep=None, **kwargs, ): + # Overwritten -- in specific circumstances we don't want to forward image inputs to the model + if input_ids is not None: img_token_not_enough = (input_ids == self.config.image_token_index).sum( 1 diff --git a/src/transformers/models/vipllava/modeling_vipllava.py b/src/transformers/models/vipllava/modeling_vipllava.py index 76348228476757..3af32a9caace0e 100644 --- a/src/transformers/models/vipllava/modeling_vipllava.py +++ b/src/transformers/models/vipllava/modeling_vipllava.py @@ -581,6 +581,8 @@ def prepare_inputs_for_generation( num_logits_to_keep=None, **kwargs, ): + # Overwritten -- in specific circumstances we don't want to forward image inputs to the model + # Trigger the new behavior if we have more than image embeddings seq length tokens for images legacy_processing = ( input_ids is not None diff --git a/src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py b/src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py index 0c3cd95adbf878..b044dda300ab48 100644 --- a/src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py +++ b/src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py @@ -24,6 +24,7 @@ from torch.nn import CrossEntropyLoss from ...configuration_utils import PretrainedConfig +from ...generation import GenerationMixin from ...modeling_outputs import BaseModelOutput, Seq2SeqLMOutput from ...modeling_utils import PreTrainedModel from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings @@ -147,7 +148,7 @@ def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start @add_start_docstrings(VISION_ENCODER_DECODER_START_DOCSTRING) -class VisionEncoderDecoderModel(PreTrainedModel): +class VisionEncoderDecoderModel(PreTrainedModel, GenerationMixin): r""" [`VisionEncoderDecoderModel`] is a generic model class that will be instantiated as a transformer architecture with one of the base vision model classes of the library as encoder and another one as decoder when created with the @@ -654,20 +655,6 @@ def forward( def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor): return shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id) - def prepare_inputs_for_generation( - self, input_ids, past_key_values=None, attention_mask=None, use_cache=None, encoder_outputs=None, **kwargs - ): - decoder_inputs = self.decoder.prepare_inputs_for_generation(input_ids, past_key_values=past_key_values) - input_dict = { - "attention_mask": attention_mask, - "decoder_attention_mask": decoder_inputs.get("attention_mask"), - "decoder_input_ids": decoder_inputs["input_ids"], - "encoder_outputs": encoder_outputs, - "past_key_values": decoder_inputs.get("past_key_values"), - "use_cache": use_cache, - } - return input_dict - def resize_token_embeddings(self, *args, **kwargs): raise NotImplementedError( "Resizing 
the embedding layers via the VisionEncoderDecoderModel directly is not supported.Please use the" diff --git a/src/transformers/models/whisper/modeling_whisper.py b/src/transformers/models/whisper/modeling_whisper.py index 079965fc174a63..ce3df3e16707e5 100644 --- a/src/transformers/models/whisper/modeling_whisper.py +++ b/src/transformers/models/whisper/modeling_whisper.py @@ -1442,6 +1442,7 @@ def _prepare_4d_causal_attention_mask_with_cache_position( device: torch.device, cache_position: torch.Tensor, batch_size: int, + **kwargs, ): """ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape @@ -1817,6 +1818,10 @@ def prepare_inputs_for_generation( cache_position=None, **kwargs, ): + # Overwritten -- encoder-decoder whisper has custom logic, but it's close to the general function. Next time + # this function needs to be touched, let's try to sort out the commonalities between the two and remove the + # overwrite. + decoder_position_ids = None if decoder_attention_mask is not None: decoder_position_ids = (decoder_attention_mask.cumsum(-1) - 1).clamp(min=0) @@ -2092,46 +2097,6 @@ def forward( cross_attentions=outputs.cross_attentions, ) - def prepare_inputs_for_generation( - self, - input_ids, - past_key_values=None, - use_cache=None, - encoder_outputs=None, - attention_mask=None, - cache_position=None, - **kwargs, - ): - past_length = 0 - if past_key_values is not None: - if isinstance(past_key_values, (Cache, EncoderDecoderCache)): - past_length = cache_position[0] if cache_position is not None else past_key_values.get_seq_length() - else: - past_length = past_key_values[0][0].shape[2] - - # Some generation methods already pass only the last input ID - if input_ids.shape[1] > past_length: - remove_prefix_length = past_length - else: - # Default to old behavior: keep only final ID - remove_prefix_length = input_ids.shape[1] - 1 - - input_ids = input_ids[:, remove_prefix_length:] - - if cache_position is None: - cache_position = torch.arange(past_length, past_length + input_ids.shape[1], device=input_ids.device) - elif use_cache: - cache_position = cache_position[-input_ids.shape[1] :] - - return { - "encoder_outputs": encoder_outputs, - "past_key_values": past_key_values, - "input_ids": input_ids, - "use_cache": use_cache, - "attention_mask": attention_mask, - "cache_position": cache_position, - } - @staticmethod def _reorder_cache(past_key_values, beam_idx): reordered_past = () diff --git a/utils/check_copies.py b/utils/check_copies.py index 4bb5c6fef4eeb7..ed6a4f68b577f0 100644 --- a/utils/check_copies.py +++ b/utils/check_copies.py @@ -672,9 +672,13 @@ def is_copy_consistent(filename: str, overwrite: bool = False, buffer: dict = No indent, object_name, replace_pattern = search.groups() # Find the file lines, the object's code, and its blocks - target_lines, theoretical_code, theoretical_code_splits = find_code_and_splits( - object_name, base_path, buffer=buffer - ) + try: + target_lines, theoretical_code, theoretical_code_splits = find_code_and_splits( + object_name, base_path, buffer=buffer + ) + except Exception as exc: + exc.args = (f"Error while trying to find source code for {filename}.\n\n" + str(exc),) + raise # code replaced by the patterns theoretical_code_blocks = OrderedDict() From c1c7e89620b43f0924148e267a6da5d38450ce1f Mon Sep 17 00:00:00 2001 From: Arthur <48595927+ArthurZucker@users.noreply.github.com> Date: Thu, 17 Oct 2024 22:34:40 +0200 Subject: [PATCH 044/385] Fix Gradient Accumulation issue (#34191) * quick fix * 
3 losses * oups * fix * nits * check how it scales for special models * propagate for conditiona detr * propagate * propagate * propagate * fixes * propagate changes * update * fixup * nits * f string * fixes * more fixes * ? * nit * arg annoying f string * nits * grumble * update * nit * refactor * fix fetch tests * nit * nit * Update src/transformers/loss/loss_utils.py Co-authored-by: Kashif Rasul * update * nit * fixup * make pass * nits * port code to more models * fixup * ntis * arf * update * update * nits * update * fix * update * nits * fine * agjkfslga.jsdlkgjklas * nits * fix fx? * update * update * styel * fix imports * update * update * fixup to fix the torch fx? --------- Co-authored-by: Kashif Rasul --- src/transformers/__init__.py | 2 + src/transformers/configuration_utils.py | 3 + src/transformers/loss/__init__.py | 13 + src/transformers/loss/loss_deformable_detr.py | 178 +++++ .../loss/loss_for_object_detection.py | 562 +++++++++++++++ src/transformers/loss/loss_rt_detr.py | 463 +++++++++++++ src/transformers/loss/loss_utils.py | 114 +++ src/transformers/modeling_utils.py | 25 +- .../models/cohere/modeling_cohere.py | 14 +- .../modeling_conditional_detr.py | 592 +--------------- .../modeling_deformable_detr.py | 533 +------------- src/transformers/models/detr/modeling_detr.py | 583 +--------------- .../models/gemma/modeling_gemma.py | 40 +- .../models/gemma/modular_gemma.py | 14 +- .../models/gemma2/modeling_gemma2.py | 40 +- .../models/gemma2/modular_gemma2.py | 14 +- .../grounding_dino/modeling_grounding_dino.py | 480 +------------ .../models/jamba/modeling_jamba.py | 37 +- .../models/jetmoe/modeling_jetmoe.py | 25 +- .../models/llama/modeling_llama.py | 63 +- .../models/mistral/modeling_mistral.py | 28 +- .../models/mixtral/modeling_mixtral.py | 40 +- .../models/mllama/modeling_mllama.py | 14 +- .../models/nemotron/modeling_nemotron.py | 62 +- src/transformers/models/olmo/modeling_olmo.py | 14 +- .../models/olmoe/modeling_olmoe.py | 14 +- .../models/owlv2/modeling_owlv2.py | 8 +- .../models/owlvit/modeling_owlvit.py | 8 +- .../models/persimmon/modeling_persimmon.py | 28 +- src/transformers/models/phi/modeling_phi.py | 38 +- src/transformers/models/phi3/modeling_phi3.py | 38 +- .../models/phimoe/modeling_phimoe.py | 37 +- .../models/qwen2/modeling_qwen2.py | 16 +- .../models/qwen2_moe/modeling_qwen2_moe.py | 40 +- .../models/rt_detr/modeling_rt_detr.py | 651 +----------------- .../models/stablelm/modeling_stablelm.py | 28 +- .../models/starcoder2/modeling_starcoder2.py | 28 +- .../modeling_table_transformer.py | 503 +------------- .../models/yolos/modeling_yolos.py | 543 +-------------- .../models/zamba/modeling_zamba.py | 13 +- utils/check_config_attributes.py | 51 ++ 41 files changed, 1652 insertions(+), 4345 deletions(-) create mode 100644 src/transformers/loss/__init__.py create mode 100644 src/transformers/loss/loss_deformable_detr.py create mode 100644 src/transformers/loss/loss_for_object_detection.py create mode 100644 src/transformers/loss/loss_rt_detr.py create mode 100644 src/transformers/loss/loss_utils.py diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 50400ed6c4e944..e48b2599d4c298 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -142,7 +142,9 @@ "is_tensorboard_available", "is_wandb_available", ], + "loss": [], "modelcard": ["ModelCard"], + # Losses "modeling_tf_pytorch_utils": [ "convert_tf_weight_name_to_pt_weight_name", "load_pytorch_checkpoint_in_tf2_model", diff --git 
a/src/transformers/configuration_utils.py b/src/transformers/configuration_utils.py
index 3ea48f6ab7f707..8bc08ca625961e 100755
--- a/src/transformers/configuration_utils.py
+++ b/src/transformers/configuration_utils.py
@@ -184,6 +184,9 @@ class PretrainedConfig(PushToHubMixin):
             Whether the model should use legacy TensorFlow losses. Legacy losses have variable output shapes and may
             not be XLA-compatible. This option is here for backward compatibility and will be removed in Transformers
             v5.
+        loss_type (`str`, *optional*):
+            The type of loss that the model should use. It should be in `LOSS_MAPPING`'s keys; otherwise the loss will
+            be automatically inferred from the model architecture.
     """
 
     model_type: str = ""
diff --git a/src/transformers/loss/__init__.py b/src/transformers/loss/__init__.py
new file mode 100644
index 00000000000000..196860c9f1c605
--- /dev/null
+++ b/src/transformers/loss/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/src/transformers/loss/loss_deformable_detr.py b/src/transformers/loss/loss_deformable_detr.py
new file mode 100644
index 00000000000000..62080bcb3fd94f
--- /dev/null
+++ b/src/transformers/loss/loss_deformable_detr.py
@@ -0,0 +1,178 @@
+import torch
+import torch.nn as nn
+
+from ..image_transforms import center_to_corners_format
+from ..utils import is_scipy_available
+from .loss_for_object_detection import (
+    HungarianMatcher,
+    ImageLoss,
+    _set_aux_loss,
+    generalized_box_iou,
+    sigmoid_focal_loss,
+)
+
+
+if is_scipy_available():
+    from scipy.optimize import linear_sum_assignment
+
+
+class DeformableDetrHungarianMatcher(HungarianMatcher):
+    @torch.no_grad()
+    def forward(self, outputs, targets):
+        """
+        Differences:
+        - out_prob = outputs["logits"].flatten(0, 1).sigmoid() instead of softmax
+        - class_cost uses alpha and gamma
+        """
+        batch_size, num_queries = outputs["logits"].shape[:2]
+
+        # We flatten to compute the cost matrices in a batch
+        out_prob = outputs["logits"].flatten(0, 1).sigmoid()  # [batch_size * num_queries, num_classes]
+        out_bbox = outputs["pred_boxes"].flatten(0, 1)  # [batch_size * num_queries, 4]
+
+        # Also concat the target labels and boxes
+        target_ids = torch.cat([v["class_labels"] for v in targets])
+        target_bbox = torch.cat([v["boxes"] for v in targets])
+
+        # Compute the classification cost.
+ alpha = 0.25 + gamma = 2.0 + neg_cost_class = (1 - alpha) * (out_prob**gamma) * (-(1 - out_prob + 1e-8).log()) + pos_cost_class = alpha * ((1 - out_prob) ** gamma) * (-(out_prob + 1e-8).log()) + class_cost = pos_cost_class[:, target_ids] - neg_cost_class[:, target_ids] + + # Compute the L1 cost between boxes + bbox_cost = torch.cdist(out_bbox, target_bbox, p=1) + + # Compute the giou cost between boxes + giou_cost = -generalized_box_iou(center_to_corners_format(out_bbox), center_to_corners_format(target_bbox)) + + # Final cost matrix + cost_matrix = self.bbox_cost * bbox_cost + self.class_cost * class_cost + self.giou_cost * giou_cost + cost_matrix = cost_matrix.view(batch_size, num_queries, -1).cpu() + + sizes = [len(v["boxes"]) for v in targets] + indices = [linear_sum_assignment(c[i]) for i, c in enumerate(cost_matrix.split(sizes, -1))] + return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices] + + +class DeformableDetrImageLoss(ImageLoss): + def __init__(self, matcher, num_classes, focal_alpha, losses): + nn.Module.__init__(self) + self.matcher = matcher + self.num_classes = num_classes + self.focal_alpha = focal_alpha + self.losses = losses + + # removed logging parameter, which was part of the original implementation + def loss_labels(self, outputs, targets, indices, num_boxes): + """ + Classification loss (Binary focal loss) targets dicts must contain the key "class_labels" containing a tensor + of dim [nb_target_boxes] + """ + if "logits" not in outputs: + raise KeyError("No logits were found in the outputs") + source_logits = outputs["logits"] + + idx = self._get_source_permutation_idx(indices) + target_classes_o = torch.cat([t["class_labels"][J] for t, (_, J) in zip(targets, indices)]) + target_classes = torch.full( + source_logits.shape[:2], self.num_classes, dtype=torch.int64, device=source_logits.device + ) + target_classes[idx] = target_classes_o + + target_classes_onehot = torch.zeros( + [source_logits.shape[0], source_logits.shape[1], source_logits.shape[2] + 1], + dtype=source_logits.dtype, + layout=source_logits.layout, + device=source_logits.device, + ) + target_classes_onehot.scatter_(2, target_classes.unsqueeze(-1), 1) + + target_classes_onehot = target_classes_onehot[:, :, :-1] + loss_ce = ( + sigmoid_focal_loss(source_logits, target_classes_onehot, num_boxes, alpha=self.focal_alpha, gamma=2) + * source_logits.shape[1] + ) + losses = {"loss_ce": loss_ce} + + return losses + + +def DeformableDetrForSegmentationLoss( + logits, labels, device, pred_boxes, pred_masks, config, outputs_class=None, outputs_coord=None, **kwargs +): + # First: create the matcher + matcher = HungarianMatcher(class_cost=config.class_cost, bbox_cost=config.bbox_cost, giou_cost=config.giou_cost) + # Second: create the criterion + losses = ["labels", "boxes", "cardinality", "masks"] + criterion = DeformableDetrImageLoss( + matcher=matcher, + num_classes=config.num_labels, + focal_alpha=config.focal_alpha, + losses=losses, + ) + criterion.to(device) + # Third: compute the losses, based on outputs and labels + outputs_loss = {} + outputs_loss["logits"] = logits + outputs_loss["pred_boxes"] = pred_boxes + outputs_loss["pred_masks"] = pred_masks + + auxiliary_outputs = None + if config.auxiliary_loss: + auxiliary_outputs = _set_aux_loss(outputs_class, outputs_coord) + outputs_loss["auxiliary_outputs"] = auxiliary_outputs + + loss_dict = criterion(outputs_loss, labels) + # Fourth: compute total loss, as a weighted sum of the various losses + 
weight_dict = {"loss_ce": 1, "loss_bbox": config.bbox_loss_coefficient} + weight_dict["loss_giou"] = config.giou_loss_coefficient + weight_dict["loss_mask"] = config.mask_loss_coefficient + weight_dict["loss_dice"] = config.dice_loss_coefficient + if config.auxiliary_loss: + aux_weight_dict = {} + for i in range(config.decoder_layers - 1): + aux_weight_dict.update({k + f"_{i}": v for k, v in weight_dict.items()}) + weight_dict.update(aux_weight_dict) + + loss = sum(loss_dict[k] * weight_dict[k] for k in loss_dict.keys() if k in weight_dict) + return loss, loss_dict, auxiliary_outputs + + +def DeformableDetrForObjectDetectionLoss( + logits, labels, device, pred_boxes, config, outputs_class=None, outputs_coord=None, **kwargs +): + # First: create the matcher + matcher = DeformableDetrHungarianMatcher( + class_cost=config.class_cost, bbox_cost=config.bbox_cost, giou_cost=config.giou_cost + ) + # Second: create the criterion + losses = ["labels", "boxes", "cardinality"] + criterion = DeformableDetrImageLoss( + matcher=matcher, + num_classes=config.num_labels, + focal_alpha=config.focal_alpha, + losses=losses, + ) + criterion.to(device) + # Third: compute the losses, based on outputs and labels + outputs_loss = {} + auxiliary_outputs = None + outputs_loss["logits"] = logits + outputs_loss["pred_boxes"] = pred_boxes + if config.auxiliary_loss: + auxiliary_outputs = _set_aux_loss(outputs_class, outputs_coord) + outputs_loss["auxiliary_outputs"] = auxiliary_outputs + + loss_dict = criterion(outputs_loss, labels) + # Fourth: compute total loss, as a weighted sum of the various losses + weight_dict = {"loss_ce": 1, "loss_bbox": config.bbox_loss_coefficient} + weight_dict["loss_giou"] = config.giou_loss_coefficient + if config.auxiliary_loss: + aux_weight_dict = {} + for i in range(config.decoder_layers - 1): + aux_weight_dict.update({k + f"_{i}": v for k, v in weight_dict.items()}) + weight_dict.update(aux_weight_dict) + loss = sum(loss_dict[k] * weight_dict[k] for k in loss_dict.keys() if k in weight_dict) + return loss, loss_dict, auxiliary_outputs diff --git a/src/transformers/loss/loss_for_object_detection.py b/src/transformers/loss/loss_for_object_detection.py new file mode 100644 index 00000000000000..b820f6daed1224 --- /dev/null +++ b/src/transformers/loss/loss_for_object_detection.py @@ -0,0 +1,562 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
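Before the shared object-detection helpers defined in loss_for_object_detection.py below, here is a minimal standalone sketch of the focal-style class cost that DeformableDetrHungarianMatcher computes above. It is not part of the patch; the tensors and shapes are made up purely for illustration.

    # Toy reproduction of the pos/neg focal class-cost terms from DeformableDetrHungarianMatcher.forward
    import torch

    out_prob = torch.rand(6, 4)        # stand-in for sigmoid class probabilities: 6 queries, 4 classes
    target_ids = torch.tensor([1, 3])  # ground-truth classes of the 2 target boxes
    alpha, gamma = 0.25, 2.0

    neg_cost_class = (1 - alpha) * (out_prob**gamma) * (-(1 - out_prob + 1e-8).log())
    pos_cost_class = alpha * ((1 - out_prob) ** gamma) * (-(out_prob + 1e-8).log())
    class_cost = pos_cost_class[:, target_ids] - neg_cost_class[:, target_ids]
    print(class_cost.shape)            # torch.Size([6, 2]): one cost per (query, target) pair

The cost drops as the predicted probability for the matched class rises, so the Hungarian assignment prefers confident, correct predictions.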
+from typing import List, Optional + +import torch +import torch.nn as nn +from torch import Tensor + +from ..utils import is_accelerate_available, is_scipy_available, is_vision_available, requires_backends + + +if is_accelerate_available(): + from accelerate import PartialState + from accelerate.utils import reduce + +if is_scipy_available(): + from scipy.optimize import linear_sum_assignment + + +if is_vision_available(): + from transformers.image_transforms import center_to_corners_format + + +def dice_loss(inputs, targets, num_boxes): + """ + Compute the DICE loss, similar to generalized IOU for masks + + Args: + inputs: A float tensor of arbitrary shape. + The predictions for each example. + targets: A float tensor with the same shape as inputs. Stores the binary + classification label for each element in inputs (0 for the negative class and 1 for the positive + class). + """ + inputs = inputs.sigmoid() + inputs = inputs.flatten(1) + numerator = 2 * (inputs * targets).sum(1) + denominator = inputs.sum(-1) + targets.sum(-1) + loss = 1 - (numerator + 1) / (denominator + 1) + return loss.sum() / num_boxes + + +def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: float = 2): + """ + Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002. + + Args: + inputs (`torch.FloatTensor` of arbitrary shape): + The predictions for each example. + targets (`torch.FloatTensor` with the same shape as `inputs`) + A tensor storing the binary classification label for each element in the `inputs` (0 for the negative class + and 1 for the positive class). + alpha (`float`, *optional*, defaults to `0.25`): + Optional weighting factor in the range (0,1) to balance positive vs. negative examples. + gamma (`int`, *optional*, defaults to `2`): + Exponent of the modulating factor (1 - p_t) to balance easy vs hard examples. + + Returns: + Loss tensor + """ + prob = inputs.sigmoid() + ce_loss = nn.functional.binary_cross_entropy_with_logits(inputs, targets, reduction="none") + # add modulating factor + p_t = prob * targets + (1 - prob) * (1 - targets) + loss = ce_loss * ((1 - p_t) ** gamma) + + if alpha >= 0: + alpha_t = alpha * targets + (1 - alpha) * (1 - targets) + loss = alpha_t * loss + + return loss.mean(1).sum() / num_boxes + + +# taken from https://github.com/facebookresearch/detr/blob/master/models/detr.py +class ImageLoss(nn.Module): + """ + This class computes the losses for DetrForObjectDetection/DetrForSegmentation. The process happens in two steps: 1) + we compute hungarian assignment between ground truth boxes and the outputs of the model 2) we supervise each pair + of matched ground-truth / prediction (supervise class and box). + + A note on the `num_classes` argument (copied from original repo in detr.py): "the naming of the `num_classes` + parameter of the criterion is somewhat misleading. It indeed corresponds to `max_obj_id` + 1, where `max_obj_id` is + the maximum id for a class in your dataset. For example, COCO has a `max_obj_id` of 90, so we pass `num_classes` to + be 91. As another example, for a dataset that has a single class with `id` 1, you should pass `num_classes` to be 2 + (`max_obj_id` + 1). For more details on this, check the following discussion + https://github.com/facebookresearch/detr/issues/108#issuecomment-650269223" + + + Args: + matcher (`DetrHungarianMatcher`): + Module able to compute a matching between targets and proposals. + num_classes (`int`): + Number of object categories, omitting the special no-object category. 
+ eos_coef (`float`): + Relative classification weight applied to the no-object category. + losses (`List[str]`): + List of all the losses to be applied. See `get_loss` for a list of all available losses. + """ + + def __init__(self, matcher, num_classes, eos_coef, losses): + super().__init__() + self.matcher = matcher + self.num_classes = num_classes + self.eos_coef = eos_coef + self.losses = losses + empty_weight = torch.ones(self.num_classes + 1) + empty_weight[-1] = self.eos_coef + self.register_buffer("empty_weight", empty_weight) + + # removed logging parameter, which was part of the original implementation + def loss_labels(self, outputs, targets, indices, num_boxes): + """ + Classification loss (NLL) targets dicts must contain the key "class_labels" containing a tensor of dim + [nb_target_boxes] + """ + if "logits" not in outputs: + raise KeyError("No logits were found in the outputs") + source_logits = outputs["logits"] + + idx = self._get_source_permutation_idx(indices) + target_classes_o = torch.cat([t["class_labels"][J] for t, (_, J) in zip(targets, indices)]) + target_classes = torch.full( + source_logits.shape[:2], self.num_classes, dtype=torch.int64, device=source_logits.device + ) + target_classes[idx] = target_classes_o + + loss_ce = nn.functional.cross_entropy(source_logits.transpose(1, 2), target_classes, self.empty_weight) + losses = {"loss_ce": loss_ce} + + return losses + + @torch.no_grad() + def loss_cardinality(self, outputs, targets, indices, num_boxes): + """ + Compute the cardinality error, i.e. the absolute error in the number of predicted non-empty boxes. + + This is not really a loss, it is intended for logging purposes only. It doesn't propagate gradients. + """ + logits = outputs["logits"] + device = logits.device + target_lengths = torch.as_tensor([len(v["class_labels"]) for v in targets], device=device) + # Count the number of predictions that are NOT "no-object" (which is the last class) + card_pred = (logits.argmax(-1) != logits.shape[-1] - 1).sum(1) + card_err = nn.functional.l1_loss(card_pred.float(), target_lengths.float()) + losses = {"cardinality_error": card_err} + return losses + + def loss_boxes(self, outputs, targets, indices, num_boxes): + """ + Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss. + + Targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4]. The target boxes + are expected in format (center_x, center_y, w, h), normalized by the image size. + """ + if "pred_boxes" not in outputs: + raise KeyError("No predicted boxes found in outputs") + idx = self._get_source_permutation_idx(indices) + source_boxes = outputs["pred_boxes"][idx] + target_boxes = torch.cat([t["boxes"][i] for t, (_, i) in zip(targets, indices)], dim=0) + + loss_bbox = nn.functional.l1_loss(source_boxes, target_boxes, reduction="none") + + losses = {} + losses["loss_bbox"] = loss_bbox.sum() / num_boxes + + loss_giou = 1 - torch.diag( + generalized_box_iou(center_to_corners_format(source_boxes), center_to_corners_format(target_boxes)) + ) + losses["loss_giou"] = loss_giou.sum() / num_boxes + return losses + + def loss_masks(self, outputs, targets, indices, num_boxes): + """ + Compute the losses related to the masks: the focal loss and the dice loss. + + Targets dicts must contain the key "masks" containing a tensor of dim [nb_target_boxes, h, w]. 
+ """ + if "pred_masks" not in outputs: + raise KeyError("No predicted masks found in outputs") + + source_idx = self._get_source_permutation_idx(indices) + target_idx = self._get_target_permutation_idx(indices) + source_masks = outputs["pred_masks"] + source_masks = source_masks[source_idx] + masks = [t["masks"] for t in targets] + # TODO use valid to mask invalid areas due to padding in loss + target_masks, valid = nested_tensor_from_tensor_list(masks).decompose() + target_masks = target_masks.to(source_masks) + target_masks = target_masks[target_idx] + + # upsample predictions to the target size + source_masks = nn.functional.interpolate( + source_masks[:, None], size=target_masks.shape[-2:], mode="bilinear", align_corners=False + ) + source_masks = source_masks[:, 0].flatten(1) + + target_masks = target_masks.flatten(1) + target_masks = target_masks.view(source_masks.shape) + losses = { + "loss_mask": sigmoid_focal_loss(source_masks, target_masks, num_boxes), + "loss_dice": dice_loss(source_masks, target_masks, num_boxes), + } + return losses + + def _get_source_permutation_idx(self, indices): + # permute predictions following indices + batch_idx = torch.cat([torch.full_like(source, i) for i, (source, _) in enumerate(indices)]) + source_idx = torch.cat([source for (source, _) in indices]) + return batch_idx, source_idx + + def _get_target_permutation_idx(self, indices): + # permute targets following indices + batch_idx = torch.cat([torch.full_like(target, i) for i, (_, target) in enumerate(indices)]) + target_idx = torch.cat([target for (_, target) in indices]) + return batch_idx, target_idx + + def get_loss(self, loss, outputs, targets, indices, num_boxes): + loss_map = { + "labels": self.loss_labels, + "cardinality": self.loss_cardinality, + "boxes": self.loss_boxes, + "masks": self.loss_masks, + } + if loss not in loss_map: + raise ValueError(f"Loss {loss} not supported") + return loss_map[loss](outputs, targets, indices, num_boxes) + + def forward(self, outputs, targets): + """ + This performs the loss computation. + + Args: + outputs (`dict`, *optional*): + Dictionary of tensors, see the output specification of the model for the format. + targets (`List[dict]`, *optional*): + List of dicts, such that `len(targets) == batch_size`. The expected keys in each dict depends on the + losses applied, see each loss' doc. + """ + outputs_without_aux = {k: v for k, v in outputs.items() if k != "auxiliary_outputs"} + + # Retrieve the matching between the outputs of the last layer and the targets + indices = self.matcher(outputs_without_aux, targets) + + # Compute the average number of target boxes across all nodes, for normalization purposes + num_boxes = sum(len(t["class_labels"]) for t in targets) + num_boxes = torch.as_tensor([num_boxes], dtype=torch.float, device=next(iter(outputs.values())).device) + world_size = 1 + if is_accelerate_available(): + if PartialState._shared_state != {}: + num_boxes = reduce(num_boxes) + world_size = PartialState().num_processes + num_boxes = torch.clamp(num_boxes / world_size, min=1).item() + + # Compute all the requested losses + losses = {} + for loss in self.losses: + losses.update(self.get_loss(loss, outputs, targets, indices, num_boxes)) + + # In case of auxiliary losses, we repeat this process with the output of each intermediate layer. 
+ if "auxiliary_outputs" in outputs: + for i, auxiliary_outputs in enumerate(outputs["auxiliary_outputs"]): + indices = self.matcher(auxiliary_outputs, targets) + for loss in self.losses: + if loss == "masks": + # Intermediate masks losses are too costly to compute, we ignore them. + continue + l_dict = self.get_loss(loss, auxiliary_outputs, targets, indices, num_boxes) + l_dict = {k + f"_{i}": v for k, v in l_dict.items()} + losses.update(l_dict) + + return losses + + +# taken from https://github.com/facebookresearch/detr/blob/master/models/matcher.py +class HungarianMatcher(nn.Module): + """ + This class computes an assignment between the targets and the predictions of the network. + + For efficiency reasons, the targets don't include the no_object. Because of this, in general, there are more + predictions than targets. In this case, we do a 1-to-1 matching of the best predictions, while the others are + un-matched (and thus treated as non-objects). + + Args: + class_cost: + The relative weight of the classification error in the matching cost. + bbox_cost: + The relative weight of the L1 error of the bounding box coordinates in the matching cost. + giou_cost: + The relative weight of the giou loss of the bounding box in the matching cost. + """ + + def __init__(self, class_cost: float = 1, bbox_cost: float = 1, giou_cost: float = 1): + super().__init__() + requires_backends(self, ["scipy"]) + + self.class_cost = class_cost + self.bbox_cost = bbox_cost + self.giou_cost = giou_cost + if class_cost == 0 and bbox_cost == 0 and giou_cost == 0: + raise ValueError("All costs of the Matcher can't be 0") + + @torch.no_grad() + def forward(self, outputs, targets): + """ + Args: + outputs (`dict`): + A dictionary that contains at least these entries: + * "logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits + * "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates. + targets (`List[dict]`): + A list of targets (len(targets) = batch_size), where each target is a dict containing: + * "class_labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of + ground-truth + objects in the target) containing the class labels + * "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates. + + Returns: + `List[Tuple]`: A list of size `batch_size`, containing tuples of (index_i, index_j) where: + - index_i is the indices of the selected predictions (in order) + - index_j is the indices of the corresponding selected targets (in order) + For each batch element, it holds: len(index_i) = len(index_j) = min(num_queries, num_target_boxes) + """ + batch_size, num_queries = outputs["logits"].shape[:2] + + # We flatten to compute the cost matrices in a batch + out_prob = outputs["logits"].flatten(0, 1).softmax(-1) # [batch_size * num_queries, num_classes] + out_bbox = outputs["pred_boxes"].flatten(0, 1) # [batch_size * num_queries, 4] + + # Also concat the target labels and boxes + target_ids = torch.cat([v["class_labels"] for v in targets]) + target_bbox = torch.cat([v["boxes"] for v in targets]) + + # Compute the classification cost. Contrary to the loss, we don't use the NLL, + # but approximate it in 1 - proba[target class]. + # The 1 is a constant that doesn't change the matching, it can be ommitted. 
+ class_cost = -out_prob[:, target_ids] + + # Compute the L1 cost between boxes + bbox_cost = torch.cdist(out_bbox, target_bbox, p=1) + + # Compute the giou cost between boxes + giou_cost = -generalized_box_iou(center_to_corners_format(out_bbox), center_to_corners_format(target_bbox)) + + # Final cost matrix + cost_matrix = self.bbox_cost * bbox_cost + self.class_cost * class_cost + self.giou_cost * giou_cost + cost_matrix = cost_matrix.view(batch_size, num_queries, -1).cpu() + + sizes = [len(v["boxes"]) for v in targets] + indices = [linear_sum_assignment(c[i]) for i, c in enumerate(cost_matrix.split(sizes, -1))] + return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices] + + +# below: bounding box utilities taken from https://github.com/facebookresearch/detr/blob/master/util/box_ops.py + + +def _upcast(t: Tensor) -> Tensor: + # Protects from numerical overflows in multiplications by upcasting to the equivalent higher type + if t.is_floating_point(): + return t if t.dtype in (torch.float32, torch.float64) else t.float() + else: + return t if t.dtype in (torch.int32, torch.int64) else t.int() + + +def box_area(boxes: Tensor) -> Tensor: + """ + Computes the area of a set of bounding boxes, which are specified by its (x1, y1, x2, y2) coordinates. + + Args: + boxes (`torch.FloatTensor` of shape `(number_of_boxes, 4)`): + Boxes for which the area will be computed. They are expected to be in (x1, y1, x2, y2) format with `0 <= x1 + < x2` and `0 <= y1 < y2`. + + Returns: + `torch.FloatTensor`: a tensor containing the area for each box. + """ + boxes = _upcast(boxes) + return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) + + +# modified from torchvision to also return the union +def box_iou(boxes1, boxes2): + area1 = box_area(boxes1) + area2 = box_area(boxes2) + + left_top = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2] + right_bottom = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2] + + width_height = (right_bottom - left_top).clamp(min=0) # [N,M,2] + inter = width_height[:, :, 0] * width_height[:, :, 1] # [N,M] + + union = area1[:, None] + area2 - inter + + iou = inter / union + return iou, union + + +def generalized_box_iou(boxes1, boxes2): + """ + Generalized IoU from https://giou.stanford.edu/. The boxes should be in [x0, y0, x1, y1] (corner) format. 
+ + Returns: + `torch.FloatTensor`: a [N, M] pairwise matrix, where N = len(boxes1) and M = len(boxes2) + """ + # degenerate boxes gives inf / nan results + # so do an early check + if not (boxes1[:, 2:] >= boxes1[:, :2]).all(): + raise ValueError(f"boxes1 must be in [x0, y0, x1, y1] (corner) format, but got {boxes1}") + if not (boxes2[:, 2:] >= boxes2[:, :2]).all(): + raise ValueError(f"boxes2 must be in [x0, y0, x1, y1] (corner) format, but got {boxes2}") + iou, union = box_iou(boxes1, boxes2) + + top_left = torch.min(boxes1[:, None, :2], boxes2[:, :2]) + bottom_right = torch.max(boxes1[:, None, 2:], boxes2[:, 2:]) + + width_height = (bottom_right - top_left).clamp(min=0) # [N,M,2] + area = width_height[:, :, 0] * width_height[:, :, 1] + + return iou - (area - union) / area + + +# below: taken from https://github.com/facebookresearch/detr/blob/master/util/misc.py#L306 +def _max_by_axis(the_list): + # type: (List[List[int]]) -> List[int] + maxes = the_list[0] + for sublist in the_list[1:]: + for index, item in enumerate(sublist): + maxes[index] = max(maxes[index], item) + return maxes + + +class NestedTensor: + def __init__(self, tensors, mask: Optional[Tensor]): + self.tensors = tensors + self.mask = mask + + def to(self, device): + cast_tensor = self.tensors.to(device) + mask = self.mask + if mask is not None: + cast_mask = mask.to(device) + else: + cast_mask = None + return NestedTensor(cast_tensor, cast_mask) + + def decompose(self): + return self.tensors, self.mask + + def __repr__(self): + return str(self.tensors) + + +def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): + if tensor_list[0].ndim == 3: + max_size = _max_by_axis([list(img.shape) for img in tensor_list]) + batch_shape = [len(tensor_list)] + max_size + batch_size, num_channels, height, width = batch_shape + dtype = tensor_list[0].dtype + device = tensor_list[0].device + tensor = torch.zeros(batch_shape, dtype=dtype, device=device) + mask = torch.ones((batch_size, height, width), dtype=torch.bool, device=device) + for img, pad_img, m in zip(tensor_list, tensor, mask): + pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) + m[: img.shape[1], : img.shape[2]] = False + else: + raise ValueError("Only 3-dimensional tensors are supported") + return NestedTensor(tensor, mask) + + +# taken from https://github.com/facebookresearch/detr/blob/master/models/detr.py +@torch.jit.unused +def _set_aux_loss(outputs_class, outputs_coord): + # this is a workaround to make torchscript happy, as torchscript + # doesn't support dictionary with non-homogeneous values, such + # as a dict having both a Tensor and a list. 
+ return [{"logits": a, "pred_boxes": b} for a, b in zip(outputs_class[:-1], outputs_coord[:-1])] + + +def ForSegmentationLoss( + logits, labels, device, pred_boxes, pred_masks, config, outputs_class=None, outputs_coord=None, **kwargs +): + # First: create the matcher + matcher = HungarianMatcher(class_cost=config.class_cost, bbox_cost=config.bbox_cost, giou_cost=config.giou_cost) + # Second: create the criterion + losses = ["labels", "boxes", "cardinality", "masks"] + criterion = ImageLoss( + matcher=matcher, + num_classes=config.num_labels, + eos_coef=config.eos_coefficient, + losses=losses, + ) + criterion.to(device) + # Third: compute the losses, based on outputs and labels + outputs_loss = {} + outputs_loss["logits"] = logits + outputs_loss["pred_boxes"] = pred_boxes + outputs_loss["pred_masks"] = pred_masks + + auxiliary_outputs = None + if config.auxiliary_loss: + auxiliary_outputs = _set_aux_loss(outputs_class, outputs_coord) + outputs_loss["auxiliary_outputs"] = auxiliary_outputs + + loss_dict = criterion(outputs_loss, labels) + # Fourth: compute total loss, as a weighted sum of the various losses + weight_dict = {"loss_ce": 1, "loss_bbox": config.bbox_loss_coefficient} + weight_dict["loss_giou"] = config.giou_loss_coefficient + weight_dict["loss_mask"] = config.mask_loss_coefficient + weight_dict["loss_dice"] = config.dice_loss_coefficient + if config.auxiliary_loss: + aux_weight_dict = {} + for i in range(config.decoder_layers - 1): + aux_weight_dict.update({k + f"_{i}": v for k, v in weight_dict.items()}) + weight_dict.update(aux_weight_dict) + loss = sum(loss_dict[k] * weight_dict[k] for k in loss_dict.keys() if k in weight_dict) + return loss, loss_dict, auxiliary_outputs + + +def ForObjectDetectionLoss( + logits, labels, device, pred_boxes, config, outputs_class=None, outputs_coord=None, **kwargs +): + # First: create the matcher + matcher = HungarianMatcher(class_cost=config.class_cost, bbox_cost=config.bbox_cost, giou_cost=config.giou_cost) + # Second: create the criterion + losses = ["labels", "boxes", "cardinality"] + criterion = ImageLoss( + matcher=matcher, + num_classes=config.num_labels, + eos_coef=config.eos_coefficient, + losses=losses, + ) + criterion.to(device) + # Third: compute the losses, based on outputs and labels + outputs_loss = {} + auxiliary_outputs = None + outputs_loss["logits"] = logits + outputs_loss["pred_boxes"] = pred_boxes + if config.auxiliary_loss: + auxiliary_outputs = _set_aux_loss(outputs_class, outputs_coord) + outputs_loss["auxiliary_outputs"] = auxiliary_outputs + + loss_dict = criterion(outputs_loss, labels) + # Fourth: compute total loss, as a weighted sum of the various losses + weight_dict = {"loss_ce": 1, "loss_bbox": config.bbox_loss_coefficient} + weight_dict["loss_giou"] = config.giou_loss_coefficient + if config.auxiliary_loss: + aux_weight_dict = {} + for i in range(config.decoder_layers - 1): + aux_weight_dict.update({k + f"_{i}": v for k, v in weight_dict.items()}) + weight_dict.update(aux_weight_dict) + loss = sum(loss_dict[k] * weight_dict[k] for k in loss_dict.keys() if k in weight_dict) + return loss, loss_dict, auxiliary_outputs diff --git a/src/transformers/loss/loss_rt_detr.py b/src/transformers/loss/loss_rt_detr.py new file mode 100644 index 00000000000000..3aea87c5f5a75a --- /dev/null +++ b/src/transformers/loss/loss_rt_detr.py @@ -0,0 +1,463 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from ..utils import is_scipy_available, is_vision_available, requires_backends +from .loss_for_object_detection import ( + _set_aux_loss, + box_iou, + dice_loss, + generalized_box_iou, + nested_tensor_from_tensor_list, + sigmoid_focal_loss, +) + + +if is_scipy_available(): + from scipy.optimize import linear_sum_assignment + + +if is_vision_available(): + from transformers.image_transforms import center_to_corners_format + + +class RTDetrHungarianMatcher(nn.Module): + """This class computes an assignment between the targets and the predictions of the network + + For efficiency reasons, the targets don't include the no_object. Because of this, in general, there are more + predictions than targets. In this case, we do a 1-to-1 matching of the best predictions, while the others are + un-matched (and thus treated as non-objects). + + Args: + config: RTDetrConfig + """ + + def __init__(self, config): + super().__init__() + requires_backends(self, ["scipy"]) + + self.class_cost = config.matcher_class_cost + self.bbox_cost = config.matcher_bbox_cost + self.giou_cost = config.matcher_giou_cost + + self.use_focal_loss = config.use_focal_loss + self.alpha = config.matcher_alpha + self.gamma = config.matcher_gamma + + if self.class_cost == self.bbox_cost == self.giou_cost == 0: + raise ValueError("All costs of the Matcher can't be 0") + + @torch.no_grad() + def forward(self, outputs, targets): + """Performs the matching + + Params: + outputs: This is a dict that contains at least these entries: + "logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits + "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates + + targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing: + "class_labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth + objects in the target) containing the class labels + "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates + + Returns: + A list of size batch_size, containing tuples of (index_i, index_j) where: + - index_i is the indices of the selected predictions (in order) + - index_j is the indices of the corresponding selected targets (in order) + For each batch element, it holds: + len(index_i) = len(index_j) = min(num_queries, num_target_boxes) + """ + batch_size, num_queries = outputs["logits"].shape[:2] + + # We flatten to compute the cost matrices in a batch + out_bbox = outputs["pred_boxes"].flatten(0, 1) # [batch_size * num_queries, 4] + # Also concat the target labels and boxes + target_ids = torch.cat([v["class_labels"] for v in targets]) + target_bbox = torch.cat([v["boxes"] for v in targets]) + # Compute the classification cost. Contrary to the loss, we don't use the NLL, + # but approximate it in 1 - proba[target class]. 
+        # The 1 is a constant that doesn't change the matching, it can be omitted.
+        if self.use_focal_loss:
+            out_prob = F.sigmoid(outputs["logits"].flatten(0, 1))
+            out_prob = out_prob[:, target_ids]
+            neg_cost_class = (1 - self.alpha) * (out_prob**self.gamma) * (-(1 - out_prob + 1e-8).log())
+            pos_cost_class = self.alpha * ((1 - out_prob) ** self.gamma) * (-(out_prob + 1e-8).log())
+            class_cost = pos_cost_class - neg_cost_class
+        else:
+            out_prob = outputs["logits"].flatten(0, 1).softmax(-1)  # [batch_size * num_queries, num_classes]
+            class_cost = -out_prob[:, target_ids]
+
+        # Compute the L1 cost between boxes
+        bbox_cost = torch.cdist(out_bbox, target_bbox, p=1)
+        # Compute the giou cost between boxes
+        giou_cost = -generalized_box_iou(center_to_corners_format(out_bbox), center_to_corners_format(target_bbox))
+        # Compute the final cost matrix
+        cost_matrix = self.bbox_cost * bbox_cost + self.class_cost * class_cost + self.giou_cost * giou_cost
+        cost_matrix = cost_matrix.view(batch_size, num_queries, -1).cpu()
+
+        sizes = [len(v["boxes"]) for v in targets]
+        indices = [linear_sum_assignment(c[i]) for i, c in enumerate(cost_matrix.split(sizes, -1))]
+
+        return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices]
+
+
+class RTDetrLoss(nn.Module):
+    """
+    This class computes the losses for RTDetr. The process happens in two steps: 1) we compute hungarian assignment
+    between ground truth boxes and the outputs of the model 2) we supervise each pair of matched ground-truth /
+    prediction (supervise class and box).
+
+    Args:
+        matcher (`RTDetrHungarianMatcher`):
+            Module able to compute a matching between targets and proposals.
+        weight_dict (`Dict`):
+            Dictionary relating each loss with its weights. These losses are configured in RTDetrConfig as
+            `weight_loss_vfl`, `weight_loss_bbox`, `weight_loss_giou`
+        losses (`List[str]`):
+            List of all the losses to be applied. See `get_loss` for a list of all available losses.
+        alpha (`float`):
+            Parameter alpha used to compute the focal loss.
+        gamma (`float`):
+            Parameter gamma used to compute the focal loss.
+        eos_coef (`float`):
+            Relative classification weight applied to the no-object category.
+        num_classes (`int`):
+            Number of object categories, omitting the special no-object category.
+ """ + + def __init__(self, config): + super().__init__() + + self.matcher = RTDetrHungarianMatcher(config) + self.num_classes = config.num_labels + self.weight_dict = { + "loss_vfl": config.weight_loss_vfl, + "loss_bbox": config.weight_loss_bbox, + "loss_giou": config.weight_loss_giou, + } + self.losses = ["vfl", "boxes"] + self.eos_coef = config.eos_coefficient + empty_weight = torch.ones(config.num_labels + 1) + empty_weight[-1] = self.eos_coef + self.register_buffer("empty_weight", empty_weight) + self.alpha = config.focal_loss_alpha + self.gamma = config.focal_loss_gamma + + def loss_labels_vfl(self, outputs, targets, indices, num_boxes, log=True): + if "pred_boxes" not in outputs: + raise KeyError("No predicted boxes found in outputs") + if "logits" not in outputs: + raise KeyError("No predicted logits found in outputs") + idx = self._get_source_permutation_idx(indices) + + src_boxes = outputs["pred_boxes"][idx] + target_boxes = torch.cat([_target["boxes"][i] for _target, (_, i) in zip(targets, indices)], dim=0) + ious, _ = box_iou(center_to_corners_format(src_boxes), center_to_corners_format(target_boxes)) + ious = torch.diag(ious).detach() + + src_logits = outputs["logits"] + target_classes_original = torch.cat([_target["class_labels"][i] for _target, (_, i) in zip(targets, indices)]) + target_classes = torch.full( + src_logits.shape[:2], self.num_classes, dtype=torch.int64, device=src_logits.device + ) + target_classes[idx] = target_classes_original + target = F.one_hot(target_classes, num_classes=self.num_classes + 1)[..., :-1] + + target_score_original = torch.zeros_like(target_classes, dtype=src_logits.dtype) + target_score_original[idx] = ious.to(target_score_original.dtype) + target_score = target_score_original.unsqueeze(-1) * target + + pred_score = F.sigmoid(src_logits).detach() + weight = self.alpha * pred_score.pow(self.gamma) * (1 - target) + target_score + + loss = F.binary_cross_entropy_with_logits(src_logits, target_score, weight=weight, reduction="none") + loss = loss.mean(1).sum() * src_logits.shape[1] / num_boxes + return {"loss_vfl": loss} + + def loss_labels(self, outputs, targets, indices, num_boxes, log=True): + """Classification loss (NLL) + targets dicts must contain the key "class_labels" containing a tensor of dim [nb_target_boxes] + """ + if "logits" not in outputs: + raise KeyError("No logits were found in the outputs") + + src_logits = outputs["logits"] + + idx = self._get_source_permutation_idx(indices) + target_classes_original = torch.cat([_target["class_labels"][i] for _target, (_, i) in zip(targets, indices)]) + target_classes = torch.full( + src_logits.shape[:2], self.num_classes, dtype=torch.int64, device=src_logits.device + ) + target_classes[idx] = target_classes_original + + loss_ce = F.cross_entropy(src_logits.transpose(1, 2), target_classes, self.class_weight) + losses = {"loss_ce": loss_ce} + return losses + + @torch.no_grad() + def loss_cardinality(self, outputs, targets, indices, num_boxes): + """ + Compute the cardinality error, i.e. the absolute error in the number of predicted non-empty boxes. This is not + really a loss, it is intended for logging purposes only. It doesn't propagate gradients. 
+ """ + logits = outputs["logits"] + device = logits.device + target_lengths = torch.as_tensor([len(v["class_labels"]) for v in targets], device=device) + # Count the number of predictions that are NOT "no-object" (which is the last class) + card_pred = (logits.argmax(-1) != logits.shape[-1] - 1).sum(1) + card_err = nn.functional.l1_loss(card_pred.float(), target_lengths.float()) + losses = {"cardinality_error": card_err} + return losses + + def loss_boxes(self, outputs, targets, indices, num_boxes): + """ + Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss. Targets dicts must + contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4]. The target boxes are expected in + format (center_x, center_y, w, h), normalized by the image size. + """ + if "pred_boxes" not in outputs: + raise KeyError("No predicted boxes found in outputs") + idx = self._get_source_permutation_idx(indices) + src_boxes = outputs["pred_boxes"][idx] + target_boxes = torch.cat([t["boxes"][i] for t, (_, i) in zip(targets, indices)], dim=0) + + losses = {} + + loss_bbox = F.l1_loss(src_boxes, target_boxes, reduction="none") + losses["loss_bbox"] = loss_bbox.sum() / num_boxes + + loss_giou = 1 - torch.diag( + generalized_box_iou(center_to_corners_format(src_boxes), center_to_corners_format(target_boxes)) + ) + losses["loss_giou"] = loss_giou.sum() / num_boxes + return losses + + def loss_masks(self, outputs, targets, indices, num_boxes): + """ + Compute the losses related to the masks: the focal loss and the dice loss. Targets dicts must contain the key + "masks" containing a tensor of dim [nb_target_boxes, h, w]. + """ + if "pred_masks" not in outputs: + raise KeyError("No predicted masks found in outputs") + + source_idx = self._get_source_permutation_idx(indices) + target_idx = self._get_target_permutation_idx(indices) + source_masks = outputs["pred_masks"] + source_masks = source_masks[source_idx] + masks = [t["masks"] for t in targets] + target_masks, valid = nested_tensor_from_tensor_list(masks).decompose() + target_masks = target_masks.to(source_masks) + target_masks = target_masks[target_idx] + + # upsample predictions to the target size + source_masks = nn.functional.interpolate( + source_masks[:, None], size=target_masks.shape[-2:], mode="bilinear", align_corners=False + ) + source_masks = source_masks[:, 0].flatten(1) + + target_masks = target_masks.flatten(1) + target_masks = target_masks.view(source_masks.shape) + losses = { + "loss_mask": sigmoid_focal_loss(source_masks, target_masks, num_boxes), + "loss_dice": dice_loss(source_masks, target_masks, num_boxes), + } + return losses + + def loss_labels_bce(self, outputs, targets, indices, num_boxes, log=True): + src_logits = outputs["logits"] + idx = self._get_source_permutation_idx(indices) + target_classes_original = torch.cat([_target["class_labels"][i] for _target, (_, i) in zip(targets, indices)]) + target_classes = torch.full( + src_logits.shape[:2], self.num_classes, dtype=torch.int64, device=src_logits.device + ) + target_classes[idx] = target_classes_original + + target = F.one_hot(target_classes, num_classes=self.num_classes + 1)[..., :-1] + loss = F.binary_cross_entropy_with_logits(src_logits, target * 1.0, reduction="none") + loss = loss.mean(1).sum() * src_logits.shape[1] / num_boxes + return {"loss_bce": loss} + + def _get_source_permutation_idx(self, indices): + # permute predictions following indices + batch_idx = torch.cat([torch.full_like(source, i) for i, (source, _) in enumerate(indices)]) 
+ source_idx = torch.cat([source for (source, _) in indices]) + return batch_idx, source_idx + + def _get_target_permutation_idx(self, indices): + # permute targets following indices + batch_idx = torch.cat([torch.full_like(target, i) for i, (_, target) in enumerate(indices)]) + target_idx = torch.cat([target for (_, target) in indices]) + return batch_idx, target_idx + + def loss_labels_focal(self, outputs, targets, indices, num_boxes, log=True): + if "logits" not in outputs: + raise KeyError("No logits found in outputs") + + src_logits = outputs["logits"] + + idx = self._get_source_permutation_idx(indices) + target_classes_original = torch.cat([_target["class_labels"][i] for _target, (_, i) in zip(targets, indices)]) + target_classes = torch.full( + src_logits.shape[:2], self.num_classes, dtype=torch.int64, device=src_logits.device + ) + target_classes[idx] = target_classes_original + + target = F.one_hot(target_classes, num_classes=self.num_classes + 1)[..., :-1] + loss = sigmoid_focal_loss(src_logits, target, self.alpha, self.gamma) + loss = loss.mean(1).sum() * src_logits.shape[1] / num_boxes + return {"loss_focal": loss} + + def get_loss(self, loss, outputs, targets, indices, num_boxes): + loss_map = { + "labels": self.loss_labels, + "cardinality": self.loss_cardinality, + "boxes": self.loss_boxes, + "masks": self.loss_masks, + "bce": self.loss_labels_bce, + "focal": self.loss_labels_focal, + "vfl": self.loss_labels_vfl, + } + if loss not in loss_map: + raise ValueError(f"Loss {loss} not supported") + return loss_map[loss](outputs, targets, indices, num_boxes) + + @staticmethod + def get_cdn_matched_indices(dn_meta, targets): + dn_positive_idx, dn_num_group = dn_meta["dn_positive_idx"], dn_meta["dn_num_group"] + num_gts = [len(t["class_labels"]) for t in targets] + device = targets[0]["class_labels"].device + + dn_match_indices = [] + for i, num_gt in enumerate(num_gts): + if num_gt > 0: + gt_idx = torch.arange(num_gt, dtype=torch.int64, device=device) + gt_idx = gt_idx.tile(dn_num_group) + assert len(dn_positive_idx[i]) == len(gt_idx) + dn_match_indices.append((dn_positive_idx[i], gt_idx)) + else: + dn_match_indices.append( + ( + torch.zeros(0, dtype=torch.int64, device=device), + torch.zeros(0, dtype=torch.int64, device=device), + ) + ) + + return dn_match_indices + + def forward(self, outputs, targets): + """ + This performs the loss computation. + + Args: + outputs (`dict`, *optional*): + Dictionary of tensors, see the output specification of the model for the format. + targets (`List[dict]`, *optional*): + List of dicts, such that `len(targets) == batch_size`. The expected keys in each dict depends on the + losses applied, see each loss' doc. 
+ """ + outputs_without_aux = {k: v for k, v in outputs.items() if "auxiliary_outputs" not in k} + + # Retrieve the matching between the outputs of the last layer and the targets + indices = self.matcher(outputs_without_aux, targets) + + # Compute the average number of target boxes across all nodes, for normalization purposes + num_boxes = sum(len(t["class_labels"]) for t in targets) + num_boxes = torch.as_tensor([num_boxes], dtype=torch.float, device=next(iter(outputs.values())).device) + num_boxes = torch.clamp(num_boxes, min=1).item() + + # Compute all the requested losses + losses = {} + for loss in self.losses: + l_dict = self.get_loss(loss, outputs, targets, indices, num_boxes) + l_dict = {k: l_dict[k] * self.weight_dict[k] for k in l_dict if k in self.weight_dict} + losses.update(l_dict) + + # In case of auxiliary losses, we repeat this process with the output of each intermediate layer. + if "auxiliary_outputs" in outputs: + for i, auxiliary_outputs in enumerate(outputs["auxiliary_outputs"]): + indices = self.matcher(auxiliary_outputs, targets) + for loss in self.losses: + if loss == "masks": + # Intermediate masks losses are too costly to compute, we ignore them. + continue + l_dict = self.get_loss(loss, auxiliary_outputs, targets, indices, num_boxes) + l_dict = {k: l_dict[k] * self.weight_dict[k] for k in l_dict if k in self.weight_dict} + l_dict = {k + f"_aux_{i}": v for k, v in l_dict.items()} + losses.update(l_dict) + + # In case of cdn auxiliary losses. For rtdetr + if "dn_auxiliary_outputs" in outputs: + if "denoising_meta_values" not in outputs: + raise ValueError( + "The output must have the 'denoising_meta_values` key. Please, ensure that 'outputs' includes a 'denoising_meta_values' entry." + ) + indices = self.get_cdn_matched_indices(outputs["denoising_meta_values"], targets) + num_boxes = num_boxes * outputs["denoising_meta_values"]["dn_num_group"] + + for i, auxiliary_outputs in enumerate(outputs["dn_auxiliary_outputs"]): + # indices = self.matcher(auxiliary_outputs, targets) + for loss in self.losses: + if loss == "masks": + # Intermediate masks losses are too costly to compute, we ignore them. 
+ continue + kwargs = {} + l_dict = self.get_loss(loss, auxiliary_outputs, targets, indices, num_boxes, **kwargs) + l_dict = {k: l_dict[k] * self.weight_dict[k] for k in l_dict if k in self.weight_dict} + l_dict = {k + f"_dn_{i}": v for k, v in l_dict.items()} + losses.update(l_dict) + + return losses + + +def RTDetrForObjectDetectionLoss( + logits, + labels, + device, + pred_boxes, + config, + outputs_class=None, + outputs_coord=None, + enc_topk_logits=None, + enc_topk_bboxes=None, + denoising_meta_values=None, + **kwargs, +): + criterion = RTDetrLoss(config) + criterion.to(device) + # Second: compute the losses, based on outputs and labels + outputs_loss = {} + outputs_loss["logits"] = logits + outputs_loss["pred_boxes"] = pred_boxes + if config.auxiliary_loss: + if denoising_meta_values is not None: + dn_out_coord, outputs_coord = torch.split(outputs_coord, denoising_meta_values["dn_num_split"], dim=2) + dn_out_class, outputs_class = torch.split(outputs_class, denoising_meta_values["dn_num_split"], dim=2) + + auxiliary_outputs = _set_aux_loss(outputs_class[:, :-1].transpose(0, 1), outputs_coord[:, :-1].transpose(0, 1)) + outputs_loss["auxiliary_outputs"] = auxiliary_outputs + outputs_loss["auxiliary_outputs"].extend(_set_aux_loss([enc_topk_logits], [enc_topk_bboxes])) + if denoising_meta_values is not None: + outputs_loss["dn_auxiliary_outputs"] = _set_aux_loss( + dn_out_class.transpose(0, 1), dn_out_coord.transpose(0, 1) + ) + outputs_loss["denoising_meta_values"] = denoising_meta_values + + loss_dict = criterion(outputs_loss, labels) + + loss = sum(loss_dict.values()) + return loss, loss_dict, auxiliary_outputs diff --git a/src/transformers/loss/loss_utils.py b/src/transformers/loss/loss_utils.py new file mode 100644 index 00000000000000..efa23d24e360b4 --- /dev/null +++ b/src/transformers/loss/loss_utils.py @@ -0,0 +1,114 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
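The loss_utils.py file that begins here gathers the per-task loss functions and exposes them through LOSS_MAPPING, which the new `PreTrainedModel.loss_function` property (see the modeling_utils.py hunk further down) resolves from `config.loss_type` or from the model class name. As a rough standalone sketch of the causal-LM path, assuming this patch is applied so that `transformers.loss.loss_utils` is importable, with toy shapes only:

    import torch
    from transformers.loss.loss_utils import ForCausalLMLoss  # module added by this patch

    batch, seq_len, vocab_size = 2, 5, 11
    logits = torch.randn(batch, seq_len, vocab_size)
    labels = torch.randint(0, vocab_size, (batch, seq_len))
    labels[0, -1] = -100  # positions labeled -100 are ignored (default ignore_index)

    loss = ForCausalLMLoss(logits, labels, vocab_size=vocab_size)
    print(loss)           # scalar mean cross-entropy over the shifted next-token targets

Model heads then delegate to this machinery instead of building a loss inline; for example, the cohere hunk below replaces its hand-rolled `CrossEntropyLoss` with `loss = self.loss_function(logits, labels, self.vocab_size)`.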
+ +import torch +import torch.nn as nn +from torch.nn import BCEWithLogitsLoss, MSELoss + +from .loss_deformable_detr import DeformableDetrForObjectDetectionLoss, DeformableDetrForSegmentationLoss +from .loss_for_object_detection import ForObjectDetectionLoss, ForSegmentationLoss +from .loss_rt_detr import RTDetrForObjectDetectionLoss + + +def fixed_cross_entropy(source, target, num_items_in_batch: int = None, ignore_index: int = -100, **kwargs): + reduction = "sum" if num_items_in_batch is not None else "mean" + loss = nn.functional.cross_entropy(source, target, ignore_index=ignore_index, reduction=reduction) + if reduction == "sum": + loss = loss / num_items_in_batch + return loss + + +def ForCausalLMLoss( + logits, labels, vocab_size: int, num_items_in_batch: int = None, ignore_index: int = -100, **kwargs +): + # Upcast to float if we need to compute the loss to avoid potential precision issues + logits = logits.float() + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + + # Flatten the tokens + shift_logits = shift_logits.view(-1, vocab_size) + shift_labels = shift_labels.view(-1) + # Enable model parallelism + shift_labels = shift_labels.to(shift_logits.device) + loss = fixed_cross_entropy(shift_logits, shift_labels, num_items_in_batch, ignore_index, **kwargs) + return loss + + +def ForSequenceClassificationLoss(labels, pooled_logits, config, **kwargs): + num_labels = config.num_labels + if config.problem_type is None: + if num_labels == 1: + config.problem_type = "regression" + elif num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + config.problem_type = "single_label_classification" + else: + config.problem_type = "multi_label_classification" + + if config.problem_type == "regression": + loss_fct = MSELoss() + if num_labels == 1: + loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(pooled_logits, labels) + elif config.problem_type == "single_label_classification": + loss = fixed_cross_entropy(pooled_logits.view(-1, num_labels), labels.view(-1), **kwargs) + elif config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(pooled_logits, labels) + return loss + + +def ForQuestionAnsweringLoss(start_logits, end_logits, start_positions, end_positions, **kwargs): + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1).to(start_logits.device) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1).to(end_logits.device) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions = start_positions.clamp(0, ignored_index) + end_positions = end_positions.clamp(0, ignored_index) + + start_loss = fixed_cross_entropy(start_logits, start_positions, ignore_index=ignored_index, **kwargs) + end_loss = fixed_cross_entropy(end_logits, end_positions, ignore_index=ignored_index, **kwargs) + total_loss = (start_loss + end_loss) / 2 + return total_loss + + +def ForTokenClassification(logits, labels, config, **kwargs): + # Upcast to float if we need to compute the loss to avoid potential precision issues + logits = logits.view(-1, config.num_labels) + labels = labels.view(-1) + logits = logits.float() + # Flatten the tokens + return 
fixed_cross_entropy(logits, labels, **kwargs) + + +LOSS_MAPPING = { + "ForCausalLM": ForCausalLMLoss, + "ForQuestionAnswering": ForQuestionAnsweringLoss, + "ForSequenceClassification": ForSequenceClassificationLoss, + "ForTokenClassification": ForTokenClassification, + "ForSegmentation": ForSegmentationLoss, + "ForObjectDetection": ForObjectDetectionLoss, + "DeformableDetrForObjectDetection": DeformableDetrForObjectDetectionLoss, + "ConditionalDetrForObjectDetection": DeformableDetrForObjectDetectionLoss, + "GroundingDinoForObjectDetection": DeformableDetrForObjectDetectionLoss, + "ConditionalDetrForSegmentation": DeformableDetrForSegmentationLoss, + "RTDetrForObjectDetection": RTDetrForObjectDetectionLoss, +} diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index cb0d743b0a90ae..c84aec21a32663 100755 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -28,7 +28,7 @@ import warnings from contextlib import contextmanager from dataclasses import dataclass -from functools import partial, wraps +from functools import lru_cache, partial, wraps from threading import Thread from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union from zipfile import is_zipfile @@ -45,6 +45,7 @@ from .dynamic_module_utils import custom_object_save from .generation import GenerationConfig, GenerationMixin from .integrations import PeftAdapterMixin, deepspeed_config, is_deepspeed_zero3_enabled +from .loss.loss_utils import LOSS_MAPPING from .pytorch_utils import ( # noqa: F401 Conv1D, apply_chunking_to_forward, @@ -4979,6 +4980,28 @@ def _is_quantized_training_enabled(self): return self.hf_quantizer.is_trainable + @property + @lru_cache + def loss_function(self): + if getattr(self.config, "loss_type", None) is not None: + loss_type = self.config.loss_type + else: + loss_type = self.__class__.__name__ + if loss_type not in LOSS_MAPPING: + loss_groups = f"({'|'.join(LOSS_MAPPING)})" + loss_type = re.findall(loss_groups, self.__class__.__name__) + if len(loss_type) > 0: + loss_type = loss_type[0] + else: + loss_type = None + if loss_type is None or loss_type not in LOSS_MAPPING and getattr(self.config, "loss_type", None) is not None: + logger.warning_once( + f"`loss_type={loss_type}` was set in the config but it is unrecognised." + f"Using the default loss: `ForCausalLMLoss`." 
+ ) + loss_type = "ForCausalLM" + return LOSS_MAPPING[loss_type] + PreTrainedModel.push_to_hub = copy_func(PreTrainedModel.push_to_hub) if PreTrainedModel.push_to_hub.__doc__ is not None: diff --git a/src/transformers/models/cohere/modeling_cohere.py b/src/transformers/models/cohere/modeling_cohere.py index 3c14a6d28dee54..3abe6ef8644500 100644 --- a/src/transformers/models/cohere/modeling_cohere.py +++ b/src/transformers/models/cohere/modeling_cohere.py @@ -28,7 +28,6 @@ import torch import torch.utils.checkpoint from torch import nn -from torch.nn import CrossEntropyLoss from ...activations import ACT2FN from ...cache_utils import Cache, DynamicCache, StaticCache @@ -1173,18 +1172,7 @@ def forward( loss = None if labels is not None: - # Upcast to float if we need to compute the loss to avoid potential precision issues - logits = logits.float() - # Shift so that tokens < n predict n - shift_logits = logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - loss_fct = CrossEntropyLoss() - shift_logits = shift_logits.view(-1, self.config.vocab_size) - shift_labels = shift_labels.view(-1) - # Enable model parallelism - shift_labels = shift_labels.to(shift_logits.device) - loss = loss_fct(shift_logits, shift_labels) + loss = self.loss_function(logits, labels, self.vocab_size) if not return_dict: output = (logits,) + outputs[1:] diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index e0dcca67aefb5a..d633b92547d7db 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -29,10 +29,7 @@ ModelOutput, add_start_docstrings, add_start_docstrings_to_model_forward, - is_accelerate_available, - is_scipy_available, is_timm_available, - is_vision_available, logging, replace_return_docstrings, requires_backends, @@ -41,18 +38,9 @@ from .configuration_conditional_detr import ConditionalDetrConfig -if is_accelerate_available(): - from accelerate import PartialState - from accelerate.utils import reduce - -if is_scipy_available(): - from scipy.optimize import linear_sum_assignment - if is_timm_available(): from timm import create_model -if is_vision_available(): - from ...image_transforms import center_to_corners_format logger = logging.get_logger(__name__) @@ -1610,6 +1598,28 @@ def forward( ) +# Copied from transformers.models.detr.modeling_detr.DetrMLPPredictionHead with Detr->ConditionalDetr +class ConditionalDetrMLPPredictionHead(nn.Module): + """ + Very simple multi-layer perceptron (MLP, also called FFN), used to predict the normalized center coordinates, + height and width of a bounding box w.r.t. an image. 
+ + Copied from https://github.com/facebookresearch/detr/blob/master/models/detr.py + + """ + + def __init__(self, input_dim, hidden_dim, output_dim, num_layers): + super().__init__() + self.num_layers = num_layers + h = [hidden_dim] * (num_layers - 1) + self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) + + def forward(self, x): + for i, layer in enumerate(self.layers): + x = nn.functional.relu(layer(x)) if i < self.num_layers - 1 else layer(x) + return x + + @add_start_docstrings( """ CONDITIONAL_DETR Model (consisting of a backbone and encoder-decoder Transformer) with object detection heads on @@ -1723,7 +1733,7 @@ def forward( reference = outputs.reference_points if return_dict else outputs[-1] reference_before_sigmoid = inverse_sigmoid(reference).transpose(0, 1) - outputs_coords = [] + hs = sequence_output tmp = self.bbox_predictor(hs) tmp[..., :2] += reference_before_sigmoid @@ -1732,47 +1742,20 @@ def forward( loss, loss_dict, auxiliary_outputs = None, None, None if labels is not None: - # First: create the matcher - matcher = ConditionalDetrHungarianMatcher( - class_cost=self.config.class_cost, bbox_cost=self.config.bbox_cost, giou_cost=self.config.giou_cost - ) - # Second: create the criterion - losses = ["labels", "boxes", "cardinality"] - criterion = ConditionalDetrLoss( - matcher=matcher, - num_classes=self.config.num_labels, - focal_alpha=self.config.focal_alpha, - losses=losses, - ) - criterion.to(self.device) - # Third: compute the losses, based on outputs and labels - outputs_loss = {} - outputs_loss["logits"] = logits - outputs_loss["pred_boxes"] = pred_boxes + outputs_class, outputs_coord = None, None if self.config.auxiliary_loss: + outputs_coords = [] intermediate = outputs.intermediate_hidden_states if return_dict else outputs[4] outputs_class = self.class_labels_classifier(intermediate) - for lvl in range(intermediate.shape[0]): tmp = self.bbox_predictor(intermediate[lvl]) tmp[..., :2] += reference_before_sigmoid outputs_coord = tmp.sigmoid() outputs_coords.append(outputs_coord) outputs_coord = torch.stack(outputs_coords) - - auxiliary_outputs = self._set_aux_loss(outputs_class, outputs_coord) - outputs_loss["auxiliary_outputs"] = auxiliary_outputs - - loss_dict = criterion(outputs_loss, labels) - # Fourth: compute total loss, as a weighted sum of the various losses - weight_dict = {"loss_ce": self.config.cls_loss_coefficient, "loss_bbox": self.config.bbox_loss_coefficient} - weight_dict["loss_giou"] = self.config.giou_loss_coefficient - if self.config.auxiliary_loss: - aux_weight_dict = {} - for i in range(self.config.decoder_layers - 1): - aux_weight_dict.update({k + f"_{i}": v for k, v in weight_dict.items()}) - weight_dict.update(aux_weight_dict) - loss = sum(loss_dict[k] * weight_dict[k] for k in loss_dict.keys() if k in weight_dict) + loss, loss_dict, auxiliary_outputs = self.loss_function( + logits, labels, self.device, pred_boxes, self.config, outputs_class, outputs_coord + ) if not return_dict: if auxiliary_outputs is not None: @@ -1977,43 +1960,14 @@ def forward( loss, loss_dict, auxiliary_outputs = None, None, None if labels is not None: - # First: create the matcher - matcher = ConditionalDetrHungarianMatcher( - class_cost=self.config.class_cost, bbox_cost=self.config.bbox_cost, giou_cost=self.config.giou_cost - ) - # Second: create the criterion - losses = ["labels", "boxes", "cardinality", "masks"] - criterion = ConditionalDetrLoss( - matcher=matcher, - num_classes=self.config.num_labels, - 
focal_alpha=self.config.focal_alpha, - losses=losses, - ) - criterion.to(self.device) - # Third: compute the losses, based on outputs and labels - outputs_loss = {} - outputs_loss["logits"] = logits - outputs_loss["pred_boxes"] = pred_boxes - outputs_loss["pred_masks"] = pred_masks + outputs_class, outputs_coord = None, None if self.config.auxiliary_loss: intermediate = decoder_outputs.intermediate_hidden_states if return_dict else decoder_outputs[-1] outputs_class = self.conditional_detr.class_labels_classifier(intermediate) outputs_coord = self.conditional_detr.bbox_predictor(intermediate).sigmoid() - auxiliary_outputs = self.conditional_detr._set_aux_loss(outputs_class, outputs_coord) - outputs_loss["auxiliary_outputs"] = auxiliary_outputs - - loss_dict = criterion(outputs_loss, labels) - # Fourth: compute total loss, as a weighted sum of the various losses - weight_dict = {"loss_ce": 1, "loss_bbox": self.config.bbox_loss_coefficient} - weight_dict["loss_giou"] = self.config.giou_loss_coefficient - weight_dict["loss_mask"] = self.config.mask_loss_coefficient - weight_dict["loss_dice"] = self.config.dice_loss_coefficient - if self.config.auxiliary_loss: - aux_weight_dict = {} - for i in range(self.config.decoder_layers - 1): - aux_weight_dict.update({k + f"_{i}": v for k, v in weight_dict.items()}) - weight_dict.update(aux_weight_dict) - loss = sum(loss_dict[k] * weight_dict[k] for k in loss_dict.keys() if k in weight_dict) + loss, loss_dict, auxiliary_outputs = self.loss_function( + logits, labels, self.device, pred_boxes, pred_masks, self.config, outputs_class, outputs_coord + ) if not return_dict: if auxiliary_outputs is not None: @@ -2151,485 +2105,3 @@ def forward(self, q, k, mask: Optional[Tensor] = None): weights = nn.functional.softmax(weights.flatten(2), dim=-1).view(weights.size()) weights = self.dropout(weights) return weights - - -# Copied from transformers.models.detr.modeling_detr.dice_loss -def dice_loss(inputs, targets, num_boxes): - """ - Compute the DICE loss, similar to generalized IOU for masks - - Args: - inputs: A float tensor of arbitrary shape. - The predictions for each example. - targets: A float tensor with the same shape as inputs. Stores the binary - classification label for each element in inputs (0 for the negative class and 1 for the positive - class). - """ - inputs = inputs.sigmoid() - inputs = inputs.flatten(1) - numerator = 2 * (inputs * targets).sum(1) - denominator = inputs.sum(-1) + targets.sum(-1) - loss = 1 - (numerator + 1) / (denominator + 1) - return loss.sum() / num_boxes - - -# Copied from transformers.models.detr.modeling_detr.sigmoid_focal_loss -def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: float = 2): - """ - Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002. - - Args: - inputs (`torch.FloatTensor` of arbitrary shape): - The predictions for each example. - targets (`torch.FloatTensor` with the same shape as `inputs`) - A tensor storing the binary classification label for each element in the `inputs` (0 for the negative class - and 1 for the positive class). - alpha (`float`, *optional*, defaults to `0.25`): - Optional weighting factor in the range (0,1) to balance positive vs. negative examples. - gamma (`int`, *optional*, defaults to `2`): - Exponent of the modulating factor (1 - p_t) to balance easy vs hard examples. 
- - Returns: - Loss tensor - """ - prob = inputs.sigmoid() - ce_loss = nn.functional.binary_cross_entropy_with_logits(inputs, targets, reduction="none") - # add modulating factor - p_t = prob * targets + (1 - prob) * (1 - targets) - loss = ce_loss * ((1 - p_t) ** gamma) - - if alpha >= 0: - alpha_t = alpha * targets + (1 - alpha) * (1 - targets) - loss = alpha_t * loss - - return loss.mean(1).sum() / num_boxes - - -class ConditionalDetrLoss(nn.Module): - """ - This class computes the losses for ConditionalDetrForObjectDetection/ConditionalDetrForSegmentation. The process - happens in two steps: 1) we compute hungarian assignment between ground truth boxes and the outputs of the model 2) - we supervise each pair of matched ground-truth / prediction (supervise class and box). - - Args: - matcher (`ConditionalDetrHungarianMatcher`): - Module able to compute a matching between targets and proposals. - num_classes (`int`): - Number of object categories, omitting the special no-object category. - focal_alpha (`float`): - Alpha parameter in focal loss. - losses (`List[str]`): - List of all the losses to be applied. See `get_loss` for a list of all available losses. - """ - - # Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrLoss.__init__ - def __init__(self, matcher, num_classes, focal_alpha, losses): - super().__init__() - self.matcher = matcher - self.num_classes = num_classes - self.focal_alpha = focal_alpha - self.losses = losses - - # Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrLoss.loss_labels - def loss_labels(self, outputs, targets, indices, num_boxes): - """ - Classification loss (Binary focal loss) targets dicts must contain the key "class_labels" containing a tensor - of dim [nb_target_boxes] - """ - if "logits" not in outputs: - raise KeyError("No logits were found in the outputs") - source_logits = outputs["logits"] - - idx = self._get_source_permutation_idx(indices) - target_classes_o = torch.cat([t["class_labels"][J] for t, (_, J) in zip(targets, indices)]) - target_classes = torch.full( - source_logits.shape[:2], self.num_classes, dtype=torch.int64, device=source_logits.device - ) - target_classes[idx] = target_classes_o - - target_classes_onehot = torch.zeros( - [source_logits.shape[0], source_logits.shape[1], source_logits.shape[2] + 1], - dtype=source_logits.dtype, - layout=source_logits.layout, - device=source_logits.device, - ) - target_classes_onehot.scatter_(2, target_classes.unsqueeze(-1), 1) - - target_classes_onehot = target_classes_onehot[:, :, :-1] - loss_ce = ( - sigmoid_focal_loss(source_logits, target_classes_onehot, num_boxes, alpha=self.focal_alpha, gamma=2) - * source_logits.shape[1] - ) - losses = {"loss_ce": loss_ce} - - return losses - - @torch.no_grad() - # Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrLoss.loss_cardinality - def loss_cardinality(self, outputs, targets, indices, num_boxes): - """ - Compute the cardinality error, i.e. the absolute error in the number of predicted non-empty boxes. - - This is not really a loss, it is intended for logging purposes only. It doesn't propagate gradients. 
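# Toy, self-contained check of the focal-loss behaviour in the code removed above:
# with gamma > 0 the modulating factor (1 - p_t) ** gamma down-weights well-classified
# examples relative to plain (alpha-weighted) BCE. Shapes and values are illustrative.
import torch
import torch.nn.functional as F

def focal_loss(inputs, targets, num_boxes, alpha=0.25, gamma=2.0):
    prob = inputs.sigmoid()
    ce = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none")
    p_t = prob * targets + (1 - prob) * (1 - targets)
    loss = ce * ((1 - p_t) ** gamma)
    if alpha >= 0:
        loss = (alpha * targets + (1 - alpha) * (1 - targets)) * loss
    return loss.mean(1).sum() / num_boxes

logits = torch.tensor([[4.0, -4.0, 0.1]])   # confident positive, confident negative, uncertain
targets = torch.tensor([[1.0, 0.0, 1.0]])
print(focal_loss(logits, targets, num_boxes=1))             # small: easy examples are damped
print(focal_loss(logits, targets, num_boxes=1, gamma=0.0))  # larger: reduces to alpha-weighted BCE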
- """ - logits = outputs["logits"] - device = logits.device - target_lengths = torch.as_tensor([len(v["class_labels"]) for v in targets], device=device) - # Count the number of predictions that are NOT "no-object" (which is the last class) - card_pred = (logits.argmax(-1) != logits.shape[-1] - 1).sum(1) - card_err = nn.functional.l1_loss(card_pred.float(), target_lengths.float()) - losses = {"cardinality_error": card_err} - return losses - - # Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrLoss.loss_boxes - def loss_boxes(self, outputs, targets, indices, num_boxes): - """ - Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss. - - Targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4]. The target boxes - are expected in format (center_x, center_y, w, h), normalized by the image size. - """ - if "pred_boxes" not in outputs: - raise KeyError("No predicted boxes found in outputs") - idx = self._get_source_permutation_idx(indices) - source_boxes = outputs["pred_boxes"][idx] - target_boxes = torch.cat([t["boxes"][i] for t, (_, i) in zip(targets, indices)], dim=0) - - loss_bbox = nn.functional.l1_loss(source_boxes, target_boxes, reduction="none") - - losses = {} - losses["loss_bbox"] = loss_bbox.sum() / num_boxes - - loss_giou = 1 - torch.diag( - generalized_box_iou(center_to_corners_format(source_boxes), center_to_corners_format(target_boxes)) - ) - losses["loss_giou"] = loss_giou.sum() / num_boxes - return losses - - # Copied from transformers.models.detr.modeling_detr.DetrLoss.loss_masks - def loss_masks(self, outputs, targets, indices, num_boxes): - """ - Compute the losses related to the masks: the focal loss and the dice loss. - - Targets dicts must contain the key "masks" containing a tensor of dim [nb_target_boxes, h, w]. 
- """ - if "pred_masks" not in outputs: - raise KeyError("No predicted masks found in outputs") - - source_idx = self._get_source_permutation_idx(indices) - target_idx = self._get_target_permutation_idx(indices) - source_masks = outputs["pred_masks"] - source_masks = source_masks[source_idx] - masks = [t["masks"] for t in targets] - # TODO use valid to mask invalid areas due to padding in loss - target_masks, valid = nested_tensor_from_tensor_list(masks).decompose() - target_masks = target_masks.to(source_masks) - target_masks = target_masks[target_idx] - - # upsample predictions to the target size - source_masks = nn.functional.interpolate( - source_masks[:, None], size=target_masks.shape[-2:], mode="bilinear", align_corners=False - ) - source_masks = source_masks[:, 0].flatten(1) - - target_masks = target_masks.flatten(1) - target_masks = target_masks.view(source_masks.shape) - losses = { - "loss_mask": sigmoid_focal_loss(source_masks, target_masks, num_boxes), - "loss_dice": dice_loss(source_masks, target_masks, num_boxes), - } - return losses - - # Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrLoss._get_source_permutation_idx - def _get_source_permutation_idx(self, indices): - # permute predictions following indices - batch_idx = torch.cat([torch.full_like(source, i) for i, (source, _) in enumerate(indices)]) - source_idx = torch.cat([source for (source, _) in indices]) - return batch_idx, source_idx - - # Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrLoss._get_target_permutation_idx - def _get_target_permutation_idx(self, indices): - # permute targets following indices - batch_idx = torch.cat([torch.full_like(target, i) for i, (_, target) in enumerate(indices)]) - target_idx = torch.cat([target for (_, target) in indices]) - return batch_idx, target_idx - - # Copied from transformers.models.detr.modeling_detr.DetrLoss.get_loss - def get_loss(self, loss, outputs, targets, indices, num_boxes): - loss_map = { - "labels": self.loss_labels, - "cardinality": self.loss_cardinality, - "boxes": self.loss_boxes, - "masks": self.loss_masks, - } - if loss not in loss_map: - raise ValueError(f"Loss {loss} not supported") - return loss_map[loss](outputs, targets, indices, num_boxes) - - # Copied from transformers.models.detr.modeling_detr.DetrLoss.forward - def forward(self, outputs, targets): - """ - This performs the loss computation. - - Args: - outputs (`dict`, *optional*): - Dictionary of tensors, see the output specification of the model for the format. - targets (`List[dict]`, *optional*): - List of dicts, such that `len(targets) == batch_size`. The expected keys in each dict depends on the - losses applied, see each loss' doc. 
- """ - outputs_without_aux = {k: v for k, v in outputs.items() if k != "auxiliary_outputs"} - - # Retrieve the matching between the outputs of the last layer and the targets - indices = self.matcher(outputs_without_aux, targets) - - # Compute the average number of target boxes across all nodes, for normalization purposes - num_boxes = sum(len(t["class_labels"]) for t in targets) - num_boxes = torch.as_tensor([num_boxes], dtype=torch.float, device=next(iter(outputs.values())).device) - - world_size = 1 - if is_accelerate_available(): - if PartialState._shared_state != {}: - num_boxes = reduce(num_boxes) - world_size = PartialState().num_processes - num_boxes = torch.clamp(num_boxes / world_size, min=1).item() - - # Compute all the requested losses - losses = {} - for loss in self.losses: - losses.update(self.get_loss(loss, outputs, targets, indices, num_boxes)) - - # In case of auxiliary losses, we repeat this process with the output of each intermediate layer. - if "auxiliary_outputs" in outputs: - for i, auxiliary_outputs in enumerate(outputs["auxiliary_outputs"]): - indices = self.matcher(auxiliary_outputs, targets) - for loss in self.losses: - if loss == "masks": - # Intermediate masks losses are too costly to compute, we ignore them. - continue - l_dict = self.get_loss(loss, auxiliary_outputs, targets, indices, num_boxes) - l_dict = {k + f"_{i}": v for k, v in l_dict.items()} - losses.update(l_dict) - - return losses - - -# Copied from transformers.models.detr.modeling_detr.DetrMLPPredictionHead with Detr->ConditionalDetr -class ConditionalDetrMLPPredictionHead(nn.Module): - """ - Very simple multi-layer perceptron (MLP, also called FFN), used to predict the normalized center coordinates, - height and width of a bounding box w.r.t. an image. - - Copied from https://github.com/facebookresearch/detr/blob/master/models/detr.py - - """ - - def __init__(self, input_dim, hidden_dim, output_dim, num_layers): - super().__init__() - self.num_layers = num_layers - h = [hidden_dim] * (num_layers - 1) - self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) - - def forward(self, x): - for i, layer in enumerate(self.layers): - x = nn.functional.relu(layer(x)) if i < self.num_layers - 1 else layer(x) - return x - - -# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrHungarianMatcher with DeformableDetr->ConditionalDetr -class ConditionalDetrHungarianMatcher(nn.Module): - """ - This class computes an assignment between the targets and the predictions of the network. - - For efficiency reasons, the targets don't include the no_object. Because of this, in general, there are more - predictions than targets. In this case, we do a 1-to-1 matching of the best predictions, while the others are - un-matched (and thus treated as non-objects). - - Args: - class_cost: - The relative weight of the classification error in the matching cost. - bbox_cost: - The relative weight of the L1 error of the bounding box coordinates in the matching cost. - giou_cost: - The relative weight of the giou loss of the bounding box in the matching cost. 
- """ - - def __init__(self, class_cost: float = 1, bbox_cost: float = 1, giou_cost: float = 1): - super().__init__() - requires_backends(self, ["scipy"]) - - self.class_cost = class_cost - self.bbox_cost = bbox_cost - self.giou_cost = giou_cost - if class_cost == 0 and bbox_cost == 0 and giou_cost == 0: - raise ValueError("All costs of the Matcher can't be 0") - - @torch.no_grad() - def forward(self, outputs, targets): - """ - Args: - outputs (`dict`): - A dictionary that contains at least these entries: - * "logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits - * "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates. - targets (`List[dict]`): - A list of targets (len(targets) = batch_size), where each target is a dict containing: - * "class_labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of - ground-truth - objects in the target) containing the class labels - * "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates. - - Returns: - `List[Tuple]`: A list of size `batch_size`, containing tuples of (index_i, index_j) where: - - index_i is the indices of the selected predictions (in order) - - index_j is the indices of the corresponding selected targets (in order) - For each batch element, it holds: len(index_i) = len(index_j) = min(num_queries, num_target_boxes) - """ - batch_size, num_queries = outputs["logits"].shape[:2] - - # We flatten to compute the cost matrices in a batch - out_prob = outputs["logits"].flatten(0, 1).sigmoid() # [batch_size * num_queries, num_classes] - out_bbox = outputs["pred_boxes"].flatten(0, 1) # [batch_size * num_queries, 4] - - # Also concat the target labels and boxes - target_ids = torch.cat([v["class_labels"] for v in targets]) - target_bbox = torch.cat([v["boxes"] for v in targets]) - - # Compute the classification cost. - alpha = 0.25 - gamma = 2.0 - neg_cost_class = (1 - alpha) * (out_prob**gamma) * (-(1 - out_prob + 1e-8).log()) - pos_cost_class = alpha * ((1 - out_prob) ** gamma) * (-(out_prob + 1e-8).log()) - class_cost = pos_cost_class[:, target_ids] - neg_cost_class[:, target_ids] - - # Compute the L1 cost between boxes - bbox_cost = torch.cdist(out_bbox, target_bbox, p=1) - - # Compute the giou cost between boxes - giou_cost = -generalized_box_iou(center_to_corners_format(out_bbox), center_to_corners_format(target_bbox)) - - # Final cost matrix - cost_matrix = self.bbox_cost * bbox_cost + self.class_cost * class_cost + self.giou_cost * giou_cost - cost_matrix = cost_matrix.view(batch_size, num_queries, -1).cpu() - - sizes = [len(v["boxes"]) for v in targets] - indices = [linear_sum_assignment(c[i]) for i, c in enumerate(cost_matrix.split(sizes, -1))] - return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices] - - -# Copied from transformers.models.detr.modeling_detr._upcast -def _upcast(t: Tensor) -> Tensor: - # Protects from numerical overflows in multiplications by upcasting to the equivalent higher type - if t.is_floating_point(): - return t if t.dtype in (torch.float32, torch.float64) else t.float() - else: - return t if t.dtype in (torch.int32, torch.int64) else t.int() - - -# Copied from transformers.models.detr.modeling_detr.box_area -def box_area(boxes: Tensor) -> Tensor: - """ - Computes the area of a set of bounding boxes, which are specified by its (x1, y1, x2, y2) coordinates. 
- - Args: - boxes (`torch.FloatTensor` of shape `(number_of_boxes, 4)`): - Boxes for which the area will be computed. They are expected to be in (x1, y1, x2, y2) format with `0 <= x1 - < x2` and `0 <= y1 < y2`. - - Returns: - `torch.FloatTensor`: a tensor containing the area for each box. - """ - boxes = _upcast(boxes) - return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) - - -# Copied from transformers.models.detr.modeling_detr.box_iou -def box_iou(boxes1, boxes2): - area1 = box_area(boxes1) - area2 = box_area(boxes2) - - left_top = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2] - right_bottom = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2] - - width_height = (right_bottom - left_top).clamp(min=0) # [N,M,2] - inter = width_height[:, :, 0] * width_height[:, :, 1] # [N,M] - - union = area1[:, None] + area2 - inter - - iou = inter / union - return iou, union - - -# Copied from transformers.models.detr.modeling_detr.generalized_box_iou -def generalized_box_iou(boxes1, boxes2): - """ - Generalized IoU from https://giou.stanford.edu/. The boxes should be in [x0, y0, x1, y1] (corner) format. - - Returns: - `torch.FloatTensor`: a [N, M] pairwise matrix, where N = len(boxes1) and M = len(boxes2) - """ - # degenerate boxes gives inf / nan results - # so do an early check - if not (boxes1[:, 2:] >= boxes1[:, :2]).all(): - raise ValueError(f"boxes1 must be in [x0, y0, x1, y1] (corner) format, but got {boxes1}") - if not (boxes2[:, 2:] >= boxes2[:, :2]).all(): - raise ValueError(f"boxes2 must be in [x0, y0, x1, y1] (corner) format, but got {boxes2}") - iou, union = box_iou(boxes1, boxes2) - - top_left = torch.min(boxes1[:, None, :2], boxes2[:, :2]) - bottom_right = torch.max(boxes1[:, None, 2:], boxes2[:, 2:]) - - width_height = (bottom_right - top_left).clamp(min=0) # [N,M,2] - area = width_height[:, :, 0] * width_height[:, :, 1] - - return iou - (area - union) / area - - -# Copied from transformers.models.detr.modeling_detr._max_by_axis -def _max_by_axis(the_list): - # type: (List[List[int]]) -> List[int] - maxes = the_list[0] - for sublist in the_list[1:]: - for index, item in enumerate(sublist): - maxes[index] = max(maxes[index], item) - return maxes - - -# Copied from transformers.models.detr.modeling_detr.NestedTensor -class NestedTensor: - def __init__(self, tensors, mask: Optional[Tensor]): - self.tensors = tensors - self.mask = mask - - def to(self, device): - cast_tensor = self.tensors.to(device) - mask = self.mask - if mask is not None: - cast_mask = mask.to(device) - else: - cast_mask = None - return NestedTensor(cast_tensor, cast_mask) - - def decompose(self): - return self.tensors, self.mask - - def __repr__(self): - return str(self.tensors) - - -# Copied from transformers.models.detr.modeling_detr.nested_tensor_from_tensor_list -def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): - if tensor_list[0].ndim == 3: - max_size = _max_by_axis([list(img.shape) for img in tensor_list]) - batch_shape = [len(tensor_list)] + max_size - batch_size, num_channels, height, width = batch_shape - dtype = tensor_list[0].dtype - device = tensor_list[0].device - tensor = torch.zeros(batch_shape, dtype=dtype, device=device) - mask = torch.ones((batch_size, height, width), dtype=torch.bool, device=device) - for img, pad_img, m in zip(tensor_list, tensor, mask): - pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) - m[: img.shape[1], : img.shape[2]] = False - else: - raise ValueError("Only 3-dimensional tensors are supported") - return 
NestedTensor(tensor, mask) diff --git a/src/transformers/models/deformable_detr/modeling_deformable_detr.py b/src/transformers/models/deformable_detr/modeling_deformable_detr.py index f380c3c3b48139..f47221e55ad8ad 100755 --- a/src/transformers/models/deformable_detr/modeling_deformable_detr.py +++ b/src/transformers/models/deformable_detr/modeling_deformable_detr.py @@ -37,12 +37,9 @@ ModelOutput, add_start_docstrings, add_start_docstrings_to_model_forward, - is_accelerate_available, is_ninja_available, - is_scipy_available, is_timm_available, is_torch_cuda_available, - is_vision_available, logging, replace_return_docstrings, requires_backends, @@ -86,23 +83,10 @@ def load_cuda_kernels(): ) -if is_vision_available(): - from transformers.image_transforms import center_to_corners_format - - -if is_accelerate_available(): - from accelerate import PartialState - from accelerate.utils import reduce - - if is_timm_available(): from timm import create_model -if is_scipy_available(): - from scipy.optimize import linear_sum_assignment - - logger = logging.get_logger(__name__) _CONFIG_FOR_DOC = "DeformableDetrConfig" @@ -1869,6 +1853,28 @@ def forward( ) +# Copied from transformers.models.detr.modeling_detr.DetrMLPPredictionHead +class DeformableDetrMLPPredictionHead(nn.Module): + """ + Very simple multi-layer perceptron (MLP, also called FFN), used to predict the normalized center coordinates, + height and width of a bounding box w.r.t. an image. + + Copied from https://github.com/facebookresearch/detr/blob/master/models/detr.py + + """ + + def __init__(self, input_dim, hidden_dim, output_dim, num_layers): + super().__init__() + self.num_layers = num_layers + h = [hidden_dim] * (num_layers - 1) + self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) + + def forward(self, x): + for i, layer in enumerate(self.layers): + x = nn.functional.relu(layer(x)) if i < self.num_layers - 1 else layer(x) + return x + + @add_start_docstrings( """ Deformable DETR Model (consisting of a backbone and encoder-decoder Transformer) with object detection heads on @@ -1887,7 +1893,6 @@ def __init__(self, config: DeformableDetrConfig): # Deformable DETR encoder-decoder model self.model = DeformableDetrModel(config) - # Detection heads on top self.class_embed = nn.Linear(config.d_model, config.num_labels) self.bbox_embed = DeformableDetrMLPPredictionHead( @@ -1922,14 +1927,6 @@ def __init__(self, config: DeformableDetrConfig): # Initialize weights and apply final processing self.post_init() - # taken from https://github.com/facebookresearch/detr/blob/master/models/detr.py - @torch.jit.unused - def _set_aux_loss(self, outputs_class, outputs_coord): - # this is a workaround to make torchscript happy, as torchscript - # doesn't support dictionary with non-homogeneous values, such - # as a dict having both a Tensor and a list. 
- return [{"logits": a, "pred_boxes": b} for a, b in zip(outputs_class[:-1], outputs_coord[:-1])] - @add_start_docstrings_to_model_forward(DEFORMABLE_DETR_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=DeformableDetrObjectDetectionOutput, config_class=_CONFIG_FOR_DOC) def forward( @@ -2034,41 +2031,9 @@ def forward( loss, loss_dict, auxiliary_outputs = None, None, None if labels is not None: - # First: create the matcher - matcher = DeformableDetrHungarianMatcher( - class_cost=self.config.class_cost, bbox_cost=self.config.bbox_cost, giou_cost=self.config.giou_cost - ) - # Second: create the criterion - losses = ["labels", "boxes", "cardinality"] - criterion = DeformableDetrLoss( - matcher=matcher, - num_classes=self.config.num_labels, - focal_alpha=self.config.focal_alpha, - losses=losses, + loss, loss_dict, auxiliary_outputs = self.loss_function( + logits, labels, self.device, pred_boxes, self.config, outputs_class, outputs_coord ) - criterion.to(self.device) - # Third: compute the losses, based on outputs and labels - outputs_loss = {} - outputs_loss["logits"] = logits - outputs_loss["pred_boxes"] = pred_boxes - if self.config.auxiliary_loss: - auxiliary_outputs = self._set_aux_loss(outputs_class, outputs_coord) - outputs_loss["auxiliary_outputs"] = auxiliary_outputs - if self.config.two_stage: - enc_outputs_coord = outputs.enc_outputs_coord_logits.sigmoid() - outputs_loss["enc_outputs"] = {"logits": outputs.enc_outputs_class, "pred_boxes": enc_outputs_coord} - - loss_dict = criterion(outputs_loss, labels) - # Fourth: compute total loss, as a weighted sum of the various losses - weight_dict = {"loss_ce": 1, "loss_bbox": self.config.bbox_loss_coefficient} - weight_dict["loss_giou"] = self.config.giou_loss_coefficient - if self.config.auxiliary_loss: - aux_weight_dict = {} - for i in range(self.config.decoder_layers - 1): - aux_weight_dict.update({k + f"_{i}": v for k, v in weight_dict.items()}) - weight_dict.update(aux_weight_dict) - loss = sum(loss_dict[k] * weight_dict[k] for k in loss_dict.keys() if k in weight_dict) - if not return_dict: if auxiliary_outputs is not None: output = (logits, pred_boxes) + auxiliary_outputs + outputs @@ -2099,453 +2064,3 @@ def forward( ) return dict_outputs - - -# Copied from transformers.models.detr.modeling_detr.dice_loss -def dice_loss(inputs, targets, num_boxes): - """ - Compute the DICE loss, similar to generalized IOU for masks - - Args: - inputs: A float tensor of arbitrary shape. - The predictions for each example. - targets: A float tensor with the same shape as inputs. Stores the binary - classification label for each element in inputs (0 for the negative class and 1 for the positive - class). - """ - inputs = inputs.sigmoid() - inputs = inputs.flatten(1) - numerator = 2 * (inputs * targets).sum(1) - denominator = inputs.sum(-1) + targets.sum(-1) - loss = 1 - (numerator + 1) / (denominator + 1) - return loss.sum() / num_boxes - - -# Copied from transformers.models.detr.modeling_detr.sigmoid_focal_loss -def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: float = 2): - """ - Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002. - - Args: - inputs (`torch.FloatTensor` of arbitrary shape): - The predictions for each example. - targets (`torch.FloatTensor` with the same shape as `inputs`) - A tensor storing the binary classification label for each element in the `inputs` (0 for the negative class - and 1 for the positive class). 
- alpha (`float`, *optional*, defaults to `0.25`): - Optional weighting factor in the range (0,1) to balance positive vs. negative examples. - gamma (`int`, *optional*, defaults to `2`): - Exponent of the modulating factor (1 - p_t) to balance easy vs hard examples. - - Returns: - Loss tensor - """ - prob = inputs.sigmoid() - ce_loss = nn.functional.binary_cross_entropy_with_logits(inputs, targets, reduction="none") - # add modulating factor - p_t = prob * targets + (1 - prob) * (1 - targets) - loss = ce_loss * ((1 - p_t) ** gamma) - - if alpha >= 0: - alpha_t = alpha * targets + (1 - alpha) * (1 - targets) - loss = alpha_t * loss - - return loss.mean(1).sum() / num_boxes - - -class DeformableDetrLoss(nn.Module): - """ - This class computes the losses for `DeformableDetrForObjectDetection`. The process happens in two steps: 1) we - compute hungarian assignment between ground truth boxes and the outputs of the model 2) we supervise each pair of - matched ground-truth / prediction (supervise class and box). - - Args: - matcher (`DeformableDetrHungarianMatcher`): - Module able to compute a matching between targets and proposals. - num_classes (`int`): - Number of object categories, omitting the special no-object category. - focal_alpha (`float`): - Alpha parameter in focal loss. - losses (`List[str]`): - List of all the losses to be applied. See `get_loss` for a list of all available losses. - """ - - def __init__(self, matcher, num_classes, focal_alpha, losses): - super().__init__() - self.matcher = matcher - self.num_classes = num_classes - self.focal_alpha = focal_alpha - self.losses = losses - - # removed logging parameter, which was part of the original implementation - def loss_labels(self, outputs, targets, indices, num_boxes): - """ - Classification loss (Binary focal loss) targets dicts must contain the key "class_labels" containing a tensor - of dim [nb_target_boxes] - """ - if "logits" not in outputs: - raise KeyError("No logits were found in the outputs") - source_logits = outputs["logits"] - - idx = self._get_source_permutation_idx(indices) - target_classes_o = torch.cat([t["class_labels"][J] for t, (_, J) in zip(targets, indices)]) - target_classes = torch.full( - source_logits.shape[:2], self.num_classes, dtype=torch.int64, device=source_logits.device - ) - target_classes[idx] = target_classes_o - - target_classes_onehot = torch.zeros( - [source_logits.shape[0], source_logits.shape[1], source_logits.shape[2] + 1], - dtype=source_logits.dtype, - layout=source_logits.layout, - device=source_logits.device, - ) - target_classes_onehot.scatter_(2, target_classes.unsqueeze(-1), 1) - - target_classes_onehot = target_classes_onehot[:, :, :-1] - loss_ce = ( - sigmoid_focal_loss(source_logits, target_classes_onehot, num_boxes, alpha=self.focal_alpha, gamma=2) - * source_logits.shape[1] - ) - losses = {"loss_ce": loss_ce} - - return losses - - @torch.no_grad() - # Copied from transformers.models.detr.modeling_detr.DetrLoss.loss_cardinality - def loss_cardinality(self, outputs, targets, indices, num_boxes): - """ - Compute the cardinality error, i.e. the absolute error in the number of predicted non-empty boxes. - - This is not really a loss, it is intended for logging purposes only. It doesn't propagate gradients. 
- """ - logits = outputs["logits"] - device = logits.device - target_lengths = torch.as_tensor([len(v["class_labels"]) for v in targets], device=device) - # Count the number of predictions that are NOT "no-object" (which is the last class) - card_pred = (logits.argmax(-1) != logits.shape[-1] - 1).sum(1) - card_err = nn.functional.l1_loss(card_pred.float(), target_lengths.float()) - losses = {"cardinality_error": card_err} - return losses - - # Copied from transformers.models.detr.modeling_detr.DetrLoss.loss_boxes - def loss_boxes(self, outputs, targets, indices, num_boxes): - """ - Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss. - - Targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4]. The target boxes - are expected in format (center_x, center_y, w, h), normalized by the image size. - """ - if "pred_boxes" not in outputs: - raise KeyError("No predicted boxes found in outputs") - idx = self._get_source_permutation_idx(indices) - source_boxes = outputs["pred_boxes"][idx] - target_boxes = torch.cat([t["boxes"][i] for t, (_, i) in zip(targets, indices)], dim=0) - - loss_bbox = nn.functional.l1_loss(source_boxes, target_boxes, reduction="none") - - losses = {} - losses["loss_bbox"] = loss_bbox.sum() / num_boxes - - loss_giou = 1 - torch.diag( - generalized_box_iou(center_to_corners_format(source_boxes), center_to_corners_format(target_boxes)) - ) - losses["loss_giou"] = loss_giou.sum() / num_boxes - return losses - - # Copied from transformers.models.detr.modeling_detr.DetrLoss._get_source_permutation_idx - def _get_source_permutation_idx(self, indices): - # permute predictions following indices - batch_idx = torch.cat([torch.full_like(source, i) for i, (source, _) in enumerate(indices)]) - source_idx = torch.cat([source for (source, _) in indices]) - return batch_idx, source_idx - - # Copied from transformers.models.detr.modeling_detr.DetrLoss._get_target_permutation_idx - def _get_target_permutation_idx(self, indices): - # permute targets following indices - batch_idx = torch.cat([torch.full_like(target, i) for i, (_, target) in enumerate(indices)]) - target_idx = torch.cat([target for (_, target) in indices]) - return batch_idx, target_idx - - def get_loss(self, loss, outputs, targets, indices, num_boxes): - loss_map = { - "labels": self.loss_labels, - "cardinality": self.loss_cardinality, - "boxes": self.loss_boxes, - } - if loss not in loss_map: - raise ValueError(f"Loss {loss} not supported") - return loss_map[loss](outputs, targets, indices, num_boxes) - - def forward(self, outputs, targets): - """ - This performs the loss computation. - - Args: - outputs (`dict`, *optional*): - Dictionary of tensors, see the output specification of the model for the format. - targets (`List[dict]`, *optional*): - List of dicts, such that `len(targets) == batch_size`. The expected keys in each dict depends on the - losses applied, see each loss' doc. 
- """ - outputs_without_aux = {k: v for k, v in outputs.items() if k != "auxiliary_outputs" and k != "enc_outputs"} - - # Retrieve the matching between the outputs of the last layer and the targets - indices = self.matcher(outputs_without_aux, targets) - - # Compute the average number of target boxes accross all nodes, for normalization purposes - num_boxes = sum(len(t["class_labels"]) for t in targets) - num_boxes = torch.as_tensor([num_boxes], dtype=torch.float, device=next(iter(outputs.values())).device) - world_size = 1 - if is_accelerate_available(): - if PartialState._shared_state != {}: - num_boxes = reduce(num_boxes) - world_size = PartialState().num_processes - num_boxes = torch.clamp(num_boxes / world_size, min=1).item() - - # Compute all the requested losses - losses = {} - for loss in self.losses: - losses.update(self.get_loss(loss, outputs, targets, indices, num_boxes)) - - # In case of auxiliary losses, we repeat this process with the output of each intermediate layer. - if "auxiliary_outputs" in outputs: - for i, auxiliary_outputs in enumerate(outputs["auxiliary_outputs"]): - indices = self.matcher(auxiliary_outputs, targets) - for loss in self.losses: - l_dict = self.get_loss(loss, auxiliary_outputs, targets, indices, num_boxes) - l_dict = {k + f"_{i}": v for k, v in l_dict.items()} - losses.update(l_dict) - - if "enc_outputs" in outputs: - enc_outputs = outputs["enc_outputs"] - bin_targets = copy.deepcopy(targets) - for bt in bin_targets: - bt["class_labels"] = torch.zeros_like(bt["class_labels"]) - indices = self.matcher(enc_outputs, bin_targets) - for loss in self.losses: - l_dict = self.get_loss(loss, enc_outputs, bin_targets, indices, num_boxes) - l_dict = {k + "_enc": v for k, v in l_dict.items()} - losses.update(l_dict) - - return losses - - -# Copied from transformers.models.detr.modeling_detr.DetrMLPPredictionHead -class DeformableDetrMLPPredictionHead(nn.Module): - """ - Very simple multi-layer perceptron (MLP, also called FFN), used to predict the normalized center coordinates, - height and width of a bounding box w.r.t. an image. - - Copied from https://github.com/facebookresearch/detr/blob/master/models/detr.py - - """ - - def __init__(self, input_dim, hidden_dim, output_dim, num_layers): - super().__init__() - self.num_layers = num_layers - h = [hidden_dim] * (num_layers - 1) - self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) - - def forward(self, x): - for i, layer in enumerate(self.layers): - x = nn.functional.relu(layer(x)) if i < self.num_layers - 1 else layer(x) - return x - - -class DeformableDetrHungarianMatcher(nn.Module): - """ - This class computes an assignment between the targets and the predictions of the network. - - For efficiency reasons, the targets don't include the no_object. Because of this, in general, there are more - predictions than targets. In this case, we do a 1-to-1 matching of the best predictions, while the others are - un-matched (and thus treated as non-objects). - - Args: - class_cost: - The relative weight of the classification error in the matching cost. - bbox_cost: - The relative weight of the L1 error of the bounding box coordinates in the matching cost. - giou_cost: - The relative weight of the giou loss of the bounding box in the matching cost. 
- """ - - def __init__(self, class_cost: float = 1, bbox_cost: float = 1, giou_cost: float = 1): - super().__init__() - requires_backends(self, ["scipy"]) - - self.class_cost = class_cost - self.bbox_cost = bbox_cost - self.giou_cost = giou_cost - if class_cost == 0 and bbox_cost == 0 and giou_cost == 0: - raise ValueError("All costs of the Matcher can't be 0") - - @torch.no_grad() - def forward(self, outputs, targets): - """ - Args: - outputs (`dict`): - A dictionary that contains at least these entries: - * "logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits - * "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates. - targets (`List[dict]`): - A list of targets (len(targets) = batch_size), where each target is a dict containing: - * "class_labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of - ground-truth - objects in the target) containing the class labels - * "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates. - - Returns: - `List[Tuple]`: A list of size `batch_size`, containing tuples of (index_i, index_j) where: - - index_i is the indices of the selected predictions (in order) - - index_j is the indices of the corresponding selected targets (in order) - For each batch element, it holds: len(index_i) = len(index_j) = min(num_queries, num_target_boxes) - """ - batch_size, num_queries = outputs["logits"].shape[:2] - - # We flatten to compute the cost matrices in a batch - out_prob = outputs["logits"].flatten(0, 1).sigmoid() # [batch_size * num_queries, num_classes] - out_bbox = outputs["pred_boxes"].flatten(0, 1) # [batch_size * num_queries, 4] - - # Also concat the target labels and boxes - target_ids = torch.cat([v["class_labels"] for v in targets]) - target_bbox = torch.cat([v["boxes"] for v in targets]) - - # Compute the classification cost. - alpha = 0.25 - gamma = 2.0 - neg_cost_class = (1 - alpha) * (out_prob**gamma) * (-(1 - out_prob + 1e-8).log()) - pos_cost_class = alpha * ((1 - out_prob) ** gamma) * (-(out_prob + 1e-8).log()) - class_cost = pos_cost_class[:, target_ids] - neg_cost_class[:, target_ids] - - # Compute the L1 cost between boxes - bbox_cost = torch.cdist(out_bbox, target_bbox, p=1) - - # Compute the giou cost between boxes - giou_cost = -generalized_box_iou(center_to_corners_format(out_bbox), center_to_corners_format(target_bbox)) - - # Final cost matrix - cost_matrix = self.bbox_cost * bbox_cost + self.class_cost * class_cost + self.giou_cost * giou_cost - cost_matrix = cost_matrix.view(batch_size, num_queries, -1).cpu() - - sizes = [len(v["boxes"]) for v in targets] - indices = [linear_sum_assignment(c[i]) for i, c in enumerate(cost_matrix.split(sizes, -1))] - return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices] - - -# Copied from transformers.models.detr.modeling_detr._upcast -def _upcast(t: Tensor) -> Tensor: - # Protects from numerical overflows in multiplications by upcasting to the equivalent higher type - if t.is_floating_point(): - return t if t.dtype in (torch.float32, torch.float64) else t.float() - else: - return t if t.dtype in (torch.int32, torch.int64) else t.int() - - -# Copied from transformers.models.detr.modeling_detr.box_area -def box_area(boxes: Tensor) -> Tensor: - """ - Computes the area of a set of bounding boxes, which are specified by its (x1, y1, x2, y2) coordinates. 
- - Args: - boxes (`torch.FloatTensor` of shape `(number_of_boxes, 4)`): - Boxes for which the area will be computed. They are expected to be in (x1, y1, x2, y2) format with `0 <= x1 - < x2` and `0 <= y1 < y2`. - - Returns: - `torch.FloatTensor`: a tensor containing the area for each box. - """ - boxes = _upcast(boxes) - return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) - - -# Copied from transformers.models.detr.modeling_detr.box_iou -def box_iou(boxes1, boxes2): - area1 = box_area(boxes1) - area2 = box_area(boxes2) - - left_top = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2] - right_bottom = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2] - - width_height = (right_bottom - left_top).clamp(min=0) # [N,M,2] - inter = width_height[:, :, 0] * width_height[:, :, 1] # [N,M] - - union = area1[:, None] + area2 - inter - - iou = inter / union - return iou, union - - -# Copied from transformers.models.detr.modeling_detr.generalized_box_iou -def generalized_box_iou(boxes1, boxes2): - """ - Generalized IoU from https://giou.stanford.edu/. The boxes should be in [x0, y0, x1, y1] (corner) format. - - Returns: - `torch.FloatTensor`: a [N, M] pairwise matrix, where N = len(boxes1) and M = len(boxes2) - """ - # degenerate boxes gives inf / nan results - # so do an early check - if not (boxes1[:, 2:] >= boxes1[:, :2]).all(): - raise ValueError(f"boxes1 must be in [x0, y0, x1, y1] (corner) format, but got {boxes1}") - if not (boxes2[:, 2:] >= boxes2[:, :2]).all(): - raise ValueError(f"boxes2 must be in [x0, y0, x1, y1] (corner) format, but got {boxes2}") - iou, union = box_iou(boxes1, boxes2) - - top_left = torch.min(boxes1[:, None, :2], boxes2[:, :2]) - bottom_right = torch.max(boxes1[:, None, 2:], boxes2[:, 2:]) - - width_height = (bottom_right - top_left).clamp(min=0) # [N,M,2] - area = width_height[:, :, 0] * width_height[:, :, 1] - - return iou - (area - union) / area - - -# Copied from transformers.models.detr.modeling_detr._max_by_axis -def _max_by_axis(the_list): - # type: (List[List[int]]) -> List[int] - maxes = the_list[0] - for sublist in the_list[1:]: - for index, item in enumerate(sublist): - maxes[index] = max(maxes[index], item) - return maxes - - -# Copied from transformers.models.detr.modeling_detr.NestedTensor -class NestedTensor: - def __init__(self, tensors, mask: Optional[Tensor]): - self.tensors = tensors - self.mask = mask - - def to(self, device): - cast_tensor = self.tensors.to(device) - mask = self.mask - if mask is not None: - cast_mask = mask.to(device) - else: - cast_mask = None - return NestedTensor(cast_tensor, cast_mask) - - def decompose(self): - return self.tensors, self.mask - - def __repr__(self): - return str(self.tensors) - - -# Copied from transformers.models.detr.modeling_detr.nested_tensor_from_tensor_list -def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): - if tensor_list[0].ndim == 3: - max_size = _max_by_axis([list(img.shape) for img in tensor_list]) - batch_shape = [len(tensor_list)] + max_size - batch_size, num_channels, height, width = batch_shape - dtype = tensor_list[0].dtype - device = tensor_list[0].device - tensor = torch.zeros(batch_shape, dtype=dtype, device=device) - mask = torch.ones((batch_size, height, width), dtype=torch.bool, device=device) - for img, pad_img, m in zip(tensor_list, tensor, mask): - pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) - m[: img.shape[1], : img.shape[2]] = False - else: - raise ValueError("Only 3-dimensional tensors are supported") - return 
NestedTensor(tensor, mask) diff --git a/src/transformers/models/detr/modeling_detr.py b/src/transformers/models/detr/modeling_detr.py index c3c1c033e556bf..f51362d94e1722 100644 --- a/src/transformers/models/detr/modeling_detr.py +++ b/src/transformers/models/detr/modeling_detr.py @@ -29,10 +29,7 @@ ModelOutput, add_start_docstrings, add_start_docstrings_to_model_forward, - is_accelerate_available, - is_scipy_available, is_timm_available, - is_vision_available, logging, replace_return_docstrings, requires_backends, @@ -41,21 +38,10 @@ from .configuration_detr import DetrConfig -if is_accelerate_available(): - from accelerate import PartialState - from accelerate.utils import reduce - -if is_scipy_available(): - from scipy.optimize import linear_sum_assignment - - if is_timm_available(): from timm import create_model -if is_vision_available(): - from transformers.image_transforms import center_to_corners_format - logger = logging.get_logger(__name__) _CONFIG_FOR_DOC = "DetrConfig" @@ -1343,6 +1329,28 @@ def forward( ) +# taken from https://github.com/facebookresearch/detr/blob/master/models/detr.py +class DetrMLPPredictionHead(nn.Module): + """ + Very simple multi-layer perceptron (MLP, also called FFN), used to predict the normalized center coordinates, + height and width of a bounding box w.r.t. an image. + + Copied from https://github.com/facebookresearch/detr/blob/master/models/detr.py + + """ + + def __init__(self, input_dim, hidden_dim, output_dim, num_layers): + super().__init__() + self.num_layers = num_layers + h = [hidden_dim] * (num_layers - 1) + self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) + + def forward(self, x): + for i, layer in enumerate(self.layers): + x = nn.functional.relu(layer(x)) if i < self.num_layers - 1 else layer(x) + return x + + @add_start_docstrings( """ DETR Model (consisting of a backbone and encoder-decoder Transformer) with object detection heads on top, for tasks @@ -1368,14 +1376,6 @@ def __init__(self, config: DetrConfig): # Initialize weights and apply final processing self.post_init() - # taken from https://github.com/facebookresearch/detr/blob/master/models/detr.py - @torch.jit.unused - def _set_aux_loss(self, outputs_class, outputs_coord): - # this is a workaround to make torchscript happy, as torchscript - # doesn't support dictionary with non-homogeneous values, such - # as a dict having both a Tensor and a list. 
- return [{"logits": a, "pred_boxes": b} for a, b in zip(outputs_class[:-1], outputs_coord[:-1])] - @add_start_docstrings_to_model_forward(DETR_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=DetrObjectDetectionOutput, config_class=_CONFIG_FOR_DOC) def forward( @@ -1458,40 +1458,14 @@ def forward( loss, loss_dict, auxiliary_outputs = None, None, None if labels is not None: - # First: create the matcher - matcher = DetrHungarianMatcher( - class_cost=self.config.class_cost, bbox_cost=self.config.bbox_cost, giou_cost=self.config.giou_cost - ) - # Second: create the criterion - losses = ["labels", "boxes", "cardinality"] - criterion = DetrLoss( - matcher=matcher, - num_classes=self.config.num_labels, - eos_coef=self.config.eos_coefficient, - losses=losses, - ) - criterion.to(self.device) - # Third: compute the losses, based on outputs and labels - outputs_loss = {} - outputs_loss["logits"] = logits - outputs_loss["pred_boxes"] = pred_boxes + outputs_class, outputs_coord = None, None if self.config.auxiliary_loss: intermediate = outputs.intermediate_hidden_states if return_dict else outputs[4] outputs_class = self.class_labels_classifier(intermediate) outputs_coord = self.bbox_predictor(intermediate).sigmoid() - auxiliary_outputs = self._set_aux_loss(outputs_class, outputs_coord) - outputs_loss["auxiliary_outputs"] = auxiliary_outputs - - loss_dict = criterion(outputs_loss, labels) - # Fourth: compute total loss, as a weighted sum of the various losses - weight_dict = {"loss_ce": 1, "loss_bbox": self.config.bbox_loss_coefficient} - weight_dict["loss_giou"] = self.config.giou_loss_coefficient - if self.config.auxiliary_loss: - aux_weight_dict = {} - for i in range(self.config.decoder_layers - 1): - aux_weight_dict.update({k + f"_{i}": v for k, v in weight_dict.items()}) - weight_dict.update(aux_weight_dict) - loss = sum(loss_dict[k] * weight_dict[k] for k in loss_dict.keys() if k in weight_dict) + loss, loss_dict, auxiliary_outputs = self.loss_function( + logits, labels, self.device, pred_boxes, self.config, outputs_class, outputs_coord + ) if not return_dict: if auxiliary_outputs is not None: @@ -1542,7 +1516,6 @@ def __init__(self, config: DetrConfig): self.bbox_attention = DetrMHAttentionMap( hidden_size, hidden_size, number_of_heads, dropout=0.0, std=config.init_xavier_std ) - # Initialize weights and apply final processing self.post_init() @@ -1688,43 +1661,14 @@ def forward( loss, loss_dict, auxiliary_outputs = None, None, None if labels is not None: - # First: create the matcher - matcher = DetrHungarianMatcher( - class_cost=self.config.class_cost, bbox_cost=self.config.bbox_cost, giou_cost=self.config.giou_cost - ) - # Second: create the criterion - losses = ["labels", "boxes", "cardinality", "masks"] - criterion = DetrLoss( - matcher=matcher, - num_classes=self.config.num_labels, - eos_coef=self.config.eos_coefficient, - losses=losses, - ) - criterion.to(self.device) - # Third: compute the losses, based on outputs and labels - outputs_loss = {} - outputs_loss["logits"] = logits - outputs_loss["pred_boxes"] = pred_boxes - outputs_loss["pred_masks"] = pred_masks + outputs_class, outputs_coord = None, None if self.config.auxiliary_loss: intermediate = decoder_outputs.intermediate_hidden_states if return_dict else decoder_outputs[-1] outputs_class = self.detr.class_labels_classifier(intermediate) outputs_coord = self.detr.bbox_predictor(intermediate).sigmoid() - auxiliary_outputs = self.detr._set_aux_loss(outputs_class, outputs_coord) - outputs_loss["auxiliary_outputs"] = 
auxiliary_outputs - - loss_dict = criterion(outputs_loss, labels) - # Fourth: compute total loss, as a weighted sum of the various losses - weight_dict = {"loss_ce": 1, "loss_bbox": self.config.bbox_loss_coefficient} - weight_dict["loss_giou"] = self.config.giou_loss_coefficient - weight_dict["loss_mask"] = self.config.mask_loss_coefficient - weight_dict["loss_dice"] = self.config.dice_loss_coefficient - if self.config.auxiliary_loss: - aux_weight_dict = {} - for i in range(self.config.decoder_layers - 1): - aux_weight_dict.update({k + f"_{i}": v for k, v in weight_dict.items()}) - weight_dict.update(aux_weight_dict) - loss = sum(loss_dict[k] * weight_dict[k] for k in loss_dict.keys() if k in weight_dict) + loss, loss_dict, auxiliary_outputs = self.loss_function( + logits, labels, device, pred_boxes, pred_masks, self.config, outputs_class, outputs_coord + ) if not return_dict: if auxiliary_outputs is not None: @@ -1861,470 +1805,3 @@ def forward(self, q, k, mask: Optional[Tensor] = None): weights = nn.functional.softmax(weights.flatten(2), dim=-1).view(weights.size()) weights = self.dropout(weights) return weights - - -def dice_loss(inputs, targets, num_boxes): - """ - Compute the DICE loss, similar to generalized IOU for masks - - Args: - inputs: A float tensor of arbitrary shape. - The predictions for each example. - targets: A float tensor with the same shape as inputs. Stores the binary - classification label for each element in inputs (0 for the negative class and 1 for the positive - class). - """ - inputs = inputs.sigmoid() - inputs = inputs.flatten(1) - numerator = 2 * (inputs * targets).sum(1) - denominator = inputs.sum(-1) + targets.sum(-1) - loss = 1 - (numerator + 1) / (denominator + 1) - return loss.sum() / num_boxes - - -def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: float = 2): - """ - Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002. - - Args: - inputs (`torch.FloatTensor` of arbitrary shape): - The predictions for each example. - targets (`torch.FloatTensor` with the same shape as `inputs`) - A tensor storing the binary classification label for each element in the `inputs` (0 for the negative class - and 1 for the positive class). - alpha (`float`, *optional*, defaults to `0.25`): - Optional weighting factor in the range (0,1) to balance positive vs. negative examples. - gamma (`int`, *optional*, defaults to `2`): - Exponent of the modulating factor (1 - p_t) to balance easy vs hard examples. - - Returns: - Loss tensor - """ - prob = inputs.sigmoid() - ce_loss = nn.functional.binary_cross_entropy_with_logits(inputs, targets, reduction="none") - # add modulating factor - p_t = prob * targets + (1 - prob) * (1 - targets) - loss = ce_loss * ((1 - p_t) ** gamma) - - if alpha >= 0: - alpha_t = alpha * targets + (1 - alpha) * (1 - targets) - loss = alpha_t * loss - - return loss.mean(1).sum() / num_boxes - - -# taken from https://github.com/facebookresearch/detr/blob/master/models/detr.py -class DetrLoss(nn.Module): - """ - This class computes the losses for DetrForObjectDetection/DetrForSegmentation. The process happens in two steps: 1) - we compute hungarian assignment between ground truth boxes and the outputs of the model 2) we supervise each pair - of matched ground-truth / prediction (supervise class and box). - - A note on the `num_classes` argument (copied from original repo in detr.py): "the naming of the `num_classes` - parameter of the criterion is somewhat misleading. 
It indeed corresponds to `max_obj_id` + 1, where `max_obj_id` is - the maximum id for a class in your dataset. For example, COCO has a `max_obj_id` of 90, so we pass `num_classes` to - be 91. As another example, for a dataset that has a single class with `id` 1, you should pass `num_classes` to be 2 - (`max_obj_id` + 1). For more details on this, check the following discussion - https://github.com/facebookresearch/detr/issues/108#issuecomment-650269223" - - - Args: - matcher (`DetrHungarianMatcher`): - Module able to compute a matching between targets and proposals. - num_classes (`int`): - Number of object categories, omitting the special no-object category. - eos_coef (`float`): - Relative classification weight applied to the no-object category. - losses (`List[str]`): - List of all the losses to be applied. See `get_loss` for a list of all available losses. - """ - - def __init__(self, matcher, num_classes, eos_coef, losses): - super().__init__() - self.matcher = matcher - self.num_classes = num_classes - self.eos_coef = eos_coef - self.losses = losses - empty_weight = torch.ones(self.num_classes + 1) - empty_weight[-1] = self.eos_coef - self.register_buffer("empty_weight", empty_weight) - - # removed logging parameter, which was part of the original implementation - def loss_labels(self, outputs, targets, indices, num_boxes): - """ - Classification loss (NLL) targets dicts must contain the key "class_labels" containing a tensor of dim - [nb_target_boxes] - """ - if "logits" not in outputs: - raise KeyError("No logits were found in the outputs") - source_logits = outputs["logits"] - - idx = self._get_source_permutation_idx(indices) - target_classes_o = torch.cat([t["class_labels"][J] for t, (_, J) in zip(targets, indices)]) - target_classes = torch.full( - source_logits.shape[:2], self.num_classes, dtype=torch.int64, device=source_logits.device - ) - target_classes[idx] = target_classes_o - - loss_ce = nn.functional.cross_entropy(source_logits.transpose(1, 2), target_classes, self.empty_weight) - losses = {"loss_ce": loss_ce} - - return losses - - @torch.no_grad() - def loss_cardinality(self, outputs, targets, indices, num_boxes): - """ - Compute the cardinality error, i.e. the absolute error in the number of predicted non-empty boxes. - - This is not really a loss, it is intended for logging purposes only. It doesn't propagate gradients. - """ - logits = outputs["logits"] - device = logits.device - target_lengths = torch.as_tensor([len(v["class_labels"]) for v in targets], device=device) - # Count the number of predictions that are NOT "no-object" (which is the last class) - card_pred = (logits.argmax(-1) != logits.shape[-1] - 1).sum(1) - card_err = nn.functional.l1_loss(card_pred.float(), target_lengths.float()) - losses = {"cardinality_error": card_err} - return losses - - def loss_boxes(self, outputs, targets, indices, num_boxes): - """ - Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss. - - Targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4]. The target boxes - are expected in format (center_x, center_y, w, h), normalized by the image size. 
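# Sketch of the class weighting in the DetrLoss code removed here: plain DETR uses a
# cross-entropy loss where the extra "no-object" class is down-weighted by eos_coef,
# so the many unmatched queries do not dominate training (toy sizes below).
import torch

num_classes, eos_coef = 3, 0.1
empty_weight = torch.ones(num_classes + 1)
empty_weight[-1] = eos_coef                       # last index is the no-object class

logits = torch.randn(2, 5, num_classes + 1)       # (batch, num_queries, num_classes + 1)
target_classes = torch.full((2, 5), num_classes)  # default every query to no-object
target_classes[0, 0] = 1                          # one matched query gets its real label
loss_ce = torch.nn.functional.cross_entropy(logits.transpose(1, 2), target_classes, empty_weight)
print(loss_ce)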
- """ - if "pred_boxes" not in outputs: - raise KeyError("No predicted boxes found in outputs") - idx = self._get_source_permutation_idx(indices) - source_boxes = outputs["pred_boxes"][idx] - target_boxes = torch.cat([t["boxes"][i] for t, (_, i) in zip(targets, indices)], dim=0) - - loss_bbox = nn.functional.l1_loss(source_boxes, target_boxes, reduction="none") - - losses = {} - losses["loss_bbox"] = loss_bbox.sum() / num_boxes - - loss_giou = 1 - torch.diag( - generalized_box_iou(center_to_corners_format(source_boxes), center_to_corners_format(target_boxes)) - ) - losses["loss_giou"] = loss_giou.sum() / num_boxes - return losses - - def loss_masks(self, outputs, targets, indices, num_boxes): - """ - Compute the losses related to the masks: the focal loss and the dice loss. - - Targets dicts must contain the key "masks" containing a tensor of dim [nb_target_boxes, h, w]. - """ - if "pred_masks" not in outputs: - raise KeyError("No predicted masks found in outputs") - - source_idx = self._get_source_permutation_idx(indices) - target_idx = self._get_target_permutation_idx(indices) - source_masks = outputs["pred_masks"] - source_masks = source_masks[source_idx] - masks = [t["masks"] for t in targets] - # TODO use valid to mask invalid areas due to padding in loss - target_masks, valid = nested_tensor_from_tensor_list(masks).decompose() - target_masks = target_masks.to(source_masks) - target_masks = target_masks[target_idx] - - # upsample predictions to the target size - source_masks = nn.functional.interpolate( - source_masks[:, None], size=target_masks.shape[-2:], mode="bilinear", align_corners=False - ) - source_masks = source_masks[:, 0].flatten(1) - - target_masks = target_masks.flatten(1) - target_masks = target_masks.view(source_masks.shape) - losses = { - "loss_mask": sigmoid_focal_loss(source_masks, target_masks, num_boxes), - "loss_dice": dice_loss(source_masks, target_masks, num_boxes), - } - return losses - - def _get_source_permutation_idx(self, indices): - # permute predictions following indices - batch_idx = torch.cat([torch.full_like(source, i) for i, (source, _) in enumerate(indices)]) - source_idx = torch.cat([source for (source, _) in indices]) - return batch_idx, source_idx - - def _get_target_permutation_idx(self, indices): - # permute targets following indices - batch_idx = torch.cat([torch.full_like(target, i) for i, (_, target) in enumerate(indices)]) - target_idx = torch.cat([target for (_, target) in indices]) - return batch_idx, target_idx - - def get_loss(self, loss, outputs, targets, indices, num_boxes): - loss_map = { - "labels": self.loss_labels, - "cardinality": self.loss_cardinality, - "boxes": self.loss_boxes, - "masks": self.loss_masks, - } - if loss not in loss_map: - raise ValueError(f"Loss {loss} not supported") - return loss_map[loss](outputs, targets, indices, num_boxes) - - def forward(self, outputs, targets): - """ - This performs the loss computation. - - Args: - outputs (`dict`, *optional*): - Dictionary of tensors, see the output specification of the model for the format. - targets (`List[dict]`, *optional*): - List of dicts, such that `len(targets) == batch_size`. The expected keys in each dict depends on the - losses applied, see each loss' doc. 
- """ - outputs_without_aux = {k: v for k, v in outputs.items() if k != "auxiliary_outputs"} - - # Retrieve the matching between the outputs of the last layer and the targets - indices = self.matcher(outputs_without_aux, targets) - - # Compute the average number of target boxes across all nodes, for normalization purposes - num_boxes = sum(len(t["class_labels"]) for t in targets) - num_boxes = torch.as_tensor([num_boxes], dtype=torch.float, device=next(iter(outputs.values())).device) - world_size = 1 - if is_accelerate_available(): - if PartialState._shared_state != {}: - num_boxes = reduce(num_boxes) - world_size = PartialState().num_processes - num_boxes = torch.clamp(num_boxes / world_size, min=1).item() - - # Compute all the requested losses - losses = {} - for loss in self.losses: - losses.update(self.get_loss(loss, outputs, targets, indices, num_boxes)) - - # In case of auxiliary losses, we repeat this process with the output of each intermediate layer. - if "auxiliary_outputs" in outputs: - for i, auxiliary_outputs in enumerate(outputs["auxiliary_outputs"]): - indices = self.matcher(auxiliary_outputs, targets) - for loss in self.losses: - if loss == "masks": - # Intermediate masks losses are too costly to compute, we ignore them. - continue - l_dict = self.get_loss(loss, auxiliary_outputs, targets, indices, num_boxes) - l_dict = {k + f"_{i}": v for k, v in l_dict.items()} - losses.update(l_dict) - - return losses - - -# taken from https://github.com/facebookresearch/detr/blob/master/models/detr.py -class DetrMLPPredictionHead(nn.Module): - """ - Very simple multi-layer perceptron (MLP, also called FFN), used to predict the normalized center coordinates, - height and width of a bounding box w.r.t. an image. - - Copied from https://github.com/facebookresearch/detr/blob/master/models/detr.py - - """ - - def __init__(self, input_dim, hidden_dim, output_dim, num_layers): - super().__init__() - self.num_layers = num_layers - h = [hidden_dim] * (num_layers - 1) - self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) - - def forward(self, x): - for i, layer in enumerate(self.layers): - x = nn.functional.relu(layer(x)) if i < self.num_layers - 1 else layer(x) - return x - - -# taken from https://github.com/facebookresearch/detr/blob/master/models/matcher.py -class DetrHungarianMatcher(nn.Module): - """ - This class computes an assignment between the targets and the predictions of the network. - - For efficiency reasons, the targets don't include the no_object. Because of this, in general, there are more - predictions than targets. In this case, we do a 1-to-1 matching of the best predictions, while the others are - un-matched (and thus treated as non-objects). - - Args: - class_cost: - The relative weight of the classification error in the matching cost. - bbox_cost: - The relative weight of the L1 error of the bounding box coordinates in the matching cost. - giou_cost: - The relative weight of the giou loss of the bounding box in the matching cost. 
- """ - - def __init__(self, class_cost: float = 1, bbox_cost: float = 1, giou_cost: float = 1): - super().__init__() - requires_backends(self, ["scipy"]) - - self.class_cost = class_cost - self.bbox_cost = bbox_cost - self.giou_cost = giou_cost - if class_cost == 0 and bbox_cost == 0 and giou_cost == 0: - raise ValueError("All costs of the Matcher can't be 0") - - @torch.no_grad() - def forward(self, outputs, targets): - """ - Args: - outputs (`dict`): - A dictionary that contains at least these entries: - * "logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits - * "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates. - targets (`List[dict]`): - A list of targets (len(targets) = batch_size), where each target is a dict containing: - * "class_labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of - ground-truth - objects in the target) containing the class labels - * "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates. - - Returns: - `List[Tuple]`: A list of size `batch_size`, containing tuples of (index_i, index_j) where: - - index_i is the indices of the selected predictions (in order) - - index_j is the indices of the corresponding selected targets (in order) - For each batch element, it holds: len(index_i) = len(index_j) = min(num_queries, num_target_boxes) - """ - batch_size, num_queries = outputs["logits"].shape[:2] - - # We flatten to compute the cost matrices in a batch - out_prob = outputs["logits"].flatten(0, 1).softmax(-1) # [batch_size * num_queries, num_classes] - out_bbox = outputs["pred_boxes"].flatten(0, 1) # [batch_size * num_queries, 4] - - # Also concat the target labels and boxes - target_ids = torch.cat([v["class_labels"] for v in targets]) - target_bbox = torch.cat([v["boxes"] for v in targets]) - - # Compute the classification cost. Contrary to the loss, we don't use the NLL, - # but approximate it in 1 - proba[target class]. - # The 1 is a constant that doesn't change the matching, it can be ommitted. - class_cost = -out_prob[:, target_ids] - - # Compute the L1 cost between boxes - bbox_cost = torch.cdist(out_bbox, target_bbox, p=1) - - # Compute the giou cost between boxes - giou_cost = -generalized_box_iou(center_to_corners_format(out_bbox), center_to_corners_format(target_bbox)) - - # Final cost matrix - cost_matrix = self.bbox_cost * bbox_cost + self.class_cost * class_cost + self.giou_cost * giou_cost - cost_matrix = cost_matrix.view(batch_size, num_queries, -1).cpu() - - sizes = [len(v["boxes"]) for v in targets] - indices = [linear_sum_assignment(c[i]) for i, c in enumerate(cost_matrix.split(sizes, -1))] - return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices] - - -# below: bounding box utilities taken from https://github.com/facebookresearch/detr/blob/master/util/box_ops.py - - -def _upcast(t: Tensor) -> Tensor: - # Protects from numerical overflows in multiplications by upcasting to the equivalent higher type - if t.is_floating_point(): - return t if t.dtype in (torch.float32, torch.float64) else t.float() - else: - return t if t.dtype in (torch.int32, torch.int64) else t.int() - - -def box_area(boxes: Tensor) -> Tensor: - """ - Computes the area of a set of bounding boxes, which are specified by its (x1, y1, x2, y2) coordinates. - - Args: - boxes (`torch.FloatTensor` of shape `(number_of_boxes, 4)`): - Boxes for which the area will be computed. 
They are expected to be in (x1, y1, x2, y2) format with `0 <= x1 - < x2` and `0 <= y1 < y2`. - - Returns: - `torch.FloatTensor`: a tensor containing the area for each box. - """ - boxes = _upcast(boxes) - return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) - - -# modified from torchvision to also return the union -def box_iou(boxes1, boxes2): - area1 = box_area(boxes1) - area2 = box_area(boxes2) - - left_top = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2] - right_bottom = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2] - - width_height = (right_bottom - left_top).clamp(min=0) # [N,M,2] - inter = width_height[:, :, 0] * width_height[:, :, 1] # [N,M] - - union = area1[:, None] + area2 - inter - - iou = inter / union - return iou, union - - -def generalized_box_iou(boxes1, boxes2): - """ - Generalized IoU from https://giou.stanford.edu/. The boxes should be in [x0, y0, x1, y1] (corner) format. - - Returns: - `torch.FloatTensor`: a [N, M] pairwise matrix, where N = len(boxes1) and M = len(boxes2) - """ - # degenerate boxes gives inf / nan results - # so do an early check - if not (boxes1[:, 2:] >= boxes1[:, :2]).all(): - raise ValueError(f"boxes1 must be in [x0, y0, x1, y1] (corner) format, but got {boxes1}") - if not (boxes2[:, 2:] >= boxes2[:, :2]).all(): - raise ValueError(f"boxes2 must be in [x0, y0, x1, y1] (corner) format, but got {boxes2}") - iou, union = box_iou(boxes1, boxes2) - - top_left = torch.min(boxes1[:, None, :2], boxes2[:, :2]) - bottom_right = torch.max(boxes1[:, None, 2:], boxes2[:, 2:]) - - width_height = (bottom_right - top_left).clamp(min=0) # [N,M,2] - area = width_height[:, :, 0] * width_height[:, :, 1] - - return iou - (area - union) / area - - -# below: taken from https://github.com/facebookresearch/detr/blob/master/util/misc.py#L306 -def _max_by_axis(the_list): - # type: (List[List[int]]) -> List[int] - maxes = the_list[0] - for sublist in the_list[1:]: - for index, item in enumerate(sublist): - maxes[index] = max(maxes[index], item) - return maxes - - -class NestedTensor: - def __init__(self, tensors, mask: Optional[Tensor]): - self.tensors = tensors - self.mask = mask - - def to(self, device): - cast_tensor = self.tensors.to(device) - mask = self.mask - if mask is not None: - cast_mask = mask.to(device) - else: - cast_mask = None - return NestedTensor(cast_tensor, cast_mask) - - def decompose(self): - return self.tensors, self.mask - - def __repr__(self): - return str(self.tensors) - - -def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): - if tensor_list[0].ndim == 3: - max_size = _max_by_axis([list(img.shape) for img in tensor_list]) - batch_shape = [len(tensor_list)] + max_size - batch_size, num_channels, height, width = batch_shape - dtype = tensor_list[0].dtype - device = tensor_list[0].device - tensor = torch.zeros(batch_shape, dtype=dtype, device=device) - mask = torch.ones((batch_size, height, width), dtype=torch.bool, device=device) - for img, pad_img, m in zip(tensor_list, tensor, mask): - pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) - m[: img.shape[1], : img.shape[2]] = False - else: - raise ValueError("Only 3-dimensional tensors are supported") - return NestedTensor(tensor, mask) diff --git a/src/transformers/models/gemma/modeling_gemma.py b/src/transformers/models/gemma/modeling_gemma.py index f164c4add1fb5e..43882e7f8c0596 100644 --- a/src/transformers/models/gemma/modeling_gemma.py +++ b/src/transformers/models/gemma/modeling_gemma.py @@ -25,7 +25,6 @@ import torch import 
torch.utils.checkpoint from torch import nn -from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from ...activations import ACT2FN from ...cache_utils import Cache, DynamicCache, StaticCache @@ -1084,18 +1083,7 @@ def forward( loss = None if labels is not None: - # Upcast to float if we need to compute the loss to avoid potential precision issues - logits = logits.float() - # Shift so that tokens < n predict n - shift_logits = logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - loss_fct = CrossEntropyLoss() - shift_logits = shift_logits.view(-1, self.config.vocab_size) - shift_labels = shift_labels.view(-1) - # Enable model parallelism - shift_labels = shift_labels.to(shift_logits.device) - loss = loss_fct(shift_logits, shift_labels) + loss = self.loss_function(logits, labels, self.vocab_size) if not return_dict: output = (logits,) + outputs[1:] @@ -1199,27 +1187,8 @@ def forward( loss = None if labels is not None: - labels = labels.to(logits.device) - if self.config.problem_type is None: - if self.num_labels == 1: - self.config.problem_type = "regression" - elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): - self.config.problem_type = "single_label_classification" - else: - self.config.problem_type = "multi_label_classification" - - if self.config.problem_type == "regression": - loss_fct = MSELoss() - if self.num_labels == 1: - loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) - else: - loss = loss_fct(pooled_logits, labels) - elif self.config.problem_type == "single_label_classification": - loss_fct = CrossEntropyLoss() - loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) - elif self.config.problem_type == "multi_label_classification": - loss_fct = BCEWithLogitsLoss() - loss = loss_fct(pooled_logits, labels) + loss = self.loss_function(logits=logits, labels=labels, pooled_logits=pooled_logits, config=self.config) + if not return_dict: output = (pooled_logits,) + transformer_outputs[1:] return ((loss,) + output) if loss is not None else output @@ -1302,8 +1271,7 @@ def forward( loss = None if labels is not None: - loss_fct = CrossEntropyLoss() - loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + loss = self.loss_function(logits, labels, self.config) if not return_dict: output = (logits,) + outputs[2:] diff --git a/src/transformers/models/gemma/modular_gemma.py b/src/transformers/models/gemma/modular_gemma.py index 7130a30dc9be58..ca1de9a880fef5 100644 --- a/src/transformers/models/gemma/modular_gemma.py +++ b/src/transformers/models/gemma/modular_gemma.py @@ -20,7 +20,6 @@ import torch import torch.utils.checkpoint from torch import nn -from torch.nn import CrossEntropyLoss from ...activations import ACT2FN from ...cache_utils import Cache, DynamicCache, StaticCache @@ -1003,18 +1002,7 @@ def forward( loss = None if labels is not None: - # Upcast to float if we need to compute the loss to avoid potential precision issues - logits = logits.float() - # Shift so that tokens < n predict n - shift_logits = logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - loss_fct = CrossEntropyLoss() - shift_logits = shift_logits.view(-1, self.config.vocab_size) - shift_labels = shift_labels.view(-1) - # Enable model parallelism - shift_labels = shift_labels.to(shift_logits.device) - loss = loss_fct(shift_logits, shift_labels) + loss = self.loss_function(logits, labels, self.vocab_size) if not 
return_dict: output = (logits,) + outputs[1:] diff --git a/src/transformers/models/gemma2/modeling_gemma2.py b/src/transformers/models/gemma2/modeling_gemma2.py index 8f7e7364b54c95..28f5f5da7ba003 100644 --- a/src/transformers/models/gemma2/modeling_gemma2.py +++ b/src/transformers/models/gemma2/modeling_gemma2.py @@ -24,7 +24,6 @@ import torch import torch.nn as nn import torch.utils.checkpoint -from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from ...activations import ACT2FN from ...cache_utils import Cache, HybridCache @@ -1065,18 +1064,7 @@ def forward( loss = None if labels is not None: - # Upcast to float if we need to compute the loss to avoid potential precision issues - logits = logits.float() - # Shift so that tokens < n predict n - shift_logits = logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - loss_fct = CrossEntropyLoss() - shift_logits = shift_logits.view(-1, self.config.vocab_size) - shift_labels = shift_labels.view(-1) - # Enable model parallelism - shift_labels = shift_labels.to(shift_logits.device) - loss = loss_fct(shift_logits, shift_labels) + loss = self.loss_function(logits, labels, self.vocab_size) if not return_dict: output = (logits,) + outputs[1:] @@ -1258,27 +1246,8 @@ def forward( loss = None if labels is not None: - labels = labels.to(logits.device) - if self.config.problem_type is None: - if self.num_labels == 1: - self.config.problem_type = "regression" - elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): - self.config.problem_type = "single_label_classification" - else: - self.config.problem_type = "multi_label_classification" - - if self.config.problem_type == "regression": - loss_fct = MSELoss() - if self.num_labels == 1: - loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) - else: - loss = loss_fct(pooled_logits, labels) - elif self.config.problem_type == "single_label_classification": - loss_fct = CrossEntropyLoss() - loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) - elif self.config.problem_type == "multi_label_classification": - loss_fct = BCEWithLogitsLoss() - loss = loss_fct(pooled_logits, labels) + loss = self.loss_function(logits=logits, labels=labels, pooled_logits=pooled_logits, config=self.config) + if not return_dict: output = (pooled_logits,) + transformer_outputs[1:] return ((loss,) + output) if loss is not None else output @@ -1361,8 +1330,7 @@ def forward( loss = None if labels is not None: - loss_fct = CrossEntropyLoss() - loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + loss = self.loss_function(logits, labels, self.config) if not return_dict: output = (logits,) + outputs[2:] diff --git a/src/transformers/models/gemma2/modular_gemma2.py b/src/transformers/models/gemma2/modular_gemma2.py index c0f76dbe5bfcbd..9d7f047e1a8494 100644 --- a/src/transformers/models/gemma2/modular_gemma2.py +++ b/src/transformers/models/gemma2/modular_gemma2.py @@ -18,7 +18,6 @@ import torch import torch.nn as nn import torch.utils.checkpoint -from torch.nn import CrossEntropyLoss from ...activations import ACT2FN from ...cache_utils import Cache, HybridCache @@ -806,18 +805,7 @@ def forward( loss = None if labels is not None: - # Upcast to float if we need to compute the loss to avoid potential precision issues - logits = logits.float() - # Shift so that tokens < n predict n - shift_logits = logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - 
loss_fct = CrossEntropyLoss() - shift_logits = shift_logits.view(-1, self.config.vocab_size) - shift_labels = shift_labels.view(-1) - # Enable model parallelism - shift_labels = shift_labels.to(shift_logits.device) - loss = loss_fct(shift_logits, shift_labels) + loss = self.loss_function(logits, labels, self.vocab_size) if not return_dict: output = (logits,) + outputs[1:] diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index aaac7488f430f5..9c01ce19f32399 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -14,7 +14,6 @@ # limitations under the License. """PyTorch Grounding DINO model.""" -import copy import math import os import warnings @@ -33,31 +32,19 @@ ModelOutput, add_start_docstrings, add_start_docstrings_to_model_forward, - is_scipy_available, is_timm_available, is_torch_cuda_available, - is_vision_available, replace_return_docstrings, requires_backends, ) from ...modeling_utils import PreTrainedModel from ...pytorch_utils import meshgrid -from ...utils import is_accelerate_available, is_ninja_available, logging +from ...utils import is_ninja_available, logging from ...utils.backbone_utils import load_backbone from ..auto import AutoModel from .configuration_grounding_dino import GroundingDinoConfig -if is_vision_available(): - from transformers.image_transforms import center_to_corners_format - -if is_accelerate_available(): - from accelerate import PartialState - from accelerate.utils import reduce - -if is_scipy_available(): - from scipy.optimize import linear_sum_assignment - if is_timm_available(): from timm import create_model @@ -2488,436 +2475,6 @@ def forward(self, x): return x -# Copied from transformers.models.detr.modeling_detr._upcast -def _upcast(t: Tensor) -> Tensor: - # Protects from numerical overflows in multiplications by upcasting to the equivalent higher type - if t.is_floating_point(): - return t if t.dtype in (torch.float32, torch.float64) else t.float() - else: - return t if t.dtype in (torch.int32, torch.int64) else t.int() - - -# Copied from transformers.models.detr.modeling_detr.box_area -def box_area(boxes: Tensor) -> Tensor: - """ - Computes the area of a set of bounding boxes, which are specified by its (x1, y1, x2, y2) coordinates. - - Args: - boxes (`torch.FloatTensor` of shape `(number_of_boxes, 4)`): - Boxes for which the area will be computed. They are expected to be in (x1, y1, x2, y2) format with `0 <= x1 - < x2` and `0 <= y1 < y2`. - - Returns: - `torch.FloatTensor`: a tensor containing the area for each box. - """ - boxes = _upcast(boxes) - return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) - - -# Copied from transformers.models.detr.modeling_detr.box_iou -def box_iou(boxes1, boxes2): - area1 = box_area(boxes1) - area2 = box_area(boxes2) - - left_top = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2] - right_bottom = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2] - - width_height = (right_bottom - left_top).clamp(min=0) # [N,M,2] - inter = width_height[:, :, 0] * width_height[:, :, 1] # [N,M] - - union = area1[:, None] + area2 - inter - - iou = inter / union - return iou, union - - -# Copied from transformers.models.detr.modeling_detr.generalized_box_iou -def generalized_box_iou(boxes1, boxes2): - """ - Generalized IoU from https://giou.stanford.edu/. The boxes should be in [x0, y0, x1, y1] (corner) format. 
- - Returns: - `torch.FloatTensor`: a [N, M] pairwise matrix, where N = len(boxes1) and M = len(boxes2) - """ - # degenerate boxes gives inf / nan results - # so do an early check - if not (boxes1[:, 2:] >= boxes1[:, :2]).all(): - raise ValueError(f"boxes1 must be in [x0, y0, x1, y1] (corner) format, but got {boxes1}") - if not (boxes2[:, 2:] >= boxes2[:, :2]).all(): - raise ValueError(f"boxes2 must be in [x0, y0, x1, y1] (corner) format, but got {boxes2}") - iou, union = box_iou(boxes1, boxes2) - - top_left = torch.min(boxes1[:, None, :2], boxes2[:, :2]) - bottom_right = torch.max(boxes1[:, None, 2:], boxes2[:, 2:]) - - width_height = (bottom_right - top_left).clamp(min=0) # [N,M,2] - area = width_height[:, :, 0] * width_height[:, :, 1] - - return iou - (area - union) / area - - -# Copied from transformers.models.detr.modeling_detr._max_by_axis -def _max_by_axis(the_list): - # type: (List[List[int]]) -> List[int] - maxes = the_list[0] - for sublist in the_list[1:]: - for index, item in enumerate(sublist): - maxes[index] = max(maxes[index], item) - return maxes - - -# Copied from transformers.models.detr.modeling_detr.dice_loss -def dice_loss(inputs, targets, num_boxes): - """ - Compute the DICE loss, similar to generalized IOU for masks - - Args: - inputs: A float tensor of arbitrary shape. - The predictions for each example. - targets: A float tensor with the same shape as inputs. Stores the binary - classification label for each element in inputs (0 for the negative class and 1 for the positive - class). - """ - inputs = inputs.sigmoid() - inputs = inputs.flatten(1) - numerator = 2 * (inputs * targets).sum(1) - denominator = inputs.sum(-1) + targets.sum(-1) - loss = 1 - (numerator + 1) / (denominator + 1) - return loss.sum() / num_boxes - - -# Copied from transformers.models.detr.modeling_detr.sigmoid_focal_loss -def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: float = 2): - """ - Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002. - - Args: - inputs (`torch.FloatTensor` of arbitrary shape): - The predictions for each example. - targets (`torch.FloatTensor` with the same shape as `inputs`) - A tensor storing the binary classification label for each element in the `inputs` (0 for the negative class - and 1 for the positive class). - alpha (`float`, *optional*, defaults to `0.25`): - Optional weighting factor in the range (0,1) to balance positive vs. negative examples. - gamma (`int`, *optional*, defaults to `2`): - Exponent of the modulating factor (1 - p_t) to balance easy vs hard examples. 
- - Returns: - Loss tensor - """ - prob = inputs.sigmoid() - ce_loss = nn.functional.binary_cross_entropy_with_logits(inputs, targets, reduction="none") - # add modulating factor - p_t = prob * targets + (1 - prob) * (1 - targets) - loss = ce_loss * ((1 - p_t) ** gamma) - - if alpha >= 0: - alpha_t = alpha * targets + (1 - alpha) * (1 - targets) - loss = alpha_t * loss - - return loss.mean(1).sum() / num_boxes - - -# Copied from transformers.models.detr.modeling_detr.NestedTensor -class NestedTensor: - def __init__(self, tensors, mask: Optional[Tensor]): - self.tensors = tensors - self.mask = mask - - def to(self, device): - cast_tensor = self.tensors.to(device) - mask = self.mask - if mask is not None: - cast_mask = mask.to(device) - else: - cast_mask = None - return NestedTensor(cast_tensor, cast_mask) - - def decompose(self): - return self.tensors, self.mask - - def __repr__(self): - return str(self.tensors) - - -# Copied from transformers.models.detr.modeling_detr.nested_tensor_from_tensor_list -def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): - if tensor_list[0].ndim == 3: - max_size = _max_by_axis([list(img.shape) for img in tensor_list]) - batch_shape = [len(tensor_list)] + max_size - batch_size, num_channels, height, width = batch_shape - dtype = tensor_list[0].dtype - device = tensor_list[0].device - tensor = torch.zeros(batch_shape, dtype=dtype, device=device) - mask = torch.ones((batch_size, height, width), dtype=torch.bool, device=device) - for img, pad_img, m in zip(tensor_list, tensor, mask): - pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) - m[: img.shape[1], : img.shape[2]] = False - else: - raise ValueError("Only 3-dimensional tensors are supported") - return NestedTensor(tensor, mask) - - -# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrHungarianMatcher with DeformableDetr->GroundingDino -class GroundingDinoHungarianMatcher(nn.Module): - """ - This class computes an assignment between the targets and the predictions of the network. - - For efficiency reasons, the targets don't include the no_object. Because of this, in general, there are more - predictions than targets. In this case, we do a 1-to-1 matching of the best predictions, while the others are - un-matched (and thus treated as non-objects). - - Args: - class_cost: - The relative weight of the classification error in the matching cost. - bbox_cost: - The relative weight of the L1 error of the bounding box coordinates in the matching cost. - giou_cost: - The relative weight of the giou loss of the bounding box in the matching cost. - """ - - def __init__(self, class_cost: float = 1, bbox_cost: float = 1, giou_cost: float = 1): - super().__init__() - requires_backends(self, ["scipy"]) - - self.class_cost = class_cost - self.bbox_cost = bbox_cost - self.giou_cost = giou_cost - if class_cost == 0 and bbox_cost == 0 and giou_cost == 0: - raise ValueError("All costs of the Matcher can't be 0") - - @torch.no_grad() - def forward(self, outputs, targets): - """ - Args: - outputs (`dict`): - A dictionary that contains at least these entries: - * "logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits - * "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates. 
- targets (`List[dict]`): - A list of targets (len(targets) = batch_size), where each target is a dict containing: - * "class_labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of - ground-truth - objects in the target) containing the class labels - * "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates. - - Returns: - `List[Tuple]`: A list of size `batch_size`, containing tuples of (index_i, index_j) where: - - index_i is the indices of the selected predictions (in order) - - index_j is the indices of the corresponding selected targets (in order) - For each batch element, it holds: len(index_i) = len(index_j) = min(num_queries, num_target_boxes) - """ - batch_size, num_queries = outputs["logits"].shape[:2] - - # We flatten to compute the cost matrices in a batch - out_prob = outputs["logits"].flatten(0, 1).sigmoid() # [batch_size * num_queries, num_classes] - out_bbox = outputs["pred_boxes"].flatten(0, 1) # [batch_size * num_queries, 4] - - # Also concat the target labels and boxes - target_ids = torch.cat([v["class_labels"] for v in targets]) - target_bbox = torch.cat([v["boxes"] for v in targets]) - - # Compute the classification cost. - alpha = 0.25 - gamma = 2.0 - neg_cost_class = (1 - alpha) * (out_prob**gamma) * (-(1 - out_prob + 1e-8).log()) - pos_cost_class = alpha * ((1 - out_prob) ** gamma) * (-(out_prob + 1e-8).log()) - class_cost = pos_cost_class[:, target_ids] - neg_cost_class[:, target_ids] - - # Compute the L1 cost between boxes - bbox_cost = torch.cdist(out_bbox, target_bbox, p=1) - - # Compute the giou cost between boxes - giou_cost = -generalized_box_iou(center_to_corners_format(out_bbox), center_to_corners_format(target_bbox)) - - # Final cost matrix - cost_matrix = self.bbox_cost * bbox_cost + self.class_cost * class_cost + self.giou_cost * giou_cost - cost_matrix = cost_matrix.view(batch_size, num_queries, -1).cpu() - - sizes = [len(v["boxes"]) for v in targets] - indices = [linear_sum_assignment(c[i]) for i, c in enumerate(cost_matrix.split(sizes, -1))] - return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices] - - -# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrLoss with DeformableDetr->GroundingDino -class GroundingDinoLoss(nn.Module): - """ - This class computes the losses for `GroundingDinoForObjectDetection`. The process happens in two steps: 1) we - compute hungarian assignment between ground truth boxes and the outputs of the model 2) we supervise each pair of - matched ground-truth / prediction (supervise class and box). - - Args: - matcher (`GroundingDinoHungarianMatcher`): - Module able to compute a matching between targets and proposals. - num_classes (`int`): - Number of object categories, omitting the special no-object category. - focal_alpha (`float`): - Alpha parameter in focal loss. - losses (`List[str]`): - List of all the losses to be applied. See `get_loss` for a list of all available losses. 
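# Illustrative sketch (toy logits/targets): the sigmoid focal loss this criterion uses
# for classification, following the alpha/gamma defaults of the helper removed above.
import torch
from torch import nn

logits = torch.tensor([[2.0, -1.0], [0.5, 0.3]])
targets = torch.tensor([[1.0, 0.0], [0.0, 1.0]])
alpha, gamma, num_boxes = 0.25, 2.0, 2

prob = logits.sigmoid()
ce = nn.functional.binary_cross_entropy_with_logits(logits, targets, reduction="none")
p_t = prob * targets + (1 - prob) * (1 - targets)
alpha_t = alpha * targets + (1 - alpha) * (1 - targets)
loss = (alpha_t * ce * (1 - p_t) ** gamma).mean(1).sum() / num_boxes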
- """ - - def __init__(self, matcher, num_classes, focal_alpha, losses): - super().__init__() - self.matcher = matcher - self.num_classes = num_classes - self.focal_alpha = focal_alpha - self.losses = losses - - # removed logging parameter, which was part of the original implementation - def loss_labels(self, outputs, targets, indices, num_boxes): - """ - Classification loss (Binary focal loss) targets dicts must contain the key "class_labels" containing a tensor - of dim [nb_target_boxes] - """ - if "logits" not in outputs: - raise KeyError("No logits were found in the outputs") - source_logits = outputs["logits"] - - idx = self._get_source_permutation_idx(indices) - target_classes_o = torch.cat([t["class_labels"][J] for t, (_, J) in zip(targets, indices)]) - target_classes = torch.full( - source_logits.shape[:2], self.num_classes, dtype=torch.int64, device=source_logits.device - ) - target_classes[idx] = target_classes_o - - target_classes_onehot = torch.zeros( - [source_logits.shape[0], source_logits.shape[1], source_logits.shape[2] + 1], - dtype=source_logits.dtype, - layout=source_logits.layout, - device=source_logits.device, - ) - target_classes_onehot.scatter_(2, target_classes.unsqueeze(-1), 1) - - target_classes_onehot = target_classes_onehot[:, :, :-1] - loss_ce = ( - sigmoid_focal_loss(source_logits, target_classes_onehot, num_boxes, alpha=self.focal_alpha, gamma=2) - * source_logits.shape[1] - ) - losses = {"loss_ce": loss_ce} - - return losses - - @torch.no_grad() - # Copied from transformers.models.detr.modeling_detr.DetrLoss.loss_cardinality - def loss_cardinality(self, outputs, targets, indices, num_boxes): - """ - Compute the cardinality error, i.e. the absolute error in the number of predicted non-empty boxes. - - This is not really a loss, it is intended for logging purposes only. It doesn't propagate gradients. - """ - logits = outputs["logits"] - device = logits.device - target_lengths = torch.as_tensor([len(v["class_labels"]) for v in targets], device=device) - # Count the number of predictions that are NOT "no-object" (which is the last class) - card_pred = (logits.argmax(-1) != logits.shape[-1] - 1).sum(1) - card_err = nn.functional.l1_loss(card_pred.float(), target_lengths.float()) - losses = {"cardinality_error": card_err} - return losses - - # Copied from transformers.models.detr.modeling_detr.DetrLoss.loss_boxes - def loss_boxes(self, outputs, targets, indices, num_boxes): - """ - Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss. - - Targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4]. The target boxes - are expected in format (center_x, center_y, w, h), normalized by the image size. 
- """ - if "pred_boxes" not in outputs: - raise KeyError("No predicted boxes found in outputs") - idx = self._get_source_permutation_idx(indices) - source_boxes = outputs["pred_boxes"][idx] - target_boxes = torch.cat([t["boxes"][i] for t, (_, i) in zip(targets, indices)], dim=0) - - loss_bbox = nn.functional.l1_loss(source_boxes, target_boxes, reduction="none") - - losses = {} - losses["loss_bbox"] = loss_bbox.sum() / num_boxes - - loss_giou = 1 - torch.diag( - generalized_box_iou(center_to_corners_format(source_boxes), center_to_corners_format(target_boxes)) - ) - losses["loss_giou"] = loss_giou.sum() / num_boxes - return losses - - # Copied from transformers.models.detr.modeling_detr.DetrLoss._get_source_permutation_idx - def _get_source_permutation_idx(self, indices): - # permute predictions following indices - batch_idx = torch.cat([torch.full_like(source, i) for i, (source, _) in enumerate(indices)]) - source_idx = torch.cat([source for (source, _) in indices]) - return batch_idx, source_idx - - # Copied from transformers.models.detr.modeling_detr.DetrLoss._get_target_permutation_idx - def _get_target_permutation_idx(self, indices): - # permute targets following indices - batch_idx = torch.cat([torch.full_like(target, i) for i, (_, target) in enumerate(indices)]) - target_idx = torch.cat([target for (_, target) in indices]) - return batch_idx, target_idx - - def get_loss(self, loss, outputs, targets, indices, num_boxes): - loss_map = { - "labels": self.loss_labels, - "cardinality": self.loss_cardinality, - "boxes": self.loss_boxes, - } - if loss not in loss_map: - raise ValueError(f"Loss {loss} not supported") - return loss_map[loss](outputs, targets, indices, num_boxes) - - def forward(self, outputs, targets): - """ - This performs the loss computation. - - Args: - outputs (`dict`, *optional*): - Dictionary of tensors, see the output specification of the model for the format. - targets (`List[dict]`, *optional*): - List of dicts, such that `len(targets) == batch_size`. The expected keys in each dict depends on the - losses applied, see each loss' doc. - """ - outputs_without_aux = {k: v for k, v in outputs.items() if k != "auxiliary_outputs" and k != "enc_outputs"} - - # Retrieve the matching between the outputs of the last layer and the targets - indices = self.matcher(outputs_without_aux, targets) - - # Compute the average number of target boxes accross all nodes, for normalization purposes - num_boxes = sum(len(t["class_labels"]) for t in targets) - num_boxes = torch.as_tensor([num_boxes], dtype=torch.float, device=next(iter(outputs.values())).device) - world_size = 1 - if is_accelerate_available(): - if PartialState._shared_state != {}: - num_boxes = reduce(num_boxes) - world_size = PartialState().num_processes - num_boxes = torch.clamp(num_boxes / world_size, min=1).item() - - # Compute all the requested losses - losses = {} - for loss in self.losses: - losses.update(self.get_loss(loss, outputs, targets, indices, num_boxes)) - - # In case of auxiliary losses, we repeat this process with the output of each intermediate layer. 
- if "auxiliary_outputs" in outputs: - for i, auxiliary_outputs in enumerate(outputs["auxiliary_outputs"]): - indices = self.matcher(auxiliary_outputs, targets) - for loss in self.losses: - l_dict = self.get_loss(loss, auxiliary_outputs, targets, indices, num_boxes) - l_dict = {k + f"_{i}": v for k, v in l_dict.items()} - losses.update(l_dict) - - if "enc_outputs" in outputs: - enc_outputs = outputs["enc_outputs"] - bin_targets = copy.deepcopy(targets) - for bt in bin_targets: - bt["class_labels"] = torch.zeros_like(bt["class_labels"]) - indices = self.matcher(enc_outputs, bin_targets) - for loss in self.losses: - l_dict = self.get_loss(loss, enc_outputs, bin_targets, indices, num_boxes) - l_dict = {k + "_enc": v for k, v in l_dict.items()} - losses.update(l_dict) - - return losses - - @add_start_docstrings( """ Grounding DINO Model (consisting of a backbone and encoder-decoder Transformer) with object detection heads on top, @@ -3079,40 +2636,9 @@ def forward( loss, loss_dict, auxiliary_outputs = None, None, None if labels is not None: - # First: create the matcher - matcher = GroundingDinoHungarianMatcher( - class_cost=self.config.class_cost, bbox_cost=self.config.bbox_cost, giou_cost=self.config.giou_cost - ) - # Second: create the criterion - losses = ["labels", "boxes", "cardinality"] - criterion = GroundingDinoLoss( - matcher=matcher, - num_classes=self.config.num_labels, - focal_alpha=self.config.focal_alpha, - losses=losses, + loss, loss_dict, auxiliary_outputs = self.loss_function( + logits, labels, self.device, pred_boxes, self.config, outputs_class, outputs_coord ) - criterion.to(self.device) - # Third: compute the losses, based on outputs and labels - outputs_loss = {} - outputs_loss["logits"] = logits - outputs_loss["pred_boxes"] = pred_boxes - if self.config.auxiliary_loss: - auxiliary_outputs = self._set_aux_loss(outputs_class, outputs_coord) - outputs_loss["auxiliary_outputs"] = auxiliary_outputs - if self.config.two_stage: - enc_outputs_coord = outputs[-1].sigmoid() - outputs_loss["enc_outputs"] = {"logits": outputs[-2], "pred_boxes": enc_outputs_coord} - - loss_dict = criterion(outputs_loss, labels) - # Fourth: compute total loss, as a weighted sum of the various losses - weight_dict = {"loss_ce": 1, "loss_bbox": self.config.bbox_loss_coefficient} - weight_dict["loss_giou"] = self.config.giou_loss_coefficient - if self.config.auxiliary_loss: - aux_weight_dict = {} - for i in range(self.config.decoder_layers - 1): - aux_weight_dict.update({k + f"_{i}": v for k, v in weight_dict.items()}) - weight_dict.update(aux_weight_dict) - loss = sum(loss_dict[k] * weight_dict[k] for k in loss_dict.keys() if k in weight_dict) if not return_dict: if auxiliary_outputs is not None: diff --git a/src/transformers/models/jamba/modeling_jamba.py b/src/transformers/models/jamba/modeling_jamba.py index ddb3b384a8b892..818c6acb3f7961 100755 --- a/src/transformers/models/jamba/modeling_jamba.py +++ b/src/transformers/models/jamba/modeling_jamba.py @@ -26,7 +26,6 @@ import torch.nn.functional as F import torch.utils.checkpoint from torch import nn -from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from ...activations import ACT2FN from ...cache_utils import Cache, DynamicCache # we need __iter__ and __len__ of pkv @@ -1543,18 +1542,7 @@ def forward( loss = None if labels is not None: - # Upcast to float if we need to compute the loss to avoid potential precision issues - logits = logits.float() - # Shift so that tokens < n predict n - shift_logits = logits[..., :-1, :].contiguous() 
- shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - loss_fct = CrossEntropyLoss() - shift_logits = shift_logits.view(-1, self.config.vocab_size) - shift_labels = shift_labels.view(-1) - # Enable model parallelism - shift_labels = shift_labels.to(shift_logits.device) - loss = loss_fct(shift_logits, shift_labels) + loss = self.loss_function(logits, labels, self.vocab_size) aux_loss = None if output_router_logits: @@ -1729,27 +1717,8 @@ def forward( loss = None if labels is not None: - labels = labels.to(logits.device) - if self.config.problem_type is None: - if self.num_labels == 1: - self.config.problem_type = "regression" - elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): - self.config.problem_type = "single_label_classification" - else: - self.config.problem_type = "multi_label_classification" - - if self.config.problem_type == "regression": - loss_fct = MSELoss() - if self.num_labels == 1: - loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) - else: - loss = loss_fct(pooled_logits, labels) - elif self.config.problem_type == "single_label_classification": - loss_fct = CrossEntropyLoss() - loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) - elif self.config.problem_type == "multi_label_classification": - loss_fct = BCEWithLogitsLoss() - loss = loss_fct(pooled_logits, labels) + loss = self.loss_function(logits=logits, labels=labels, pooled_logits=pooled_logits, config=self.config) + if not return_dict: output = (pooled_logits,) + transformer_outputs[1:] return ((loss,) + output) if loss is not None else output diff --git a/src/transformers/models/jetmoe/modeling_jetmoe.py b/src/transformers/models/jetmoe/modeling_jetmoe.py index 805c82be3881bc..a4bb1d78fdc5ce 100644 --- a/src/transformers/models/jetmoe/modeling_jetmoe.py +++ b/src/transformers/models/jetmoe/modeling_jetmoe.py @@ -20,7 +20,7 @@ import torch import torch.utils.checkpoint from torch import nn -from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss +from torch.nn import CrossEntropyLoss from torch.nn import functional as F from ...activations import ACT2FN @@ -1436,27 +1436,8 @@ def forward( loss = None if labels is not None: - labels = labels.to(logits.device) - if self.config.problem_type is None: - if self.num_labels == 1: - self.config.problem_type = "regression" - elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): - self.config.problem_type = "single_label_classification" - else: - self.config.problem_type = "multi_label_classification" - - if self.config.problem_type == "regression": - loss_fct = MSELoss() - if self.num_labels == 1: - loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) - else: - loss = loss_fct(pooled_logits, labels) - elif self.config.problem_type == "single_label_classification": - loss_fct = CrossEntropyLoss() - loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) - elif self.config.problem_type == "multi_label_classification": - loss_fct = BCEWithLogitsLoss() - loss = loss_fct(pooled_logits, labels) + loss = self.loss_function(logits=logits, labels=labels, pooled_logits=pooled_logits, config=self.config) + if not return_dict: output = (pooled_logits,) + transformer_outputs[1:] return ((loss,) + output) if loss is not None else output diff --git a/src/transformers/models/llama/modeling_llama.py b/src/transformers/models/llama/modeling_llama.py index 40db21aeaea7d1..e9064ff3ae5b22 100644 --- a/src/transformers/models/llama/modeling_llama.py 
+++ b/src/transformers/models/llama/modeling_llama.py @@ -24,7 +24,6 @@ import torch.nn.functional as F import torch.utils.checkpoint from torch import nn -from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from ...activations import ACT2FN from ...cache_utils import Cache, DynamicCache, StaticCache @@ -1147,6 +1146,7 @@ def forward( return_dict: Optional[bool] = None, cache_position: Optional[torch.LongTensor] = None, num_logits_to_keep: int = 0, + **loss_kwargs, ) -> Union[Tuple, CausalLMOutputWithPast]: r""" Args: @@ -1209,18 +1209,7 @@ def forward( loss = None if labels is not None: - # Upcast to float if we need to compute the loss to avoid potential precision issues - logits = logits.float() - # Shift so that tokens < n predict n - shift_logits = logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - loss_fct = CrossEntropyLoss() - shift_logits = shift_logits.view(-1, self.config.vocab_size) - shift_labels = shift_labels.view(-1) - # Enable model parallelism - shift_labels = shift_labels.to(shift_logits.device) - loss = loss_fct(shift_logits, shift_labels) + loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **loss_kwargs) if not return_dict: output = (logits,) + outputs[1:] @@ -1324,27 +1313,8 @@ def forward( loss = None if labels is not None: - labels = labels.to(logits.device) - if self.config.problem_type is None: - if self.num_labels == 1: - self.config.problem_type = "regression" - elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): - self.config.problem_type = "single_label_classification" - else: - self.config.problem_type = "multi_label_classification" - - if self.config.problem_type == "regression": - loss_fct = MSELoss() - if self.num_labels == 1: - loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) - else: - loss = loss_fct(pooled_logits, labels) - elif self.config.problem_type == "single_label_classification": - loss_fct = CrossEntropyLoss() - loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) - elif self.config.problem_type == "multi_label_classification": - loss_fct = BCEWithLogitsLoss() - loss = loss_fct(pooled_logits, labels) + loss = self.loss_function(logits=logits, labels=labels, pooled_logits=pooled_logits, config=self.config) + if not return_dict: output = (pooled_logits,) + transformer_outputs[1:] return ((loss,) + output) if loss is not None else output @@ -1396,6 +1366,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, + **kwargs, ) -> Union[Tuple, QuestionAnsweringModelOutput]: r""" start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): @@ -1427,29 +1398,16 @@ def forward( start_logits = start_logits.squeeze(-1).contiguous() end_logits = end_logits.squeeze(-1).contiguous() - total_loss = None + loss = None if start_positions is not None and end_positions is not None: - # If we are on multi-GPU, split add a dimension - if len(start_positions.size()) > 1: - start_positions = start_positions.squeeze(-1).to(start_logits.device) - if len(end_positions.size()) > 1: - end_positions = end_positions.squeeze(-1).to(end_logits.device) - # sometimes the start/end positions are outside our model inputs, we ignore these terms - ignored_index = start_logits.size(1) - start_positions = start_positions.clamp(0, ignored_index) - end_positions = end_positions.clamp(0, ignored_index) - - loss_fct = 
CrossEntropyLoss(ignore_index=ignored_index) - start_loss = loss_fct(start_logits, start_positions) - end_loss = loss_fct(end_logits, end_positions) - total_loss = (start_loss + end_loss) / 2 + loss = self.loss_function(start_logits, end_logits, start_positions, end_positions, **kwargs) if not return_dict: output = (start_logits, end_logits) + outputs[2:] - return ((total_loss,) + output) if total_loss is not None else output + return ((loss,) + output) if loss is not None else output return QuestionAnsweringModelOutput( - loss=total_loss, + loss=loss, start_logits=start_logits, end_logits=end_logits, hidden_states=outputs.hidden_states, @@ -1526,8 +1484,7 @@ def forward( loss = None if labels is not None: - loss_fct = CrossEntropyLoss() - loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + loss = self.loss_function(logits, labels, self.config) if not return_dict: output = (logits,) + outputs[2:] diff --git a/src/transformers/models/mistral/modeling_mistral.py b/src/transformers/models/mistral/modeling_mistral.py index 3f26b5fe03d9f1..15eea1ae1f502b 100644 --- a/src/transformers/models/mistral/modeling_mistral.py +++ b/src/transformers/models/mistral/modeling_mistral.py @@ -25,7 +25,7 @@ import torch import torch.utils.checkpoint from torch import nn -from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss +from torch.nn import CrossEntropyLoss from ...activations import ACT2FN from ...cache_utils import Cache, DynamicCache, SlidingWindowCache, StaticCache @@ -1220,27 +1220,8 @@ def forward( loss = None if labels is not None: - labels = labels.to(logits.device) - if self.config.problem_type is None: - if self.num_labels == 1: - self.config.problem_type = "regression" - elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): - self.config.problem_type = "single_label_classification" - else: - self.config.problem_type = "multi_label_classification" - - if self.config.problem_type == "regression": - loss_fct = MSELoss() - if self.num_labels == 1: - loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) - else: - loss = loss_fct(pooled_logits, labels) - elif self.config.problem_type == "single_label_classification": - loss_fct = CrossEntropyLoss() - loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) - elif self.config.problem_type == "multi_label_classification": - loss_fct = BCEWithLogitsLoss() - loss = loss_fct(pooled_logits, labels) + loss = self.loss_function(logits=logits, labels=labels, pooled_logits=pooled_logits, config=self.config) + if not return_dict: output = (pooled_logits,) + transformer_outputs[1:] return ((loss,) + output) if loss is not None else output @@ -1324,8 +1305,7 @@ def forward( loss = None if labels is not None: - loss_fct = CrossEntropyLoss() - loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + loss = self.loss_function(logits, labels, self.config) if not return_dict: output = (logits,) + outputs[2:] diff --git a/src/transformers/models/mixtral/modeling_mixtral.py b/src/transformers/models/mixtral/modeling_mixtral.py index 9248ad2187c38a..7fbfb90cd322b5 100644 --- a/src/transformers/models/mixtral/modeling_mixtral.py +++ b/src/transformers/models/mixtral/modeling_mixtral.py @@ -26,7 +26,6 @@ import torch.nn.functional as F import torch.utils.checkpoint from torch import nn -from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from ...activations import ACT2FN from ...cache_utils import Cache, DynamicCache, SlidingWindowCache, StaticCache @@ -1327,18 
+1326,7 @@ def forward( loss = None if labels is not None: - # Upcast to float if we need to compute the loss to avoid potential precision issues - logits = logits.float() - # Shift so that tokens < n predict n - shift_logits = logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - loss_fct = CrossEntropyLoss() - shift_logits = shift_logits.view(-1, self.config.vocab_size) - shift_labels = shift_labels.view(-1) - # Enable model parallelism - shift_labels = shift_labels.to(shift_logits.device) - loss = loss_fct(shift_logits, shift_labels) + loss = self.loss_function(logits, labels, self.vocab_size) aux_loss = None if output_router_logits: @@ -1458,27 +1446,8 @@ def forward( loss = None if labels is not None: - labels = labels.to(logits.device) - if self.config.problem_type is None: - if self.num_labels == 1: - self.config.problem_type = "regression" - elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): - self.config.problem_type = "single_label_classification" - else: - self.config.problem_type = "multi_label_classification" - - if self.config.problem_type == "regression": - loss_fct = MSELoss() - if self.num_labels == 1: - loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) - else: - loss = loss_fct(pooled_logits, labels) - elif self.config.problem_type == "single_label_classification": - loss_fct = CrossEntropyLoss() - loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) - elif self.config.problem_type == "multi_label_classification": - loss_fct = BCEWithLogitsLoss() - loss = loss_fct(pooled_logits, labels) + loss = self.loss_function(logits=logits, labels=labels, pooled_logits=pooled_logits, config=self.config) + if not return_dict: output = (pooled_logits,) + transformer_outputs[1:] return ((loss,) + output) if loss is not None else output @@ -1562,8 +1531,7 @@ def forward( loss = None if labels is not None: - loss_fct = CrossEntropyLoss() - loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + loss = self.loss_function(logits, labels, self.config) if not return_dict: output = (logits,) + outputs[2:] diff --git a/src/transformers/models/mllama/modeling_mllama.py b/src/transformers/models/mllama/modeling_mllama.py index b8d2879612aad2..d1cc3a13bf3cc3 100644 --- a/src/transformers/models/mllama/modeling_mllama.py +++ b/src/transformers/models/mllama/modeling_mllama.py @@ -21,7 +21,6 @@ import torch.nn.functional as F import torch.utils.checkpoint from torch import nn -from torch.nn import CrossEntropyLoss from ... 
import PreTrainedModel from ...activations import ACT2FN @@ -1950,18 +1949,7 @@ def forward( loss = None if labels is not None: - # Upcast to float if we need to compute the loss to avoid potential precision issues - logits = logits.float() - # Shift so that tokens < n predict n - shift_logits = logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - loss_fct = CrossEntropyLoss() - shift_logits = shift_logits.view(-1, self.config.vocab_size) - shift_labels = shift_labels.view(-1) - # Enable model parallelism - shift_labels = shift_labels.to(shift_logits.device) - loss = loss_fct(shift_logits, shift_labels) + loss = self.loss_function(logits, labels, self.vocab_size) if not return_dict: output = (logits,) + outputs[1:] diff --git a/src/transformers/models/nemotron/modeling_nemotron.py b/src/transformers/models/nemotron/modeling_nemotron.py index aa2fd93fbe916a..6cac7ecdfbe5d9 100644 --- a/src/transformers/models/nemotron/modeling_nemotron.py +++ b/src/transformers/models/nemotron/modeling_nemotron.py @@ -22,7 +22,6 @@ import torch.nn.functional as F import torch.utils.checkpoint from torch import Size, Tensor, nn -from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from ...activations import ACT2FN from ...cache_utils import Cache, StaticCache @@ -1084,18 +1083,7 @@ def forward( loss = None if labels is not None: - # Upcast to float if we need to compute the loss to avoid potential precision issues - logits = logits.float() - # Shift so that tokens < n predict n - shift_logits = logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - loss_fct = CrossEntropyLoss() - shift_logits = shift_logits.view(-1, self.config.vocab_size) - shift_labels = shift_labels.view(-1) - # Enable model parallelism - shift_labels = shift_labels.to(shift_logits.device) - loss = loss_fct(shift_logits, shift_labels) + loss = self.loss_function(logits, labels, self.vocab_size) if not return_dict: output = (logits,) + outputs[1:] @@ -1200,27 +1188,8 @@ def forward( loss = None if labels is not None: - labels = labels.to(logits.device) - if self.config.problem_type is None: - if self.num_labels == 1: - self.config.problem_type = "regression" - elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): - self.config.problem_type = "single_label_classification" - else: - self.config.problem_type = "multi_label_classification" - - if self.config.problem_type == "regression": - loss_fct = MSELoss() - if self.num_labels == 1: - loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) - else: - loss = loss_fct(pooled_logits, labels) - elif self.config.problem_type == "single_label_classification": - loss_fct = CrossEntropyLoss() - loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) - elif self.config.problem_type == "multi_label_classification": - loss_fct = BCEWithLogitsLoss() - loss = loss_fct(pooled_logits, labels) + loss = self.loss_function(logits=logits, labels=labels, pooled_logits=pooled_logits, config=self.config) + if not return_dict: output = (pooled_logits,) + transformer_outputs[1:] return ((loss,) + output) if loss is not None else output @@ -1273,6 +1242,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, + **kwargs, ) -> Union[Tuple, QuestionAnsweringModelOutput]: r""" start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): @@ -1304,29 +1274,16 
@@ def forward( start_logits = start_logits.squeeze(-1).contiguous() end_logits = end_logits.squeeze(-1).contiguous() - total_loss = None + loss = None if start_positions is not None and end_positions is not None: - # If we are on multi-GPU, split add a dimension - if len(start_positions.size()) > 1: - start_positions = start_positions.squeeze(-1).to(start_logits.device) - if len(end_positions.size()) > 1: - end_positions = end_positions.squeeze(-1).to(end_logits.device) - # sometimes the start/end positions are outside our model inputs, we ignore these terms - ignored_index = start_logits.size(1) - start_positions = start_positions.clamp(0, ignored_index) - end_positions = end_positions.clamp(0, ignored_index) - - loss_fct = CrossEntropyLoss(ignore_index=ignored_index) - start_loss = loss_fct(start_logits, start_positions) - end_loss = loss_fct(end_logits, end_positions) - total_loss = (start_loss + end_loss) / 2 + loss = self.loss_function(start_logits, end_logits, start_positions, end_positions, **kwargs) if not return_dict: output = (start_logits, end_logits) + outputs[2:] - return ((total_loss,) + output) if total_loss is not None else output + return ((loss,) + output) if loss is not None else output return QuestionAnsweringModelOutput( - loss=total_loss, + loss=loss, start_logits=start_logits, end_logits=end_logits, hidden_states=outputs.hidden_states, @@ -1404,8 +1361,7 @@ def forward( loss = None if labels is not None: - loss_fct = CrossEntropyLoss() - loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + loss = self.loss_function(logits, labels, self.config) if not return_dict: output = (logits,) + outputs[2:] diff --git a/src/transformers/models/olmo/modeling_olmo.py b/src/transformers/models/olmo/modeling_olmo.py index ff45fffb6e7396..6c7dc59cdbff38 100644 --- a/src/transformers/models/olmo/modeling_olmo.py +++ b/src/transformers/models/olmo/modeling_olmo.py @@ -26,7 +26,6 @@ import torch.nn.functional as F import torch.utils.checkpoint from torch import nn -from torch.nn import CrossEntropyLoss from ...activations import ACT2FN from ...cache_utils import Cache, DynamicCache, StaticCache @@ -1127,18 +1126,7 @@ def forward( loss = None if labels is not None: - # Upcast to float if we need to compute the loss to avoid potential precision issues - logits = logits.float() - # Shift so that tokens < n predict n - shift_logits = logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - loss_fct = CrossEntropyLoss() - shift_logits = shift_logits.view(-1, self.config.vocab_size) - shift_labels = shift_labels.view(-1) - # Enable model parallelism - shift_labels = shift_labels.to(shift_logits.device) - loss = loss_fct(shift_logits, shift_labels) + loss = self.loss_function(logits, labels, self.vocab_size) if not return_dict: output = (logits,) + outputs[1:] diff --git a/src/transformers/models/olmoe/modeling_olmoe.py b/src/transformers/models/olmoe/modeling_olmoe.py index d9d4a9771cd79d..32f7ded42e8901 100644 --- a/src/transformers/models/olmoe/modeling_olmoe.py +++ b/src/transformers/models/olmoe/modeling_olmoe.py @@ -18,7 +18,6 @@ import torch.nn.functional as F import torch.utils.checkpoint from torch import nn -from torch.nn import CrossEntropyLoss from ...activations import ACT2FN from ...cache_utils import Cache, DynamicCache, StaticCache @@ -1291,18 +1290,7 @@ def forward( loss = None if labels is not None: - # Upcast to float if we need to compute the loss to avoid potential precision issues - logits = logits.float() - 
# Shift so that tokens < n predict n - shift_logits = logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - loss_fct = CrossEntropyLoss() - shift_logits = shift_logits.view(-1, self.config.vocab_size) - shift_labels = shift_labels.view(-1) - # Enable model parallelism - shift_labels = shift_labels.to(shift_logits.device) - loss = loss_fct(shift_logits, shift_labels) + loss = self.loss_function(logits, labels, self.vocab_size) aux_loss = None if output_router_logits: diff --git a/src/transformers/models/owlv2/modeling_owlv2.py b/src/transformers/models/owlv2/modeling_owlv2.py index bc6735ff86b562..d773396010a3cb 100644 --- a/src/transformers/models/owlv2/modeling_owlv2.py +++ b/src/transformers/models/owlv2/modeling_owlv2.py @@ -98,7 +98,7 @@ def to_tuple(self) -> Tuple[Any]: ) -# Copied from transformers.models.detr.modeling_detr._upcast +# Copied from transformers.loss.loss_for_object_detection._upcast def _upcast(t: Tensor) -> Tensor: # Protects from numerical overflows in multiplications by upcasting to the equivalent higher type if t.is_floating_point(): @@ -107,7 +107,7 @@ def _upcast(t: Tensor) -> Tensor: return t if t.dtype in (torch.int32, torch.int64) else t.int() -# Copied from transformers.models.detr.modeling_detr.box_area +# Copied from transformers.loss.loss_for_object_detection.box_area def box_area(boxes: Tensor) -> Tensor: """ Computes the area of a set of bounding boxes, which are specified by its (x1, y1, x2, y2) coordinates. @@ -124,7 +124,7 @@ def box_area(boxes: Tensor) -> Tensor: return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) -# Copied from transformers.models.detr.modeling_detr.box_iou +# Copied from transformers.loss.loss_for_object_detection.box_iou def box_iou(boxes1, boxes2): area1 = box_area(boxes1) area2 = box_area(boxes2) @@ -141,7 +141,7 @@ def box_iou(boxes1, boxes2): return iou, union -# Copied from transformers.models.detr.modeling_detr.generalized_box_iou +# Copied from transformers.loss.loss_for_object_detection.generalized_box_iou def generalized_box_iou(boxes1, boxes2): """ Generalized IoU from https://giou.stanford.edu/. The boxes should be in [x0, y0, x1, y1] (corner) format. diff --git a/src/transformers/models/owlvit/modeling_owlvit.py b/src/transformers/models/owlvit/modeling_owlvit.py index 94b815985878a0..7c3e124a207ff7 100644 --- a/src/transformers/models/owlvit/modeling_owlvit.py +++ b/src/transformers/models/owlvit/modeling_owlvit.py @@ -98,7 +98,7 @@ def to_tuple(self) -> Tuple[Any]: ) -# Copied from transformers.models.detr.modeling_detr._upcast +# Copied from transformers.loss.loss_for_object_detection._upcast def _upcast(t: Tensor) -> Tensor: # Protects from numerical overflows in multiplications by upcasting to the equivalent higher type if t.is_floating_point(): @@ -107,7 +107,7 @@ def _upcast(t: Tensor) -> Tensor: return t if t.dtype in (torch.int32, torch.int64) else t.int() -# Copied from transformers.models.detr.modeling_detr.box_area +# Copied from transformers.loss.loss_for_object_detection.box_area def box_area(boxes: Tensor) -> Tensor: """ Computes the area of a set of bounding boxes, which are specified by its (x1, y1, x2, y2) coordinates. 
@@ -124,7 +124,7 @@ def box_area(boxes: Tensor) -> Tensor: return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) -# Copied from transformers.models.detr.modeling_detr.box_iou +# Copied from transformers.loss.loss_for_object_detection.box_iou def box_iou(boxes1, boxes2): area1 = box_area(boxes1) area2 = box_area(boxes2) @@ -141,7 +141,7 @@ def box_iou(boxes1, boxes2): return iou, union -# Copied from transformers.models.detr.modeling_detr.generalized_box_iou +# Copied from transformers.loss.loss_for_object_detection.generalized_box_iou def generalized_box_iou(boxes1, boxes2): """ Generalized IoU from https://giou.stanford.edu/. The boxes should be in [x0, y0, x1, y1] (corner) format. diff --git a/src/transformers/models/persimmon/modeling_persimmon.py b/src/transformers/models/persimmon/modeling_persimmon.py index 61d8b1002f3c15..ddd26729164df2 100644 --- a/src/transformers/models/persimmon/modeling_persimmon.py +++ b/src/transformers/models/persimmon/modeling_persimmon.py @@ -25,7 +25,7 @@ import torch import torch.utils.checkpoint from torch import nn -from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss +from torch.nn import CrossEntropyLoss from ...activations import ACT2FN from ...cache_utils import Cache, DynamicCache, StaticCache @@ -1073,27 +1073,8 @@ def forward( loss = None if labels is not None: - labels = labels.to(logits.device) - if self.config.problem_type is None: - if self.num_labels == 1: - self.config.problem_type = "regression" - elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): - self.config.problem_type = "single_label_classification" - else: - self.config.problem_type = "multi_label_classification" - - if self.config.problem_type == "regression": - loss_fct = MSELoss() - if self.num_labels == 1: - loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) - else: - loss = loss_fct(pooled_logits, labels) - elif self.config.problem_type == "single_label_classification": - loss_fct = CrossEntropyLoss() - loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) - elif self.config.problem_type == "multi_label_classification": - loss_fct = BCEWithLogitsLoss() - loss = loss_fct(pooled_logits, labels) + loss = self.loss_function(logits=logits, labels=labels, pooled_logits=pooled_logits, config=self.config) + if not return_dict: output = (pooled_logits,) + transformer_outputs[1:] return ((loss,) + output) if loss is not None else output @@ -1177,8 +1158,7 @@ def forward( loss = None if labels is not None: - loss_fct = CrossEntropyLoss() - loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + loss = self.loss_function(logits, labels, self.config) if not return_dict: output = (logits,) + outputs[2:] diff --git a/src/transformers/models/phi/modeling_phi.py b/src/transformers/models/phi/modeling_phi.py index 807b3fef4f44a0..ef1a5b4d0ec243 100644 --- a/src/transformers/models/phi/modeling_phi.py +++ b/src/transformers/models/phi/modeling_phi.py @@ -22,7 +22,7 @@ import torch.utils.checkpoint from packaging import version from torch import nn -from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss +from torch.nn import CrossEntropyLoss from ...activations import ACT2FN from ...cache_utils import Cache, DynamicCache, StaticCache @@ -1250,18 +1250,7 @@ def forward( loss = None if labels is not None: - # Upcast to float if we need to compute the loss to avoid potential precision issues - logits = logits.float() - # Shift so that tokens < n predict n - shift_logits = logits[..., :-1, 
:].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - loss_fct = CrossEntropyLoss() - shift_logits = shift_logits.view(-1, self.config.vocab_size) - shift_labels = shift_labels.view(-1) - # Enable model parallelism - shift_labels = shift_labels.to(shift_logits.device) - loss = loss_fct(shift_logits, shift_labels) + loss = self.loss_function(logits, labels, self.vocab_size) if not return_dict: output = (logits,) + outputs[1:] @@ -1366,27 +1355,8 @@ def forward( loss = None if labels is not None: - labels = labels.to(logits.device) - if self.config.problem_type is None: - if self.num_labels == 1: - self.config.problem_type = "regression" - elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): - self.config.problem_type = "single_label_classification" - else: - self.config.problem_type = "multi_label_classification" - - if self.config.problem_type == "regression": - loss_fct = MSELoss() - if self.num_labels == 1: - loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) - else: - loss = loss_fct(pooled_logits, labels) - elif self.config.problem_type == "single_label_classification": - loss_fct = CrossEntropyLoss() - loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) - elif self.config.problem_type == "multi_label_classification": - loss_fct = BCEWithLogitsLoss() - loss = loss_fct(pooled_logits, labels) + loss = self.loss_function(logits=logits, labels=labels, pooled_logits=pooled_logits, config=self.config) + if not return_dict: output = (pooled_logits,) + model_outputs[1:] return ((loss,) + output) if loss is not None else output diff --git a/src/transformers/models/phi3/modeling_phi3.py b/src/transformers/models/phi3/modeling_phi3.py index a0b4b2ec378e32..d0865065db1882 100644 --- a/src/transformers/models/phi3/modeling_phi3.py +++ b/src/transformers/models/phi3/modeling_phi3.py @@ -22,7 +22,7 @@ import torch import torch.utils.checkpoint from torch import nn -from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss +from torch.nn import CrossEntropyLoss from ...activations import ACT2FN from ...cache_utils import Cache, DynamicCache, SlidingWindowCache, StaticCache @@ -1300,18 +1300,7 @@ def forward( loss = None if labels is not None: - # Upcast to float if we need to compute the loss to avoid potential precision issues - logits = logits.float() - # Shift so that tokens < n predict n - shift_logits = logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - loss_fct = CrossEntropyLoss() - shift_logits = shift_logits.view(-1, self.config.vocab_size) - shift_labels = shift_labels.view(-1) - # Enable model parallelism - shift_labels = shift_labels.to(shift_logits.device) - loss = loss_fct(shift_logits, shift_labels) + loss = self.loss_function(logits, labels, self.vocab_size) if not return_dict: output = (logits,) + outputs[1:] @@ -1455,27 +1444,8 @@ def forward( loss = None if labels is not None: - labels = labels.to(logits.device) - if self.config.problem_type is None: - if self.num_labels == 1: - self.config.problem_type = "regression" - elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): - self.config.problem_type = "single_label_classification" - else: - self.config.problem_type = "multi_label_classification" - - if self.config.problem_type == "regression": - loss_fct = MSELoss() - if self.num_labels == 1: - loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) - else: - loss = loss_fct(pooled_logits, 
labels) - elif self.config.problem_type == "single_label_classification": - loss_fct = CrossEntropyLoss() - loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) - elif self.config.problem_type == "multi_label_classification": - loss_fct = BCEWithLogitsLoss() - loss = loss_fct(pooled_logits, labels) + loss = self.loss_function(logits=logits, labels=labels, pooled_logits=pooled_logits, config=self.config) + if not return_dict: output = (pooled_logits,) + model_outputs[1:] return ((loss,) + output) if loss is not None else output diff --git a/src/transformers/models/phimoe/modeling_phimoe.py b/src/transformers/models/phimoe/modeling_phimoe.py index 1da65d7d39be4b..9e24e59c64c2fe 100644 --- a/src/transformers/models/phimoe/modeling_phimoe.py +++ b/src/transformers/models/phimoe/modeling_phimoe.py @@ -21,7 +21,6 @@ import torch import torch.utils.checkpoint from torch import nn -from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from ...activations import ACT2FN from ...cache_utils import Cache, DynamicCache, SlidingWindowCache, StaticCache @@ -1474,18 +1473,7 @@ def forward( loss = None if labels is not None: - # Upcast to float if we need to compute the loss to avoid potential precision issues - logits = logits.float() - # Shift so that tokens < n predict n - shift_logits = logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - loss_fct = CrossEntropyLoss() - shift_logits = shift_logits.view(-1, self.config.vocab_size) - shift_labels = shift_labels.view(-1) - # Enable model parallelism - shift_labels = shift_labels.to(shift_logits.device) - loss = loss_fct(shift_logits, shift_labels) + loss = self.loss_function(logits, labels, self.vocab_size) aux_loss = None if output_router_logits: @@ -1644,27 +1632,8 @@ def forward( loss = None if labels is not None: - labels = labels.to(logits.device) - if self.config.problem_type is None: - if self.num_labels == 1: - self.config.problem_type = "regression" - elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): - self.config.problem_type = "single_label_classification" - else: - self.config.problem_type = "multi_label_classification" - - if self.config.problem_type == "regression": - loss_fct = MSELoss() - if self.num_labels == 1: - loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) - else: - loss = loss_fct(pooled_logits, labels) - elif self.config.problem_type == "single_label_classification": - loss_fct = CrossEntropyLoss() - loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) - elif self.config.problem_type == "multi_label_classification": - loss_fct = BCEWithLogitsLoss() - loss = loss_fct(pooled_logits, labels) + loss = self.loss_function(logits=logits, labels=labels, pooled_logits=pooled_logits, config=self.config) + if not return_dict: output = (pooled_logits,) + transformer_outputs[1:] return ((loss,) + output) if loss is not None else output diff --git a/src/transformers/models/qwen2/modeling_qwen2.py b/src/transformers/models/qwen2/modeling_qwen2.py index 2e59ebd5eb98d1..d6f7cd94288a77 100644 --- a/src/transformers/models/qwen2/modeling_qwen2.py +++ b/src/transformers/models/qwen2/modeling_qwen2.py @@ -1204,18 +1204,7 @@ def forward( loss = None if labels is not None: - # Upcast to float if we need to compute the loss to avoid potential precision issues - logits = logits.float() - # Shift so that tokens < n predict n - shift_logits = logits[..., :-1, :].contiguous() - shift_labels = labels[..., 
1:].contiguous() - # Flatten the tokens - loss_fct = CrossEntropyLoss() - shift_logits = shift_logits.view(-1, self.config.vocab_size) - shift_labels = shift_labels.view(-1) - # Enable model parallelism - shift_labels = shift_labels.to(shift_logits.device) - loss = loss_fct(shift_logits, shift_labels) + loss = self.loss_function(logits, labels, self.vocab_size) if not return_dict: output = (logits,) + outputs[1:] @@ -1423,8 +1412,7 @@ def forward( loss = None if labels is not None: - loss_fct = CrossEntropyLoss() - loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + loss = self.loss_function(logits, labels, self.config) if not return_dict: output = (logits,) + outputs[2:] diff --git a/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py b/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py index 1e741b4a9e3e57..5b0441e02cfbea 100644 --- a/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py +++ b/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py @@ -26,7 +26,6 @@ import torch.nn.functional as F import torch.utils.checkpoint from torch import nn -from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from ...activations import ACT2FN from ...cache_utils import Cache, DynamicCache, SlidingWindowCache, StaticCache @@ -1393,18 +1392,7 @@ def forward( loss = None if labels is not None: - # Upcast to float if we need to compute the loss to avoid potential precision issues - logits = logits.float() - # Shift so that tokens < n predict n - shift_logits = logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - loss_fct = CrossEntropyLoss() - shift_logits = shift_logits.view(-1, self.config.vocab_size) - shift_labels = shift_labels.view(-1) - # Enable model parallelism - shift_labels = shift_labels.to(shift_logits.device) - loss = loss_fct(shift_logits, shift_labels) + loss = self.loss_function(logits, labels, self.vocab_size) aux_loss = None if output_router_logits: @@ -1524,27 +1512,8 @@ def forward( loss = None if labels is not None: - labels = labels.to(logits.device) - if self.config.problem_type is None: - if self.num_labels == 1: - self.config.problem_type = "regression" - elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): - self.config.problem_type = "single_label_classification" - else: - self.config.problem_type = "multi_label_classification" - - if self.config.problem_type == "regression": - loss_fct = MSELoss() - if self.num_labels == 1: - loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) - else: - loss = loss_fct(pooled_logits, labels) - elif self.config.problem_type == "single_label_classification": - loss_fct = CrossEntropyLoss() - loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) - elif self.config.problem_type == "multi_label_classification": - loss_fct = BCEWithLogitsLoss() - loss = loss_fct(pooled_logits, labels) + loss = self.loss_function(logits=logits, labels=labels, pooled_logits=pooled_logits, config=self.config) + if not return_dict: output = (pooled_logits,) + transformer_outputs[1:] return ((loss,) + output) if loss is not None else output @@ -1628,8 +1597,7 @@ def forward( loss = None if labels is not None: - loss_fct = CrossEntropyLoss() - loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + loss = self.loss_function(logits, labels, self.config) if not return_dict: output = (logits,) + outputs[2:] diff --git a/src/transformers/models/rt_detr/modeling_rt_detr.py 
b/src/transformers/models/rt_detr/modeling_rt_detr.py index c4daba6d2747ea..21644c4a869a0a 100644 --- a/src/transformers/models/rt_detr/modeling_rt_detr.py +++ b/src/transformers/models/rt_detr/modeling_rt_detr.py @@ -37,19 +37,14 @@ add_start_docstrings, add_start_docstrings_to_model_forward, is_ninja_available, - is_scipy_available, is_torch_cuda_available, logging, replace_return_docstrings, - requires_backends, ) from ...utils.backbone_utils import load_backbone from .configuration_rt_detr import RTDetrConfig -if is_scipy_available(): - from scipy.optimize import linear_sum_assignment - logger = logging.get_logger(__name__) MultiScaleDeformableAttention = None @@ -1616,6 +1611,29 @@ def wrapper(self, *args, **kwargs): return decorator +# taken from https://github.com/facebookresearch/detr/blob/master/models/detr.py +class RTDetrMLPPredictionHead(nn.Module): + """ + Very simple multi-layer perceptron (MLP, also called FFN), used to predict the normalized center coordinates, + height and width of a bounding box w.r.t. an image. + + Copied from https://github.com/facebookresearch/detr/blob/master/models/detr.py + Origin from https://github.com/lyuwenyu/RT-DETR/blob/94f5e16708329d2f2716426868ec89aa774af016/rtdetr_paddle/ppdet/modeling/transformers/utils.py#L453 + + """ + + def __init__(self, config, input_dim, d_model, output_dim, num_layers): + super().__init__() + self.num_layers = num_layers + h = [d_model] * (num_layers - 1) + self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) + + def forward(self, x): + for i, layer in enumerate(self.layers): + x = nn.functional.relu(layer(x)) if i < self.num_layers - 1 else layer(x) + return x + + @add_start_docstrings( """ RT-DETR Model (consisting of a backbone and encoder-decoder) outputting raw hidden states without any head on top. @@ -1950,588 +1968,6 @@ def forward( ) -# Copied from transformers.models.detr.modeling_detr.dice_loss -def dice_loss(inputs, targets, num_boxes): - """ - Compute the DICE loss, similar to generalized IOU for masks - - Args: - inputs: A float tensor of arbitrary shape. - The predictions for each example. - targets: A float tensor with the same shape as inputs. Stores the binary - classification label for each element in inputs (0 for the negative class and 1 for the positive - class). - """ - inputs = inputs.sigmoid() - inputs = inputs.flatten(1) - numerator = 2 * (inputs * targets).sum(1) - denominator = inputs.sum(-1) + targets.sum(-1) - loss = 1 - (numerator + 1) / (denominator + 1) - return loss.sum() / num_boxes - - -# Copied from transformers.models.detr.modeling_detr.sigmoid_focal_loss -def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: float = 2): - """ - Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002. - - Args: - inputs (`torch.FloatTensor` of arbitrary shape): - The predictions for each example. - targets (`torch.FloatTensor` with the same shape as `inputs`) - A tensor storing the binary classification label for each element in the `inputs` (0 for the negative class - and 1 for the positive class). - alpha (`float`, *optional*, defaults to `0.25`): - Optional weighting factor in the range (0,1) to balance positive vs. negative examples. - gamma (`int`, *optional*, defaults to `2`): - Exponent of the modulating factor (1 - p_t) to balance easy vs hard examples. 
- - Returns: - Loss tensor - """ - prob = inputs.sigmoid() - ce_loss = nn.functional.binary_cross_entropy_with_logits(inputs, targets, reduction="none") - # add modulating factor - p_t = prob * targets + (1 - prob) * (1 - targets) - loss = ce_loss * ((1 - p_t) ** gamma) - - if alpha >= 0: - alpha_t = alpha * targets + (1 - alpha) * (1 - targets) - loss = alpha_t * loss - - return loss.mean(1).sum() / num_boxes - - -class RTDetrLoss(nn.Module): - """ - This class computes the losses for RTDetr. The process happens in two steps: 1) we compute hungarian assignment - between ground truth boxes and the outputs of the model 2) we supervise each pair of matched ground-truth / - prediction (supervise class and box). - - Args: - matcher (`DetrHungarianMatcher`): - Module able to compute a matching between targets and proposals. - weight_dict (`Dict`): - Dictionary relating each loss with its weights. These losses are configured in RTDetrConf as - `weight_loss_vfl`, `weight_loss_bbox`, `weight_loss_giou` - losses (`List[str]`): - List of all the losses to be applied. See `get_loss` for a list of all available losses. - alpha (`float`): - Parameter alpha used to compute the focal loss. - gamma (`float`): - Parameter gamma used to compute the focal loss. - eos_coef (`float`): - Relative classification weight applied to the no-object category. - num_classes (`int`): - Number of object categories, omitting the special no-object category. - """ - - def __init__(self, config): - super().__init__() - - self.matcher = RTDetrHungarianMatcher(config) - self.num_classes = config.num_labels - self.weight_dict = { - "loss_vfl": config.weight_loss_vfl, - "loss_bbox": config.weight_loss_bbox, - "loss_giou": config.weight_loss_giou, - } - self.losses = ["vfl", "boxes"] - self.eos_coef = config.eos_coefficient - empty_weight = torch.ones(config.num_labels + 1) - empty_weight[-1] = self.eos_coef - self.register_buffer("empty_weight", empty_weight) - self.alpha = config.focal_loss_alpha - self.gamma = config.focal_loss_gamma - - def loss_labels_vfl(self, outputs, targets, indices, num_boxes, log=True): - if "pred_boxes" not in outputs: - raise KeyError("No predicted boxes found in outputs") - if "logits" not in outputs: - raise KeyError("No predicted logits found in outputs") - idx = self._get_source_permutation_idx(indices) - - src_boxes = outputs["pred_boxes"][idx] - target_boxes = torch.cat([_target["boxes"][i] for _target, (_, i) in zip(targets, indices)], dim=0) - ious, _ = box_iou(center_to_corners_format(src_boxes), center_to_corners_format(target_boxes)) - ious = torch.diag(ious).detach() - - src_logits = outputs["logits"] - target_classes_original = torch.cat([_target["class_labels"][i] for _target, (_, i) in zip(targets, indices)]) - target_classes = torch.full( - src_logits.shape[:2], self.num_classes, dtype=torch.int64, device=src_logits.device - ) - target_classes[idx] = target_classes_original - target = F.one_hot(target_classes, num_classes=self.num_classes + 1)[..., :-1] - - target_score_original = torch.zeros_like(target_classes, dtype=src_logits.dtype) - target_score_original[idx] = ious.to(target_score_original.dtype) - target_score = target_score_original.unsqueeze(-1) * target - - pred_score = F.sigmoid(src_logits).detach() - weight = self.alpha * pred_score.pow(self.gamma) * (1 - target) + target_score - - loss = F.binary_cross_entropy_with_logits(src_logits, target_score, weight=weight, reduction="none") - loss = loss.mean(1).sum() * src_logits.shape[1] / num_boxes - return {"loss_vfl": loss} - 
- def loss_labels(self, outputs, targets, indices, num_boxes, log=True): - """Classification loss (NLL) - targets dicts must contain the key "class_labels" containing a tensor of dim [nb_target_boxes] - """ - if "logits" not in outputs: - raise KeyError("No logits were found in the outputs") - - src_logits = outputs["logits"] - - idx = self._get_source_permutation_idx(indices) - target_classes_original = torch.cat([_target["class_labels"][i] for _target, (_, i) in zip(targets, indices)]) - target_classes = torch.full( - src_logits.shape[:2], self.num_classes, dtype=torch.int64, device=src_logits.device - ) - target_classes[idx] = target_classes_original - - loss_ce = F.cross_entropy(src_logits.transpose(1, 2), target_classes, self.class_weight) - losses = {"loss_ce": loss_ce} - return losses - - @torch.no_grad() - def loss_cardinality(self, outputs, targets, indices, num_boxes): - """ - Compute the cardinality error, i.e. the absolute error in the number of predicted non-empty boxes. This is not - really a loss, it is intended for logging purposes only. It doesn't propagate gradients. - """ - logits = outputs["logits"] - device = logits.device - target_lengths = torch.as_tensor([len(v["class_labels"]) for v in targets], device=device) - # Count the number of predictions that are NOT "no-object" (which is the last class) - card_pred = (logits.argmax(-1) != logits.shape[-1] - 1).sum(1) - card_err = nn.functional.l1_loss(card_pred.float(), target_lengths.float()) - losses = {"cardinality_error": card_err} - return losses - - def loss_boxes(self, outputs, targets, indices, num_boxes): - """ - Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss. Targets dicts must - contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4]. The target boxes are expected in - format (center_x, center_y, w, h), normalized by the image size. - """ - if "pred_boxes" not in outputs: - raise KeyError("No predicted boxes found in outputs") - idx = self._get_source_permutation_idx(indices) - src_boxes = outputs["pred_boxes"][idx] - target_boxes = torch.cat([t["boxes"][i] for t, (_, i) in zip(targets, indices)], dim=0) - - losses = {} - - loss_bbox = F.l1_loss(src_boxes, target_boxes, reduction="none") - losses["loss_bbox"] = loss_bbox.sum() / num_boxes - - loss_giou = 1 - torch.diag( - generalized_box_iou(center_to_corners_format(src_boxes), center_to_corners_format(target_boxes)) - ) - losses["loss_giou"] = loss_giou.sum() / num_boxes - return losses - - def loss_masks(self, outputs, targets, indices, num_boxes): - """ - Compute the losses related to the masks: the focal loss and the dice loss. Targets dicts must contain the key - "masks" containing a tensor of dim [nb_target_boxes, h, w]. 
- """ - if "pred_masks" not in outputs: - raise KeyError("No predicted masks found in outputs") - - source_idx = self._get_source_permutation_idx(indices) - target_idx = self._get_target_permutation_idx(indices) - source_masks = outputs["pred_masks"] - source_masks = source_masks[source_idx] - masks = [t["masks"] for t in targets] - target_masks, valid = nested_tensor_from_tensor_list(masks).decompose() - target_masks = target_masks.to(source_masks) - target_masks = target_masks[target_idx] - - # upsample predictions to the target size - source_masks = nn.functional.interpolate( - source_masks[:, None], size=target_masks.shape[-2:], mode="bilinear", align_corners=False - ) - source_masks = source_masks[:, 0].flatten(1) - - target_masks = target_masks.flatten(1) - target_masks = target_masks.view(source_masks.shape) - losses = { - "loss_mask": sigmoid_focal_loss(source_masks, target_masks, num_boxes), - "loss_dice": dice_loss(source_masks, target_masks, num_boxes), - } - return losses - - def loss_labels_bce(self, outputs, targets, indices, num_boxes, log=True): - src_logits = outputs["logits"] - idx = self._get_source_permutation_idx(indices) - target_classes_original = torch.cat([_target["class_labels"][i] for _target, (_, i) in zip(targets, indices)]) - target_classes = torch.full( - src_logits.shape[:2], self.num_classes, dtype=torch.int64, device=src_logits.device - ) - target_classes[idx] = target_classes_original - - target = F.one_hot(target_classes, num_classes=self.num_classes + 1)[..., :-1] - loss = F.binary_cross_entropy_with_logits(src_logits, target * 1.0, reduction="none") - loss = loss.mean(1).sum() * src_logits.shape[1] / num_boxes - return {"loss_bce": loss} - - def _get_source_permutation_idx(self, indices): - # permute predictions following indices - batch_idx = torch.cat([torch.full_like(source, i) for i, (source, _) in enumerate(indices)]) - source_idx = torch.cat([source for (source, _) in indices]) - return batch_idx, source_idx - - def _get_target_permutation_idx(self, indices): - # permute targets following indices - batch_idx = torch.cat([torch.full_like(target, i) for i, (_, target) in enumerate(indices)]) - target_idx = torch.cat([target for (_, target) in indices]) - return batch_idx, target_idx - - def loss_labels_focal(self, outputs, targets, indices, num_boxes, log=True): - if "logits" not in outputs: - raise KeyError("No logits found in outputs") - - src_logits = outputs["logits"] - - idx = self._get_source_permutation_idx(indices) - target_classes_original = torch.cat([_target["class_labels"][i] for _target, (_, i) in zip(targets, indices)]) - target_classes = torch.full( - src_logits.shape[:2], self.num_classes, dtype=torch.int64, device=src_logits.device - ) - target_classes[idx] = target_classes_original - - target = F.one_hot(target_classes, num_classes=self.num_classes + 1)[..., :-1] - loss = sigmoid_focal_loss(src_logits, target, self.alpha, self.gamma) - loss = loss.mean(1).sum() * src_logits.shape[1] / num_boxes - return {"loss_focal": loss} - - def get_loss(self, loss, outputs, targets, indices, num_boxes): - loss_map = { - "labels": self.loss_labels, - "cardinality": self.loss_cardinality, - "boxes": self.loss_boxes, - "masks": self.loss_masks, - "bce": self.loss_labels_bce, - "focal": self.loss_labels_focal, - "vfl": self.loss_labels_vfl, - } - if loss not in loss_map: - raise ValueError(f"Loss {loss} not supported") - return loss_map[loss](outputs, targets, indices, num_boxes) - - @staticmethod - def get_cdn_matched_indices(dn_meta, targets): - 
dn_positive_idx, dn_num_group = dn_meta["dn_positive_idx"], dn_meta["dn_num_group"] - num_gts = [len(t["class_labels"]) for t in targets] - device = targets[0]["class_labels"].device - - dn_match_indices = [] - for i, num_gt in enumerate(num_gts): - if num_gt > 0: - gt_idx = torch.arange(num_gt, dtype=torch.int64, device=device) - gt_idx = gt_idx.tile(dn_num_group) - assert len(dn_positive_idx[i]) == len(gt_idx) - dn_match_indices.append((dn_positive_idx[i], gt_idx)) - else: - dn_match_indices.append( - ( - torch.zeros(0, dtype=torch.int64, device=device), - torch.zeros(0, dtype=torch.int64, device=device), - ) - ) - - return dn_match_indices - - def forward(self, outputs, targets): - """ - This performs the loss computation. - - Args: - outputs (`dict`, *optional*): - Dictionary of tensors, see the output specification of the model for the format. - targets (`List[dict]`, *optional*): - List of dicts, such that `len(targets) == batch_size`. The expected keys in each dict depends on the - losses applied, see each loss' doc. - """ - outputs_without_aux = {k: v for k, v in outputs.items() if "auxiliary_outputs" not in k} - - # Retrieve the matching between the outputs of the last layer and the targets - indices = self.matcher(outputs_without_aux, targets) - - # Compute the average number of target boxes across all nodes, for normalization purposes - num_boxes = sum(len(t["class_labels"]) for t in targets) - num_boxes = torch.as_tensor([num_boxes], dtype=torch.float, device=next(iter(outputs.values())).device) - num_boxes = torch.clamp(num_boxes, min=1).item() - - # Compute all the requested losses - losses = {} - for loss in self.losses: - l_dict = self.get_loss(loss, outputs, targets, indices, num_boxes) - l_dict = {k: l_dict[k] * self.weight_dict[k] for k in l_dict if k in self.weight_dict} - losses.update(l_dict) - - # In case of auxiliary losses, we repeat this process with the output of each intermediate layer. - if "auxiliary_outputs" in outputs: - for i, auxiliary_outputs in enumerate(outputs["auxiliary_outputs"]): - indices = self.matcher(auxiliary_outputs, targets) - for loss in self.losses: - if loss == "masks": - # Intermediate masks losses are too costly to compute, we ignore them. - continue - l_dict = self.get_loss(loss, auxiliary_outputs, targets, indices, num_boxes) - l_dict = {k: l_dict[k] * self.weight_dict[k] for k in l_dict if k in self.weight_dict} - l_dict = {k + f"_aux_{i}": v for k, v in l_dict.items()} - losses.update(l_dict) - - # In case of cdn auxiliary losses. For rtdetr - if "dn_auxiliary_outputs" in outputs: - if "denoising_meta_values" not in outputs: - raise ValueError( - "The output must have the 'denoising_meta_values` key. Please, ensure that 'outputs' includes a 'denoising_meta_values' entry." - ) - indices = self.get_cdn_matched_indices(outputs["denoising_meta_values"], targets) - num_boxes = num_boxes * outputs["denoising_meta_values"]["dn_num_group"] - - for i, auxiliary_outputs in enumerate(outputs["dn_auxiliary_outputs"]): - # indices = self.matcher(auxiliary_outputs, targets) - for loss in self.losses: - if loss == "masks": - # Intermediate masks losses are too costly to compute, we ignore them. 
- continue - kwargs = {} - l_dict = self.get_loss(loss, auxiliary_outputs, targets, indices, num_boxes, **kwargs) - l_dict = {k: l_dict[k] * self.weight_dict[k] for k in l_dict if k in self.weight_dict} - l_dict = {k + f"_dn_{i}": v for k, v in l_dict.items()} - losses.update(l_dict) - - return losses - - -# taken from https://github.com/facebookresearch/detr/blob/master/models/detr.py -class RTDetrMLPPredictionHead(nn.Module): - """ - Very simple multi-layer perceptron (MLP, also called FFN), used to predict the normalized center coordinates, - height and width of a bounding box w.r.t. an image. - - Copied from https://github.com/facebookresearch/detr/blob/master/models/detr.py - Origin from https://github.com/lyuwenyu/RT-DETR/blob/94f5e16708329d2f2716426868ec89aa774af016/rtdetr_paddle/ppdet/modeling/transformers/utils.py#L453 - - """ - - def __init__(self, config, input_dim, d_model, output_dim, num_layers): - super().__init__() - self.num_layers = num_layers - h = [d_model] * (num_layers - 1) - self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) - - def forward(self, x): - for i, layer in enumerate(self.layers): - x = nn.functional.relu(layer(x)) if i < self.num_layers - 1 else layer(x) - return x - - -class RTDetrHungarianMatcher(nn.Module): - """This class computes an assignment between the targets and the predictions of the network - - For efficiency reasons, the targets don't include the no_object. Because of this, in general, there are more - predictions than targets. In this case, we do a 1-to-1 matching of the best predictions, while the others are - un-matched (and thus treated as non-objects). - - Args: - config: RTDetrConfig - """ - - def __init__(self, config): - super().__init__() - requires_backends(self, ["scipy"]) - - self.class_cost = config.matcher_class_cost - self.bbox_cost = config.matcher_bbox_cost - self.giou_cost = config.matcher_giou_cost - - self.use_focal_loss = config.use_focal_loss - self.alpha = config.matcher_alpha - self.gamma = config.matcher_gamma - - if self.class_cost == self.bbox_cost == self.giou_cost == 0: - raise ValueError("All costs of the Matcher can't be 0") - - @torch.no_grad() - def forward(self, outputs, targets): - """Performs the matching - - Params: - outputs: This is a dict that contains at least these entries: - "logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits - "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates - - targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing: - "class_labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth - objects in the target) containing the class labels - "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates - - Returns: - A list of size batch_size, containing tuples of (index_i, index_j) where: - - index_i is the indices of the selected predictions (in order) - - index_j is the indices of the corresponding selected targets (in order) - For each batch element, it holds: - len(index_i) = len(index_j) = min(num_queries, num_target_boxes) - """ - batch_size, num_queries = outputs["logits"].shape[:2] - - # We flatten to compute the cost matrices in a batch - out_bbox = outputs["pred_boxes"].flatten(0, 1) # [batch_size * num_queries, 4] - # Also concat the target labels and boxes - target_ids = torch.cat([v["class_labels"] for v in targets]) - target_bbox = 
torch.cat([v["boxes"] for v in targets]) - # Compute the classification cost. Contrary to the loss, we don't use the NLL, - # but approximate it in 1 - proba[target class]. - # The 1 is a constant that doesn't change the matching, it can be ommitted. - if self.use_focal_loss: - out_prob = F.sigmoid(outputs["logits"].flatten(0, 1)) - out_prob = out_prob[:, target_ids] - neg_cost_class = (1 - self.alpha) * (out_prob**self.gamma) * (-(1 - out_prob + 1e-8).log()) - pos_cost_class = self.alpha * ((1 - out_prob) ** self.gamma) * (-(out_prob + 1e-8).log()) - class_cost = pos_cost_class - neg_cost_class - else: - out_prob = outputs["logits"].flatten(0, 1).softmax(-1) # [batch_size * num_queries, num_classes] - class_cost = -out_prob[:, target_ids] - - # Compute the L1 cost between boxes - bbox_cost = torch.cdist(out_bbox, target_bbox, p=1) - # Compute the giou cost betwen boxes - giou_cost = -generalized_box_iou(center_to_corners_format(out_bbox), center_to_corners_format(target_bbox)) - # Compute the final cost matrix - cost_matrix = self.bbox_cost * bbox_cost + self.class_cost * class_cost + self.giou_cost * giou_cost - cost_matrix = cost_matrix.view(batch_size, num_queries, -1).cpu() - - sizes = [len(v["boxes"]) for v in targets] - indices = [linear_sum_assignment(c[i]) for i, c in enumerate(cost_matrix.split(sizes, -1))] - - return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices] - - -# Copied from transformers.models.detr.modeling_detr._upcast -def _upcast(t: Tensor) -> Tensor: - # Protects from numerical overflows in multiplications by upcasting to the equivalent higher type - if t.is_floating_point(): - return t if t.dtype in (torch.float32, torch.float64) else t.float() - else: - return t if t.dtype in (torch.int32, torch.int64) else t.int() - - -# Copied from transformers.models.detr.modeling_detr.box_area -def box_area(boxes: Tensor) -> Tensor: - """ - Computes the area of a set of bounding boxes, which are specified by its (x1, y1, x2, y2) coordinates. - - Args: - boxes (`torch.FloatTensor` of shape `(number_of_boxes, 4)`): - Boxes for which the area will be computed. They are expected to be in (x1, y1, x2, y2) format with `0 <= x1 - < x2` and `0 <= y1 < y2`. - - Returns: - `torch.FloatTensor`: a tensor containing the area for each box. - """ - boxes = _upcast(boxes) - return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) - - -# Copied from transformers.models.detr.modeling_detr.box_iou -def box_iou(boxes1, boxes2): - area1 = box_area(boxes1) - area2 = box_area(boxes2) - - left_top = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2] - right_bottom = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2] - - width_height = (right_bottom - left_top).clamp(min=0) # [N,M,2] - inter = width_height[:, :, 0] * width_height[:, :, 1] # [N,M] - - union = area1[:, None] + area2 - inter - - iou = inter / union - return iou, union - - -# Copied from transformers.models.detr.modeling_detr.generalized_box_iou -def generalized_box_iou(boxes1, boxes2): - """ - Generalized IoU from https://giou.stanford.edu/. The boxes should be in [x0, y0, x1, y1] (corner) format. 
- - Returns: - `torch.FloatTensor`: a [N, M] pairwise matrix, where N = len(boxes1) and M = len(boxes2) - """ - # degenerate boxes gives inf / nan results - # so do an early check - if not (boxes1[:, 2:] >= boxes1[:, :2]).all(): - raise ValueError(f"boxes1 must be in [x0, y0, x1, y1] (corner) format, but got {boxes1}") - if not (boxes2[:, 2:] >= boxes2[:, :2]).all(): - raise ValueError(f"boxes2 must be in [x0, y0, x1, y1] (corner) format, but got {boxes2}") - iou, union = box_iou(boxes1, boxes2) - - top_left = torch.min(boxes1[:, None, :2], boxes2[:, :2]) - bottom_right = torch.max(boxes1[:, None, 2:], boxes2[:, 2:]) - - width_height = (bottom_right - top_left).clamp(min=0) # [N,M,2] - area = width_height[:, :, 0] * width_height[:, :, 1] - - return iou - (area - union) / area - - -# Copied from transformers.models.detr.modeling_detr._max_by_axis -def _max_by_axis(the_list): - # type: (List[List[int]]) -> List[int] - maxes = the_list[0] - for sublist in the_list[1:]: - for index, item in enumerate(sublist): - maxes[index] = max(maxes[index], item) - return maxes - - -# Copied from transformers.models.detr.modeling_detr.NestedTensor -class NestedTensor: - def __init__(self, tensors, mask: Optional[Tensor]): - self.tensors = tensors - self.mask = mask - - def to(self, device): - cast_tensor = self.tensors.to(device) - mask = self.mask - if mask is not None: - cast_mask = mask.to(device) - else: - cast_mask = None - return NestedTensor(cast_tensor, cast_mask) - - def decompose(self): - return self.tensors, self.mask - - def __repr__(self): - return str(self.tensors) - - -# Copied from transformers.models.detr.modeling_detr.nested_tensor_from_tensor_list -def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): - if tensor_list[0].ndim == 3: - max_size = _max_by_axis([list(img.shape) for img in tensor_list]) - batch_shape = [len(tensor_list)] + max_size - batch_size, num_channels, height, width = batch_shape - dtype = tensor_list[0].dtype - device = tensor_list[0].device - tensor = torch.zeros(batch_shape, dtype=dtype, device=device) - mask = torch.ones((batch_size, height, width), dtype=torch.bool, device=device) - for img, pad_img, m in zip(tensor_list, tensor, mask): - pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) - m[: img.shape[1], : img.shape[2]] = False - else: - raise ValueError("Only 3-dimensional tensors are supported") - return NestedTensor(tensor, mask) - - @add_start_docstrings( """ RT-DETR Model (consisting of a backbone and encoder-decoder) outputting bounding boxes and logits to be further @@ -2673,39 +2109,26 @@ def forward( outputs_class = outputs.intermediate_logits if return_dict else outputs[2] outputs_coord = outputs.intermediate_reference_points if return_dict else outputs[3] - if self.training and denoising_meta_values is not None: - dn_out_coord, outputs_coord = torch.split(outputs_coord, denoising_meta_values["dn_num_split"], dim=2) - dn_out_class, outputs_class = torch.split(outputs_class, denoising_meta_values["dn_num_split"], dim=2) - logits = outputs_class[:, -1] pred_boxes = outputs_coord[:, -1] - loss, loss_dict, auxiliary_outputs = None, None, None + loss, loss_dict, auxiliary_outputs, enc_topk_logits, enc_topk_bboxes = None, None, None, None, None if labels is not None: - # First: create the criterion - criterion = RTDetrLoss(self.config) - criterion.to(self.device) - # Second: compute the losses, based on outputs and labels - outputs_loss = {} - outputs_loss["logits"] = logits - outputs_loss["pred_boxes"] = pred_boxes - if 
self.config.auxiliary_loss: + if self.training and denoising_meta_values is not None: enc_topk_logits = outputs.enc_topk_logits if return_dict else outputs[-5] enc_topk_bboxes = outputs.enc_topk_bboxes if return_dict else outputs[-4] - auxiliary_outputs = self._set_aux_loss( - outputs_class[:, :-1].transpose(0, 1), outputs_coord[:, :-1].transpose(0, 1) - ) - outputs_loss["auxiliary_outputs"] = auxiliary_outputs - outputs_loss["auxiliary_outputs"].extend(self._set_aux_loss([enc_topk_logits], [enc_topk_bboxes])) - if self.training and denoising_meta_values is not None: - outputs_loss["dn_auxiliary_outputs"] = self._set_aux_loss( - dn_out_class.transpose(0, 1), dn_out_coord.transpose(0, 1) - ) - outputs_loss["denoising_meta_values"] = denoising_meta_values - - loss_dict = criterion(outputs_loss, labels) - - loss = sum(loss_dict.values()) + loss, loss_dict, auxiliary_outputs = self.loss_function( + logits, + labels, + self.device, + pred_boxes, + self.config, + outputs_class, + outputs_coord, + enc_topk_logits=enc_topk_logits, + enc_topk_bboxes=enc_topk_bboxes, + denoising_meta_values=denoising_meta_values, + ) if not return_dict: if auxiliary_outputs is not None: diff --git a/src/transformers/models/stablelm/modeling_stablelm.py b/src/transformers/models/stablelm/modeling_stablelm.py index 9b445596f6578f..a2356258ce38ed 100755 --- a/src/transformers/models/stablelm/modeling_stablelm.py +++ b/src/transformers/models/stablelm/modeling_stablelm.py @@ -25,7 +25,7 @@ import torch import torch.utils.checkpoint from torch import nn -from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss +from torch.nn import CrossEntropyLoss from ...activations import ACT2FN from ...cache_utils import Cache, DynamicCache, StaticCache @@ -1349,27 +1349,8 @@ def forward( loss = None if labels is not None: - labels = labels.to(logits.device) - if self.config.problem_type is None: - if self.num_labels == 1: - self.config.problem_type = "regression" - elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): - self.config.problem_type = "single_label_classification" - else: - self.config.problem_type = "multi_label_classification" - - if self.config.problem_type == "regression": - loss_fct = MSELoss() - if self.num_labels == 1: - loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) - else: - loss = loss_fct(pooled_logits, labels) - elif self.config.problem_type == "single_label_classification": - loss_fct = CrossEntropyLoss() - loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) - elif self.config.problem_type == "multi_label_classification": - loss_fct = BCEWithLogitsLoss() - loss = loss_fct(pooled_logits, labels) + loss = self.loss_function(logits=logits, labels=labels, pooled_logits=pooled_logits, config=self.config) + if not return_dict: output = (pooled_logits,) + transformer_outputs[1:] return ((loss,) + output) if loss is not None else output @@ -1453,8 +1434,7 @@ def forward( loss = None if labels is not None: - loss_fct = CrossEntropyLoss() - loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + loss = self.loss_function(logits, labels, self.config) if not return_dict: output = (logits,) + outputs[2:] diff --git a/src/transformers/models/starcoder2/modeling_starcoder2.py b/src/transformers/models/starcoder2/modeling_starcoder2.py index 66d36d6db7ce7b..1779337b1a0093 100644 --- a/src/transformers/models/starcoder2/modeling_starcoder2.py +++ b/src/transformers/models/starcoder2/modeling_starcoder2.py @@ -25,7 +25,7 @@ import torch 
import torch.utils.checkpoint from torch import nn -from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss +from torch.nn import CrossEntropyLoss from ...activations import ACT2FN from ...cache_utils import Cache, DynamicCache, SlidingWindowCache, StaticCache @@ -1295,27 +1295,8 @@ def forward( loss = None if labels is not None: - labels = labels.to(logits.device) - if self.config.problem_type is None: - if self.num_labels == 1: - self.config.problem_type = "regression" - elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): - self.config.problem_type = "single_label_classification" - else: - self.config.problem_type = "multi_label_classification" - - if self.config.problem_type == "regression": - loss_fct = MSELoss() - if self.num_labels == 1: - loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) - else: - loss = loss_fct(pooled_logits, labels) - elif self.config.problem_type == "single_label_classification": - loss_fct = CrossEntropyLoss() - loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) - elif self.config.problem_type == "multi_label_classification": - loss_fct = BCEWithLogitsLoss() - loss = loss_fct(pooled_logits, labels) + loss = self.loss_function(logits=logits, labels=labels, pooled_logits=pooled_logits, config=self.config) + if not return_dict: output = (pooled_logits,) + transformer_outputs[1:] return ((loss,) + output) if loss is not None else output @@ -1399,8 +1380,7 @@ def forward( loss = None if labels is not None: - loss_fct = CrossEntropyLoss() - loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + loss = self.loss_function(logits, labels, self.config) if not return_dict: output = (logits,) + outputs[2:] diff --git a/src/transformers/models/table_transformer/modeling_table_transformer.py b/src/transformers/models/table_transformer/modeling_table_transformer.py index 38978e9adad88a..be57ab46016ccf 100644 --- a/src/transformers/models/table_transformer/modeling_table_transformer.py +++ b/src/transformers/models/table_transformer/modeling_table_transformer.py @@ -29,10 +29,7 @@ ModelOutput, add_start_docstrings, add_start_docstrings_to_model_forward, - is_accelerate_available, - is_scipy_available, is_timm_available, - is_vision_available, logging, replace_return_docstrings, requires_backends, @@ -41,18 +38,9 @@ from .configuration_table_transformer import TableTransformerConfig -if is_scipy_available(): - from scipy.optimize import linear_sum_assignment - if is_timm_available(): from timm import create_model -if is_vision_available(): - from transformers.image_transforms import center_to_corners_format - -if is_accelerate_available(): - from accelerate import PartialState - from accelerate.utils import reduce logger = logging.get_logger(__name__) @@ -1312,14 +1300,6 @@ def __init__(self, config: TableTransformerConfig): # Initialize weights and apply final processing self.post_init() - @torch.jit.unused - # Copied from transformers.models.detr.modeling_detr.DetrForObjectDetection._set_aux_loss - def _set_aux_loss(self, outputs_class, outputs_coord): - # this is a workaround to make torchscript happy, as torchscript - # doesn't support dictionary with non-homogeneous values, such - # as a dict having both a Tensor and a list. 
- return [{"logits": a, "pred_boxes": b} for a, b in zip(outputs_class[:-1], outputs_coord[:-1])] - @add_start_docstrings_to_model_forward(TABLE_TRANSFORMER_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=TableTransformerObjectDetectionOutput, config_class=_CONFIG_FOR_DOC) def forward( @@ -1398,40 +1378,14 @@ def forward( loss, loss_dict, auxiliary_outputs = None, None, None if labels is not None: - # First: create the matcher - matcher = TableTransformerHungarianMatcher( - class_cost=self.config.class_cost, bbox_cost=self.config.bbox_cost, giou_cost=self.config.giou_cost - ) - # Second: create the criterion - losses = ["labels", "boxes", "cardinality"] - criterion = TableTransformerLoss( - matcher=matcher, - num_classes=self.config.num_labels, - eos_coef=self.config.eos_coefficient, - losses=losses, - ) - criterion.to(self.device) - # Third: compute the losses, based on outputs and labels - outputs_loss = {} - outputs_loss["logits"] = logits - outputs_loss["pred_boxes"] = pred_boxes + outputs_class, outputs_coord = None, None if self.config.auxiliary_loss: intermediate = outputs.intermediate_hidden_states if return_dict else outputs[4] outputs_class = self.class_labels_classifier(intermediate) outputs_coord = self.bbox_predictor(intermediate).sigmoid() - auxiliary_outputs = self._set_aux_loss(outputs_class, outputs_coord) - outputs_loss["auxiliary_outputs"] = auxiliary_outputs - - loss_dict = criterion(outputs_loss, labels) - # Fourth: compute total loss, as a weighted sum of the various losses - weight_dict = {"loss_ce": 1, "loss_bbox": self.config.bbox_loss_coefficient} - weight_dict["loss_giou"] = self.config.giou_loss_coefficient - if self.config.auxiliary_loss: - aux_weight_dict = {} - for i in range(self.config.decoder_layers - 1): - aux_weight_dict.update({k + f"_{i}": v for k, v in weight_dict.items()}) - weight_dict.update(aux_weight_dict) - loss = sum(loss_dict[k] * weight_dict[k] for k in loss_dict.keys() if k in weight_dict) + loss, loss_dict, auxiliary_outputs = self.loss_function( + logits, labels, self.device, pred_boxes, self.config, outputs_class, outputs_coord + ) if not return_dict: if auxiliary_outputs is not None: @@ -1456,258 +1410,6 @@ def forward( ) -# Copied from transformers.models.detr.modeling_detr.dice_loss -def dice_loss(inputs, targets, num_boxes): - """ - Compute the DICE loss, similar to generalized IOU for masks - - Args: - inputs: A float tensor of arbitrary shape. - The predictions for each example. - targets: A float tensor with the same shape as inputs. Stores the binary - classification label for each element in inputs (0 for the negative class and 1 for the positive - class). - """ - inputs = inputs.sigmoid() - inputs = inputs.flatten(1) - numerator = 2 * (inputs * targets).sum(1) - denominator = inputs.sum(-1) + targets.sum(-1) - loss = 1 - (numerator + 1) / (denominator + 1) - return loss.sum() / num_boxes - - -# Copied from transformers.models.detr.modeling_detr.sigmoid_focal_loss -def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: float = 2): - """ - Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002. - - Args: - inputs (`torch.FloatTensor` of arbitrary shape): - The predictions for each example. - targets (`torch.FloatTensor` with the same shape as `inputs`) - A tensor storing the binary classification label for each element in the `inputs` (0 for the negative class - and 1 for the positive class). 
- alpha (`float`, *optional*, defaults to `0.25`): - Optional weighting factor in the range (0,1) to balance positive vs. negative examples. - gamma (`int`, *optional*, defaults to `2`): - Exponent of the modulating factor (1 - p_t) to balance easy vs hard examples. - - Returns: - Loss tensor - """ - prob = inputs.sigmoid() - ce_loss = nn.functional.binary_cross_entropy_with_logits(inputs, targets, reduction="none") - # add modulating factor - p_t = prob * targets + (1 - prob) * (1 - targets) - loss = ce_loss * ((1 - p_t) ** gamma) - - if alpha >= 0: - alpha_t = alpha * targets + (1 - alpha) * (1 - targets) - loss = alpha_t * loss - - return loss.mean(1).sum() / num_boxes - - -# Copied from transformers.models.detr.modeling_detr.DetrLoss with Detr->TableTransformer,detr->table_transformer -class TableTransformerLoss(nn.Module): - """ - This class computes the losses for TableTransformerForObjectDetection/TableTransformerForSegmentation. The process happens in two steps: 1) - we compute hungarian assignment between ground truth boxes and the outputs of the model 2) we supervise each pair - of matched ground-truth / prediction (supervise class and box). - - A note on the `num_classes` argument (copied from original repo in table_transformer.py): "the naming of the `num_classes` - parameter of the criterion is somewhat misleading. It indeed corresponds to `max_obj_id` + 1, where `max_obj_id` is - the maximum id for a class in your dataset. For example, COCO has a `max_obj_id` of 90, so we pass `num_classes` to - be 91. As another example, for a dataset that has a single class with `id` 1, you should pass `num_classes` to be 2 - (`max_obj_id` + 1). For more details on this, check the following discussion - https://github.com/facebookresearch/table_transformer/issues/108#issuecomment-650269223" - - - Args: - matcher (`TableTransformerHungarianMatcher`): - Module able to compute a matching between targets and proposals. - num_classes (`int`): - Number of object categories, omitting the special no-object category. - eos_coef (`float`): - Relative classification weight applied to the no-object category. - losses (`List[str]`): - List of all the losses to be applied. See `get_loss` for a list of all available losses. 
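A small, self-contained illustration (not part of the patch) of the focal-loss formula removed above: confident, correct predictions are strongly down-weighted compared with plain binary cross-entropy.

```python
import torch
from torch import nn

inputs = torch.tensor([[4.0, -4.0]])   # confident logits
targets = torch.tensor([[1.0, 0.0]])   # ... that are also correct
alpha, gamma = 0.25, 2

prob = inputs.sigmoid()
ce_loss = nn.functional.binary_cross_entropy_with_logits(inputs, targets, reduction="none")
p_t = prob * targets + (1 - prob) * (1 - targets)
alpha_t = alpha * targets + (1 - alpha) * (1 - targets)
focal = alpha_t * ce_loss * (1 - p_t) ** gamma

print(ce_loss.mean().item())  # ~0.018
print(focal.mean().item())    # several orders of magnitude smaller
```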
- """ - - def __init__(self, matcher, num_classes, eos_coef, losses): - super().__init__() - self.matcher = matcher - self.num_classes = num_classes - self.eos_coef = eos_coef - self.losses = losses - empty_weight = torch.ones(self.num_classes + 1) - empty_weight[-1] = self.eos_coef - self.register_buffer("empty_weight", empty_weight) - - # removed logging parameter, which was part of the original implementation - def loss_labels(self, outputs, targets, indices, num_boxes): - """ - Classification loss (NLL) targets dicts must contain the key "class_labels" containing a tensor of dim - [nb_target_boxes] - """ - if "logits" not in outputs: - raise KeyError("No logits were found in the outputs") - source_logits = outputs["logits"] - - idx = self._get_source_permutation_idx(indices) - target_classes_o = torch.cat([t["class_labels"][J] for t, (_, J) in zip(targets, indices)]) - target_classes = torch.full( - source_logits.shape[:2], self.num_classes, dtype=torch.int64, device=source_logits.device - ) - target_classes[idx] = target_classes_o - - loss_ce = nn.functional.cross_entropy(source_logits.transpose(1, 2), target_classes, self.empty_weight) - losses = {"loss_ce": loss_ce} - - return losses - - @torch.no_grad() - def loss_cardinality(self, outputs, targets, indices, num_boxes): - """ - Compute the cardinality error, i.e. the absolute error in the number of predicted non-empty boxes. - - This is not really a loss, it is intended for logging purposes only. It doesn't propagate gradients. - """ - logits = outputs["logits"] - device = logits.device - target_lengths = torch.as_tensor([len(v["class_labels"]) for v in targets], device=device) - # Count the number of predictions that are NOT "no-object" (which is the last class) - card_pred = (logits.argmax(-1) != logits.shape[-1] - 1).sum(1) - card_err = nn.functional.l1_loss(card_pred.float(), target_lengths.float()) - losses = {"cardinality_error": card_err} - return losses - - def loss_boxes(self, outputs, targets, indices, num_boxes): - """ - Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss. - - Targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4]. The target boxes - are expected in format (center_x, center_y, w, h), normalized by the image size. - """ - if "pred_boxes" not in outputs: - raise KeyError("No predicted boxes found in outputs") - idx = self._get_source_permutation_idx(indices) - source_boxes = outputs["pred_boxes"][idx] - target_boxes = torch.cat([t["boxes"][i] for t, (_, i) in zip(targets, indices)], dim=0) - - loss_bbox = nn.functional.l1_loss(source_boxes, target_boxes, reduction="none") - - losses = {} - losses["loss_bbox"] = loss_bbox.sum() / num_boxes - - loss_giou = 1 - torch.diag( - generalized_box_iou(center_to_corners_format(source_boxes), center_to_corners_format(target_boxes)) - ) - losses["loss_giou"] = loss_giou.sum() / num_boxes - return losses - - def loss_masks(self, outputs, targets, indices, num_boxes): - """ - Compute the losses related to the masks: the focal loss and the dice loss. - - Targets dicts must contain the key "masks" containing a tensor of dim [nb_target_boxes, h, w]. 
- """ - if "pred_masks" not in outputs: - raise KeyError("No predicted masks found in outputs") - - source_idx = self._get_source_permutation_idx(indices) - target_idx = self._get_target_permutation_idx(indices) - source_masks = outputs["pred_masks"] - source_masks = source_masks[source_idx] - masks = [t["masks"] for t in targets] - # TODO use valid to mask invalid areas due to padding in loss - target_masks, valid = nested_tensor_from_tensor_list(masks).decompose() - target_masks = target_masks.to(source_masks) - target_masks = target_masks[target_idx] - - # upsample predictions to the target size - source_masks = nn.functional.interpolate( - source_masks[:, None], size=target_masks.shape[-2:], mode="bilinear", align_corners=False - ) - source_masks = source_masks[:, 0].flatten(1) - - target_masks = target_masks.flatten(1) - target_masks = target_masks.view(source_masks.shape) - losses = { - "loss_mask": sigmoid_focal_loss(source_masks, target_masks, num_boxes), - "loss_dice": dice_loss(source_masks, target_masks, num_boxes), - } - return losses - - def _get_source_permutation_idx(self, indices): - # permute predictions following indices - batch_idx = torch.cat([torch.full_like(source, i) for i, (source, _) in enumerate(indices)]) - source_idx = torch.cat([source for (source, _) in indices]) - return batch_idx, source_idx - - def _get_target_permutation_idx(self, indices): - # permute targets following indices - batch_idx = torch.cat([torch.full_like(target, i) for i, (_, target) in enumerate(indices)]) - target_idx = torch.cat([target for (_, target) in indices]) - return batch_idx, target_idx - - def get_loss(self, loss, outputs, targets, indices, num_boxes): - loss_map = { - "labels": self.loss_labels, - "cardinality": self.loss_cardinality, - "boxes": self.loss_boxes, - "masks": self.loss_masks, - } - if loss not in loss_map: - raise ValueError(f"Loss {loss} not supported") - return loss_map[loss](outputs, targets, indices, num_boxes) - - def forward(self, outputs, targets): - """ - This performs the loss computation. - - Args: - outputs (`dict`, *optional*): - Dictionary of tensors, see the output specification of the model for the format. - targets (`List[dict]`, *optional*): - List of dicts, such that `len(targets) == batch_size`. The expected keys in each dict depends on the - losses applied, see each loss' doc. - """ - outputs_without_aux = {k: v for k, v in outputs.items() if k != "auxiliary_outputs"} - - # Retrieve the matching between the outputs of the last layer and the targets - indices = self.matcher(outputs_without_aux, targets) - - # Compute the average number of target boxes across all nodes, for normalization purposes - num_boxes = sum(len(t["class_labels"]) for t in targets) - num_boxes = torch.as_tensor([num_boxes], dtype=torch.float, device=next(iter(outputs.values())).device) - world_size = 1 - if is_accelerate_available(): - if PartialState._shared_state != {}: - num_boxes = reduce(num_boxes) - world_size = PartialState().num_processes - num_boxes = torch.clamp(num_boxes / world_size, min=1).item() - - # Compute all the requested losses - losses = {} - for loss in self.losses: - losses.update(self.get_loss(loss, outputs, targets, indices, num_boxes)) - - # In case of auxiliary losses, we repeat this process with the output of each intermediate layer. 
- if "auxiliary_outputs" in outputs: - for i, auxiliary_outputs in enumerate(outputs["auxiliary_outputs"]): - indices = self.matcher(auxiliary_outputs, targets) - for loss in self.losses: - if loss == "masks": - # Intermediate masks losses are too costly to compute, we ignore them. - continue - l_dict = self.get_loss(loss, auxiliary_outputs, targets, indices, num_boxes) - l_dict = {k + f"_{i}": v for k, v in l_dict.items()} - losses.update(l_dict) - - return losses - - # Copied from transformers.models.detr.modeling_detr.DetrMLPPredictionHead with Detr->TableTransformer,detr->table_transformer class TableTransformerMLPPredictionHead(nn.Module): """ @@ -1728,200 +1430,3 @@ def forward(self, x): for i, layer in enumerate(self.layers): x = nn.functional.relu(layer(x)) if i < self.num_layers - 1 else layer(x) return x - - -# Copied from transformers.models.detr.modeling_detr.DetrHungarianMatcher with Detr->TableTransformer -class TableTransformerHungarianMatcher(nn.Module): - """ - This class computes an assignment between the targets and the predictions of the network. - - For efficiency reasons, the targets don't include the no_object. Because of this, in general, there are more - predictions than targets. In this case, we do a 1-to-1 matching of the best predictions, while the others are - un-matched (and thus treated as non-objects). - - Args: - class_cost: - The relative weight of the classification error in the matching cost. - bbox_cost: - The relative weight of the L1 error of the bounding box coordinates in the matching cost. - giou_cost: - The relative weight of the giou loss of the bounding box in the matching cost. - """ - - def __init__(self, class_cost: float = 1, bbox_cost: float = 1, giou_cost: float = 1): - super().__init__() - requires_backends(self, ["scipy"]) - - self.class_cost = class_cost - self.bbox_cost = bbox_cost - self.giou_cost = giou_cost - if class_cost == 0 and bbox_cost == 0 and giou_cost == 0: - raise ValueError("All costs of the Matcher can't be 0") - - @torch.no_grad() - def forward(self, outputs, targets): - """ - Args: - outputs (`dict`): - A dictionary that contains at least these entries: - * "logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits - * "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates. - targets (`List[dict]`): - A list of targets (len(targets) = batch_size), where each target is a dict containing: - * "class_labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of - ground-truth - objects in the target) containing the class labels - * "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates. 
- - Returns: - `List[Tuple]`: A list of size `batch_size`, containing tuples of (index_i, index_j) where: - - index_i is the indices of the selected predictions (in order) - - index_j is the indices of the corresponding selected targets (in order) - For each batch element, it holds: len(index_i) = len(index_j) = min(num_queries, num_target_boxes) - """ - batch_size, num_queries = outputs["logits"].shape[:2] - - # We flatten to compute the cost matrices in a batch - out_prob = outputs["logits"].flatten(0, 1).softmax(-1) # [batch_size * num_queries, num_classes] - out_bbox = outputs["pred_boxes"].flatten(0, 1) # [batch_size * num_queries, 4] - - # Also concat the target labels and boxes - target_ids = torch.cat([v["class_labels"] for v in targets]) - target_bbox = torch.cat([v["boxes"] for v in targets]) - - # Compute the classification cost. Contrary to the loss, we don't use the NLL, - # but approximate it in 1 - proba[target class]. - # The 1 is a constant that doesn't change the matching, it can be ommitted. - class_cost = -out_prob[:, target_ids] - - # Compute the L1 cost between boxes - bbox_cost = torch.cdist(out_bbox, target_bbox, p=1) - - # Compute the giou cost between boxes - giou_cost = -generalized_box_iou(center_to_corners_format(out_bbox), center_to_corners_format(target_bbox)) - - # Final cost matrix - cost_matrix = self.bbox_cost * bbox_cost + self.class_cost * class_cost + self.giou_cost * giou_cost - cost_matrix = cost_matrix.view(batch_size, num_queries, -1).cpu() - - sizes = [len(v["boxes"]) for v in targets] - indices = [linear_sum_assignment(c[i]) for i, c in enumerate(cost_matrix.split(sizes, -1))] - return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices] - - -# Copied from transformers.models.detr.modeling_detr._upcast -def _upcast(t: Tensor) -> Tensor: - # Protects from numerical overflows in multiplications by upcasting to the equivalent higher type - if t.is_floating_point(): - return t if t.dtype in (torch.float32, torch.float64) else t.float() - else: - return t if t.dtype in (torch.int32, torch.int64) else t.int() - - -# Copied from transformers.models.detr.modeling_detr.box_area -def box_area(boxes: Tensor) -> Tensor: - """ - Computes the area of a set of bounding boxes, which are specified by its (x1, y1, x2, y2) coordinates. - - Args: - boxes (`torch.FloatTensor` of shape `(number_of_boxes, 4)`): - Boxes for which the area will be computed. They are expected to be in (x1, y1, x2, y2) format with `0 <= x1 - < x2` and `0 <= y1 < y2`. - - Returns: - `torch.FloatTensor`: a tensor containing the area for each box. - """ - boxes = _upcast(boxes) - return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) - - -# Copied from transformers.models.detr.modeling_detr.box_iou -def box_iou(boxes1, boxes2): - area1 = box_area(boxes1) - area2 = box_area(boxes2) - - left_top = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2] - right_bottom = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2] - - width_height = (right_bottom - left_top).clamp(min=0) # [N,M,2] - inter = width_height[:, :, 0] * width_height[:, :, 1] # [N,M] - - union = area1[:, None] + area2 - inter - - iou = inter / union - return iou, union - - -# Copied from transformers.models.detr.modeling_detr.generalized_box_iou -def generalized_box_iou(boxes1, boxes2): - """ - Generalized IoU from https://giou.stanford.edu/. The boxes should be in [x0, y0, x1, y1] (corner) format. 
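The matcher removed above combines the classification, L1 and GIoU costs into a single matrix and then runs SciPy's Hungarian solver once per image. A toy example of that final step (not part of the patch), with 3 queries and 2 ground-truth boxes:

```python
import torch
from scipy.optimize import linear_sum_assignment

cost_matrix = torch.tensor([[0.9, 0.1],
                            [0.4, 0.8],
                            [0.2, 0.6]])  # rows = queries, cols = targets
row_idx, col_idx = linear_sum_assignment(cost_matrix.numpy())
print(list(zip(row_idx.tolist(), col_idx.tolist())))  # [(0, 1), (2, 0)] -> minimal total cost 0.3
```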
- - Returns: - `torch.FloatTensor`: a [N, M] pairwise matrix, where N = len(boxes1) and M = len(boxes2) - """ - # degenerate boxes gives inf / nan results - # so do an early check - if not (boxes1[:, 2:] >= boxes1[:, :2]).all(): - raise ValueError(f"boxes1 must be in [x0, y0, x1, y1] (corner) format, but got {boxes1}") - if not (boxes2[:, 2:] >= boxes2[:, :2]).all(): - raise ValueError(f"boxes2 must be in [x0, y0, x1, y1] (corner) format, but got {boxes2}") - iou, union = box_iou(boxes1, boxes2) - - top_left = torch.min(boxes1[:, None, :2], boxes2[:, :2]) - bottom_right = torch.max(boxes1[:, None, 2:], boxes2[:, 2:]) - - width_height = (bottom_right - top_left).clamp(min=0) # [N,M,2] - area = width_height[:, :, 0] * width_height[:, :, 1] - - return iou - (area - union) / area - - -# Copied from transformers.models.detr.modeling_detr._max_by_axis -def _max_by_axis(the_list): - # type: (List[List[int]]) -> List[int] - maxes = the_list[0] - for sublist in the_list[1:]: - for index, item in enumerate(sublist): - maxes[index] = max(maxes[index], item) - return maxes - - -# Copied from transformers.models.detr.modeling_detr.NestedTensor -class NestedTensor: - def __init__(self, tensors, mask: Optional[Tensor]): - self.tensors = tensors - self.mask = mask - - def to(self, device): - cast_tensor = self.tensors.to(device) - mask = self.mask - if mask is not None: - cast_mask = mask.to(device) - else: - cast_mask = None - return NestedTensor(cast_tensor, cast_mask) - - def decompose(self): - return self.tensors, self.mask - - def __repr__(self): - return str(self.tensors) - - -# Copied from transformers.models.detr.modeling_detr.nested_tensor_from_tensor_list -def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): - if tensor_list[0].ndim == 3: - max_size = _max_by_axis([list(img.shape) for img in tensor_list]) - batch_shape = [len(tensor_list)] + max_size - batch_size, num_channels, height, width = batch_shape - dtype = tensor_list[0].dtype - device = tensor_list[0].device - tensor = torch.zeros(batch_shape, dtype=dtype, device=device) - mask = torch.ones((batch_size, height, width), dtype=torch.bool, device=device) - for img, pad_img, m in zip(tensor_list, tensor, mask): - pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) - m[: img.shape[1], : img.shape[2]] = False - else: - raise ValueError("Only 3-dimensional tensors are supported") - return NestedTensor(tensor, mask) diff --git a/src/transformers/models/yolos/modeling_yolos.py b/src/transformers/models/yolos/modeling_yolos.py index 9b97d39b4a03c2..2d00c973b85c18 100755 --- a/src/transformers/models/yolos/modeling_yolos.py +++ b/src/transformers/models/yolos/modeling_yolos.py @@ -21,7 +21,7 @@ import torch import torch.utils.checkpoint -from torch import Tensor, nn +from torch import nn from ...activations import ACT2FN from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling @@ -32,26 +32,12 @@ add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, - is_accelerate_available, - is_scipy_available, - is_vision_available, logging, replace_return_docstrings, - requires_backends, ) from .configuration_yolos import YolosConfig -if is_scipy_available(): - from scipy.optimize import linear_sum_assignment - -if is_vision_available(): - from transformers.image_transforms import center_to_corners_format - -if is_accelerate_available(): - from accelerate import PartialState - from accelerate.utils import reduce - logger = logging.get_logger(__name__) # General docstring @@ -728,6 
+714,28 @@ def forward(self, hidden_states): return pooled_output +# Copied from transformers.models.detr.modeling_detr.DetrMLPPredictionHead with Detr->Yolos +class YolosMLPPredictionHead(nn.Module): + """ + Very simple multi-layer perceptron (MLP, also called FFN), used to predict the normalized center coordinates, + height and width of a bounding box w.r.t. an image. + + Copied from https://github.com/facebookresearch/detr/blob/master/models/detr.py + + """ + + def __init__(self, input_dim, hidden_dim, output_dim, num_layers): + super().__init__() + self.num_layers = num_layers + h = [hidden_dim] * (num_layers - 1) + self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) + + def forward(self, x): + for i, layer in enumerate(self.layers): + x = nn.functional.relu(layer(x)) if i < self.num_layers - 1 else layer(x) + return x + + @add_start_docstrings( """ YOLOS Model (consisting of a ViT encoder) with object detection heads on top, for tasks such as COCO detection. @@ -837,40 +845,14 @@ def forward( loss, loss_dict, auxiliary_outputs = None, None, None if labels is not None: - # First: create the matcher - matcher = YolosHungarianMatcher( - class_cost=self.config.class_cost, bbox_cost=self.config.bbox_cost, giou_cost=self.config.giou_cost - ) - # Second: create the criterion - losses = ["labels", "boxes", "cardinality"] - criterion = YolosLoss( - matcher=matcher, - num_classes=self.config.num_labels, - eos_coef=self.config.eos_coefficient, - losses=losses, - ) - criterion.to(self.device) - # Third: compute the losses, based on outputs and labels - outputs_loss = {} - outputs_loss["logits"] = logits - outputs_loss["pred_boxes"] = pred_boxes + outputs_class, outputs_coord = None, None if self.config.auxiliary_loss: intermediate = outputs.intermediate_hidden_states if return_dict else outputs[4] outputs_class = self.class_labels_classifier(intermediate) outputs_coord = self.bbox_predictor(intermediate).sigmoid() - auxiliary_outputs = self._set_aux_loss(outputs_class, outputs_coord) - outputs_loss["auxiliary_outputs"] = auxiliary_outputs - - loss_dict = criterion(outputs_loss, labels) - # Fourth: compute total loss, as a weighted sum of the various losses - weight_dict = {"loss_ce": 1, "loss_bbox": self.config.bbox_loss_coefficient} - weight_dict["loss_giou"] = self.config.giou_loss_coefficient - if self.config.auxiliary_loss: - aux_weight_dict = {} - for i in range(self.config.decoder_layers - 1): - aux_weight_dict.update({k + f"_{i}": v for k, v in weight_dict.items()}) - weight_dict.update(aux_weight_dict) - loss = sum(loss_dict[k] * weight_dict[k] for k in loss_dict.keys() if k in weight_dict) + loss, loss_dict, auxiliary_outputs = self.loss_function( + logits, labels, self.device, pred_boxes, self.config, outputs_class, outputs_coord + ) if not return_dict: if auxiliary_outputs is not None: @@ -889,474 +871,3 @@ def forward( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) - - -# Copied from transformers.models.detr.modeling_detr.dice_loss -def dice_loss(inputs, targets, num_boxes): - """ - Compute the DICE loss, similar to generalized IOU for masks - - Args: - inputs: A float tensor of arbitrary shape. - The predictions for each example. - targets: A float tensor with the same shape as inputs. Stores the binary - classification label for each element in inputs (0 for the negative class and 1 for the positive - class). 
- """ - inputs = inputs.sigmoid() - inputs = inputs.flatten(1) - numerator = 2 * (inputs * targets).sum(1) - denominator = inputs.sum(-1) + targets.sum(-1) - loss = 1 - (numerator + 1) / (denominator + 1) - return loss.sum() / num_boxes - - -# Copied from transformers.models.detr.modeling_detr.sigmoid_focal_loss -def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: float = 2): - """ - Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002. - - Args: - inputs (`torch.FloatTensor` of arbitrary shape): - The predictions for each example. - targets (`torch.FloatTensor` with the same shape as `inputs`) - A tensor storing the binary classification label for each element in the `inputs` (0 for the negative class - and 1 for the positive class). - alpha (`float`, *optional*, defaults to `0.25`): - Optional weighting factor in the range (0,1) to balance positive vs. negative examples. - gamma (`int`, *optional*, defaults to `2`): - Exponent of the modulating factor (1 - p_t) to balance easy vs hard examples. - - Returns: - Loss tensor - """ - prob = inputs.sigmoid() - ce_loss = nn.functional.binary_cross_entropy_with_logits(inputs, targets, reduction="none") - # add modulating factor - p_t = prob * targets + (1 - prob) * (1 - targets) - loss = ce_loss * ((1 - p_t) ** gamma) - - if alpha >= 0: - alpha_t = alpha * targets + (1 - alpha) * (1 - targets) - loss = alpha_t * loss - - return loss.mean(1).sum() / num_boxes - - -# Copied from transformers.models.detr.modeling_detr.DetrLoss with Detr->Yolos -class YolosLoss(nn.Module): - """ - This class computes the losses for YolosForObjectDetection/YolosForSegmentation. The process happens in two steps: 1) - we compute hungarian assignment between ground truth boxes and the outputs of the model 2) we supervise each pair - of matched ground-truth / prediction (supervise class and box). - - A note on the `num_classes` argument (copied from original repo in detr.py): "the naming of the `num_classes` - parameter of the criterion is somewhat misleading. It indeed corresponds to `max_obj_id` + 1, where `max_obj_id` is - the maximum id for a class in your dataset. For example, COCO has a `max_obj_id` of 90, so we pass `num_classes` to - be 91. As another example, for a dataset that has a single class with `id` 1, you should pass `num_classes` to be 2 - (`max_obj_id` + 1). For more details on this, check the following discussion - https://github.com/facebookresearch/detr/issues/108#issuecomment-650269223" - - - Args: - matcher (`YolosHungarianMatcher`): - Module able to compute a matching between targets and proposals. - num_classes (`int`): - Number of object categories, omitting the special no-object category. - eos_coef (`float`): - Relative classification weight applied to the no-object category. - losses (`List[str]`): - List of all the losses to be applied. See `get_loss` for a list of all available losses. 
- """ - - def __init__(self, matcher, num_classes, eos_coef, losses): - super().__init__() - self.matcher = matcher - self.num_classes = num_classes - self.eos_coef = eos_coef - self.losses = losses - empty_weight = torch.ones(self.num_classes + 1) - empty_weight[-1] = self.eos_coef - self.register_buffer("empty_weight", empty_weight) - - # removed logging parameter, which was part of the original implementation - def loss_labels(self, outputs, targets, indices, num_boxes): - """ - Classification loss (NLL) targets dicts must contain the key "class_labels" containing a tensor of dim - [nb_target_boxes] - """ - if "logits" not in outputs: - raise KeyError("No logits were found in the outputs") - source_logits = outputs["logits"] - - idx = self._get_source_permutation_idx(indices) - target_classes_o = torch.cat([t["class_labels"][J] for t, (_, J) in zip(targets, indices)]) - target_classes = torch.full( - source_logits.shape[:2], self.num_classes, dtype=torch.int64, device=source_logits.device - ) - target_classes[idx] = target_classes_o - - loss_ce = nn.functional.cross_entropy(source_logits.transpose(1, 2), target_classes, self.empty_weight) - losses = {"loss_ce": loss_ce} - - return losses - - @torch.no_grad() - def loss_cardinality(self, outputs, targets, indices, num_boxes): - """ - Compute the cardinality error, i.e. the absolute error in the number of predicted non-empty boxes. - - This is not really a loss, it is intended for logging purposes only. It doesn't propagate gradients. - """ - logits = outputs["logits"] - device = logits.device - target_lengths = torch.as_tensor([len(v["class_labels"]) for v in targets], device=device) - # Count the number of predictions that are NOT "no-object" (which is the last class) - card_pred = (logits.argmax(-1) != logits.shape[-1] - 1).sum(1) - card_err = nn.functional.l1_loss(card_pred.float(), target_lengths.float()) - losses = {"cardinality_error": card_err} - return losses - - def loss_boxes(self, outputs, targets, indices, num_boxes): - """ - Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss. - - Targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4]. The target boxes - are expected in format (center_x, center_y, w, h), normalized by the image size. - """ - if "pred_boxes" not in outputs: - raise KeyError("No predicted boxes found in outputs") - idx = self._get_source_permutation_idx(indices) - source_boxes = outputs["pred_boxes"][idx] - target_boxes = torch.cat([t["boxes"][i] for t, (_, i) in zip(targets, indices)], dim=0) - - loss_bbox = nn.functional.l1_loss(source_boxes, target_boxes, reduction="none") - - losses = {} - losses["loss_bbox"] = loss_bbox.sum() / num_boxes - - loss_giou = 1 - torch.diag( - generalized_box_iou(center_to_corners_format(source_boxes), center_to_corners_format(target_boxes)) - ) - losses["loss_giou"] = loss_giou.sum() / num_boxes - return losses - - def loss_masks(self, outputs, targets, indices, num_boxes): - """ - Compute the losses related to the masks: the focal loss and the dice loss. - - Targets dicts must contain the key "masks" containing a tensor of dim [nb_target_boxes, h, w]. 
- """ - if "pred_masks" not in outputs: - raise KeyError("No predicted masks found in outputs") - - source_idx = self._get_source_permutation_idx(indices) - target_idx = self._get_target_permutation_idx(indices) - source_masks = outputs["pred_masks"] - source_masks = source_masks[source_idx] - masks = [t["masks"] for t in targets] - # TODO use valid to mask invalid areas due to padding in loss - target_masks, valid = nested_tensor_from_tensor_list(masks).decompose() - target_masks = target_masks.to(source_masks) - target_masks = target_masks[target_idx] - - # upsample predictions to the target size - source_masks = nn.functional.interpolate( - source_masks[:, None], size=target_masks.shape[-2:], mode="bilinear", align_corners=False - ) - source_masks = source_masks[:, 0].flatten(1) - - target_masks = target_masks.flatten(1) - target_masks = target_masks.view(source_masks.shape) - losses = { - "loss_mask": sigmoid_focal_loss(source_masks, target_masks, num_boxes), - "loss_dice": dice_loss(source_masks, target_masks, num_boxes), - } - return losses - - def _get_source_permutation_idx(self, indices): - # permute predictions following indices - batch_idx = torch.cat([torch.full_like(source, i) for i, (source, _) in enumerate(indices)]) - source_idx = torch.cat([source for (source, _) in indices]) - return batch_idx, source_idx - - def _get_target_permutation_idx(self, indices): - # permute targets following indices - batch_idx = torch.cat([torch.full_like(target, i) for i, (_, target) in enumerate(indices)]) - target_idx = torch.cat([target for (_, target) in indices]) - return batch_idx, target_idx - - def get_loss(self, loss, outputs, targets, indices, num_boxes): - loss_map = { - "labels": self.loss_labels, - "cardinality": self.loss_cardinality, - "boxes": self.loss_boxes, - "masks": self.loss_masks, - } - if loss not in loss_map: - raise ValueError(f"Loss {loss} not supported") - return loss_map[loss](outputs, targets, indices, num_boxes) - - def forward(self, outputs, targets): - """ - This performs the loss computation. - - Args: - outputs (`dict`, *optional*): - Dictionary of tensors, see the output specification of the model for the format. - targets (`List[dict]`, *optional*): - List of dicts, such that `len(targets) == batch_size`. The expected keys in each dict depends on the - losses applied, see each loss' doc. - """ - outputs_without_aux = {k: v for k, v in outputs.items() if k != "auxiliary_outputs"} - - # Retrieve the matching between the outputs of the last layer and the targets - indices = self.matcher(outputs_without_aux, targets) - - # Compute the average number of target boxes across all nodes, for normalization purposes - num_boxes = sum(len(t["class_labels"]) for t in targets) - num_boxes = torch.as_tensor([num_boxes], dtype=torch.float, device=next(iter(outputs.values())).device) - world_size = 1 - if is_accelerate_available(): - if PartialState._shared_state != {}: - num_boxes = reduce(num_boxes) - world_size = PartialState().num_processes - num_boxes = torch.clamp(num_boxes / world_size, min=1).item() - - # Compute all the requested losses - losses = {} - for loss in self.losses: - losses.update(self.get_loss(loss, outputs, targets, indices, num_boxes)) - - # In case of auxiliary losses, we repeat this process with the output of each intermediate layer. 
- if "auxiliary_outputs" in outputs: - for i, auxiliary_outputs in enumerate(outputs["auxiliary_outputs"]): - indices = self.matcher(auxiliary_outputs, targets) - for loss in self.losses: - if loss == "masks": - # Intermediate masks losses are too costly to compute, we ignore them. - continue - l_dict = self.get_loss(loss, auxiliary_outputs, targets, indices, num_boxes) - l_dict = {k + f"_{i}": v for k, v in l_dict.items()} - losses.update(l_dict) - - return losses - - -# Copied from transformers.models.detr.modeling_detr.DetrMLPPredictionHead with Detr->Yolos -class YolosMLPPredictionHead(nn.Module): - """ - Very simple multi-layer perceptron (MLP, also called FFN), used to predict the normalized center coordinates, - height and width of a bounding box w.r.t. an image. - - Copied from https://github.com/facebookresearch/detr/blob/master/models/detr.py - - """ - - def __init__(self, input_dim, hidden_dim, output_dim, num_layers): - super().__init__() - self.num_layers = num_layers - h = [hidden_dim] * (num_layers - 1) - self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) - - def forward(self, x): - for i, layer in enumerate(self.layers): - x = nn.functional.relu(layer(x)) if i < self.num_layers - 1 else layer(x) - return x - - -# Copied from transformers.models.detr.modeling_detr.DetrHungarianMatcher with Detr->Yolos -class YolosHungarianMatcher(nn.Module): - """ - This class computes an assignment between the targets and the predictions of the network. - - For efficiency reasons, the targets don't include the no_object. Because of this, in general, there are more - predictions than targets. In this case, we do a 1-to-1 matching of the best predictions, while the others are - un-matched (and thus treated as non-objects). - - Args: - class_cost: - The relative weight of the classification error in the matching cost. - bbox_cost: - The relative weight of the L1 error of the bounding box coordinates in the matching cost. - giou_cost: - The relative weight of the giou loss of the bounding box in the matching cost. - """ - - def __init__(self, class_cost: float = 1, bbox_cost: float = 1, giou_cost: float = 1): - super().__init__() - requires_backends(self, ["scipy"]) - - self.class_cost = class_cost - self.bbox_cost = bbox_cost - self.giou_cost = giou_cost - if class_cost == 0 and bbox_cost == 0 and giou_cost == 0: - raise ValueError("All costs of the Matcher can't be 0") - - @torch.no_grad() - def forward(self, outputs, targets): - """ - Args: - outputs (`dict`): - A dictionary that contains at least these entries: - * "logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits - * "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates. - targets (`List[dict]`): - A list of targets (len(targets) = batch_size), where each target is a dict containing: - * "class_labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of - ground-truth - objects in the target) containing the class labels - * "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates. 
- - Returns: - `List[Tuple]`: A list of size `batch_size`, containing tuples of (index_i, index_j) where: - - index_i is the indices of the selected predictions (in order) - - index_j is the indices of the corresponding selected targets (in order) - For each batch element, it holds: len(index_i) = len(index_j) = min(num_queries, num_target_boxes) - """ - batch_size, num_queries = outputs["logits"].shape[:2] - - # We flatten to compute the cost matrices in a batch - out_prob = outputs["logits"].flatten(0, 1).softmax(-1) # [batch_size * num_queries, num_classes] - out_bbox = outputs["pred_boxes"].flatten(0, 1) # [batch_size * num_queries, 4] - - # Also concat the target labels and boxes - target_ids = torch.cat([v["class_labels"] for v in targets]) - target_bbox = torch.cat([v["boxes"] for v in targets]) - - # Compute the classification cost. Contrary to the loss, we don't use the NLL, - # but approximate it in 1 - proba[target class]. - # The 1 is a constant that doesn't change the matching, it can be ommitted. - class_cost = -out_prob[:, target_ids] - - # Compute the L1 cost between boxes - bbox_cost = torch.cdist(out_bbox, target_bbox, p=1) - - # Compute the giou cost between boxes - giou_cost = -generalized_box_iou(center_to_corners_format(out_bbox), center_to_corners_format(target_bbox)) - - # Final cost matrix - cost_matrix = self.bbox_cost * bbox_cost + self.class_cost * class_cost + self.giou_cost * giou_cost - cost_matrix = cost_matrix.view(batch_size, num_queries, -1).cpu() - - sizes = [len(v["boxes"]) for v in targets] - indices = [linear_sum_assignment(c[i]) for i, c in enumerate(cost_matrix.split(sizes, -1))] - return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices] - - -# Copied from transformers.models.detr.modeling_detr._upcast -def _upcast(t: Tensor) -> Tensor: - # Protects from numerical overflows in multiplications by upcasting to the equivalent higher type - if t.is_floating_point(): - return t if t.dtype in (torch.float32, torch.float64) else t.float() - else: - return t if t.dtype in (torch.int32, torch.int64) else t.int() - - -# Copied from transformers.models.detr.modeling_detr.box_area -def box_area(boxes: Tensor) -> Tensor: - """ - Computes the area of a set of bounding boxes, which are specified by its (x1, y1, x2, y2) coordinates. - - Args: - boxes (`torch.FloatTensor` of shape `(number_of_boxes, 4)`): - Boxes for which the area will be computed. They are expected to be in (x1, y1, x2, y2) format with `0 <= x1 - < x2` and `0 <= y1 < y2`. - - Returns: - `torch.FloatTensor`: a tensor containing the area for each box. - """ - boxes = _upcast(boxes) - return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) - - -# Copied from transformers.models.detr.modeling_detr.box_iou -def box_iou(boxes1, boxes2): - area1 = box_area(boxes1) - area2 = box_area(boxes2) - - left_top = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2] - right_bottom = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2] - - width_height = (right_bottom - left_top).clamp(min=0) # [N,M,2] - inter = width_height[:, :, 0] * width_height[:, :, 1] # [N,M] - - union = area1[:, None] + area2 - inter - - iou = inter / union - return iou, union - - -# Copied from transformers.models.detr.modeling_detr.generalized_box_iou -def generalized_box_iou(boxes1, boxes2): - """ - Generalized IoU from https://giou.stanford.edu/. The boxes should be in [x0, y0, x1, y1] (corner) format. 
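A worked numeric example (illustration only) of the IoU/GIoU math defined above, for two partially overlapping boxes in `(x0, y0, x1, y1)` format:

```python
import torch

b1 = torch.tensor([0.0, 0.0, 2.0, 2.0])
b2 = torch.tensor([1.0, 1.0, 3.0, 3.0])
inter = (torch.min(b1[2:], b2[2:]) - torch.max(b1[:2], b2[:2])).clamp(min=0).prod()
union = (b1[2:] - b1[:2]).prod() + (b2[2:] - b2[:2]).prod() - inter
enclose = (torch.max(b1[2:], b2[2:]) - torch.min(b1[:2], b2[:2])).prod()
iou = inter / union                        # 1 / 7 ≈ 0.143
giou = iou - (enclose - union) / enclose   # ≈ -0.079: penalised for the empty enclosing area
print(iou.item(), giou.item())
```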
- - Returns: - `torch.FloatTensor`: a [N, M] pairwise matrix, where N = len(boxes1) and M = len(boxes2) - """ - # degenerate boxes gives inf / nan results - # so do an early check - if not (boxes1[:, 2:] >= boxes1[:, :2]).all(): - raise ValueError(f"boxes1 must be in [x0, y0, x1, y1] (corner) format, but got {boxes1}") - if not (boxes2[:, 2:] >= boxes2[:, :2]).all(): - raise ValueError(f"boxes2 must be in [x0, y0, x1, y1] (corner) format, but got {boxes2}") - iou, union = box_iou(boxes1, boxes2) - - top_left = torch.min(boxes1[:, None, :2], boxes2[:, :2]) - bottom_right = torch.max(boxes1[:, None, 2:], boxes2[:, 2:]) - - width_height = (bottom_right - top_left).clamp(min=0) # [N,M,2] - area = width_height[:, :, 0] * width_height[:, :, 1] - - return iou - (area - union) / area - - -# Copied from transformers.models.detr.modeling_detr._max_by_axis -def _max_by_axis(the_list): - # type: (List[List[int]]) -> List[int] - maxes = the_list[0] - for sublist in the_list[1:]: - for index, item in enumerate(sublist): - maxes[index] = max(maxes[index], item) - return maxes - - -# Copied from transformers.models.detr.modeling_detr.NestedTensor -class NestedTensor: - def __init__(self, tensors, mask: Optional[Tensor]): - self.tensors = tensors - self.mask = mask - - def to(self, device): - cast_tensor = self.tensors.to(device) - mask = self.mask - if mask is not None: - cast_mask = mask.to(device) - else: - cast_mask = None - return NestedTensor(cast_tensor, cast_mask) - - def decompose(self): - return self.tensors, self.mask - - def __repr__(self): - return str(self.tensors) - - -# Copied from transformers.models.detr.modeling_detr.nested_tensor_from_tensor_list -def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): - if tensor_list[0].ndim == 3: - max_size = _max_by_axis([list(img.shape) for img in tensor_list]) - batch_shape = [len(tensor_list)] + max_size - batch_size, num_channels, height, width = batch_shape - dtype = tensor_list[0].dtype - device = tensor_list[0].device - tensor = torch.zeros(batch_shape, dtype=dtype, device=device) - mask = torch.ones((batch_size, height, width), dtype=torch.bool, device=device) - for img, pad_img, m in zip(tensor_list, tensor, mask): - pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) - m[: img.shape[1], : img.shape[2]] = False - else: - raise ValueError("Only 3-dimensional tensors are supported") - return NestedTensor(tensor, mask) diff --git a/src/transformers/models/zamba/modeling_zamba.py b/src/transformers/models/zamba/modeling_zamba.py index 81326c07d6cce9..8a61c15e30a0a9 100644 --- a/src/transformers/models/zamba/modeling_zamba.py +++ b/src/transformers/models/zamba/modeling_zamba.py @@ -1483,18 +1483,7 @@ def forward( loss = None if labels is not None: - # Upcast to float if we need to compute the loss to avoid potential precision issues - logits = logits.float() - # Shift so that tokens < n predict n - shift_logits = logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - loss_fct = CrossEntropyLoss() - shift_logits = shift_logits.view(-1, self.config.vocab_size) - shift_labels = shift_labels.view(-1) - # Enable model parallelism - shift_labels = shift_labels.to(shift_logits.device) - loss = loss_fct(shift_logits, shift_labels) + loss = self.loss_function(logits, labels, self.vocab_size) if not return_dict: output = (logits,) + outputs[1:] diff --git a/utils/check_config_attributes.py b/utils/check_config_attributes.py index 83fe07fef2eda0..9b8244c243fc4a 100644 --- 
a/utils/check_config_attributes.py +++ b/utils/check_config_attributes.py @@ -144,6 +144,57 @@ "initializer_range", "supported_aspect_ratios", ], + "ConditionalDetrConfig": [ + "bbox_cost", + "bbox_loss_coefficient", + "class_cost", + "cls_loss_coefficient", + "dice_loss_coefficient", + "focal_alpha", + "giou_cost", + "giou_loss_coefficient", + "mask_loss_coefficient", + ], + "DetrConfig": [ + "bbox_cost", + "bbox_loss_coefficient", + "class_cost", + "dice_loss_coefficient", + "eos_coefficient", + "giou_cost", + "giou_loss_coefficient", + "mask_loss_coefficient", + ], + "GroundingDinoConfig": [ + "bbox_cost", + "bbox_loss_coefficient", + "class_cost", + "focal_alpha", + "giou_cost", + "giou_loss_coefficient", + ], + "RTDetrConfig": [ + "eos_coefficient", + "focal_loss_alpha", + "focal_loss_gamma", + "matcher_alpha", + "matcher_bbox_cost", + "matcher_class_cost", + "matcher_gamma", + "matcher_giou_cost", + "use_focal_loss", + "weight_loss_bbox", + "weight_loss_giou", + "weight_loss_vfl", + ], + "YolosConfig": [ + "bbox_cost", + "bbox_loss_coefficient", + "class_cost", + "eos_coefficient", + "giou_cost", + "giou_loss_coefficient", + ], } From 7a06d07e14ce4f575ac97287d9af472bd88e9d4c Mon Sep 17 00:00:00 2001 From: Pedro Cuenca Date: Thu, 17 Oct 2024 22:37:37 +0200 Subject: [PATCH 045/385] Support Llama 3.2 conversion (text models) (#33778) * Support Llama 3.2 conversion (text models) Co-authored-by: Omar Sanseviero * Fix rope factor * Update chat template Initialize from a well-known template. The guidance is that the changes should be applied to 3.1 models as well. * Remove import * Support Llama Guard 3 conversion * Tokenizer details * Fix eos added token in base models * Fix generation config for base models * Specify revision for known tokenizers * Style * Reuse chat templates for older models * Improve error when converting tokenizer < Llama 3 --------- Co-authored-by: Omar Sanseviero --- .../llama/convert_llama_weights_to_hf.py | 553 +++++++++++------- 1 file changed, 338 insertions(+), 215 deletions(-) diff --git a/src/transformers/models/llama/convert_llama_weights_to_hf.py b/src/transformers/models/llama/convert_llama_weights_to_hf.py index 2edf41ade3f91e..eb2862eb203d03 100644 --- a/src/transformers/models/llama/convert_llama_weights_to_hf.py +++ b/src/transformers/models/llama/convert_llama_weights_to_hf.py @@ -15,11 +15,12 @@ import gc import json import os -import shutil +import tempfile import warnings from typing import List import torch +from tokenizers import AddedToken, processors from transformers import GenerationConfig, LlamaConfig, LlamaForCausalLM, LlamaTokenizer, PreTrainedTokenizerFast from transformers.convert_slow_tokenizer import TikTokenConverter @@ -39,7 +40,7 @@ ``` python src/transformers/models/llama/convert_llama_weights_to_hf.py \ - --input_dir /path/to/downloaded/llama/weights --model_size 7B --output_dir /output/path + --input_dir /path/to/downloaded/llama/weights --model_size 1B --llama_version 3.2 --output_dir /output/path ``` Thereafter, models can be loaded via: @@ -75,6 +76,8 @@ """ NUM_SHARDS = { + "1B": 1, + "3B": 1, "7B": 1, "8B": 1, "8Bf": 1, @@ -90,7 +93,79 @@ "405B-MP16": 16, } -CONTEXT_LENGTH_FOR_VERSION = {"3.1": 131072, "3": 8192, "2": 4096, "1": 2048} +CONTEXT_LENGTH_FOR_VERSION = {"Guard-3": 131072, "3.2": 131072, "3.1": 131072, "3": 8192, "2": 4096, "1": 2048} + +BOS_ADDED_TOKEN = AddedToken( + "<|begin_of_text|>", single_word=False, lstrip=False, rstrip=False, normalized=False, special=True +) +EOS_ADDED_TOKEN = AddedToken( + 
"<|end_of_text|>", single_word=False, lstrip=False, rstrip=False, normalized=False, special=True +) +EOT_ADDED_TOKEN = AddedToken( + "<|eot_id|>", single_word=False, lstrip=False, rstrip=False, normalized=False, special=True +) + +DEFAULT_LLAMA_SPECIAL_TOKENS = { + "3": [ + "<|begin_of_text|>", + "<|end_of_text|>", + "<|reserved_special_token_0|>", + "<|reserved_special_token_1|>", + "<|reserved_special_token_2|>", + "<|reserved_special_token_3|>", + "<|start_header_id|>", + "<|end_header_id|>", + "<|reserved_special_token_4|>", + "<|eot_id|>", # end of turn + ] + + [f"<|reserved_special_token_{i}|>" for i in range(5, 256 - 5)], + "3.1": [ + "<|begin_of_text|>", + "<|end_of_text|>", + "<|reserved_special_token_0|>", + "<|reserved_special_token_1|>", + "<|finetune_right_pad_id|>", + "<|reserved_special_token_2|>", + "<|start_header_id|>", + "<|end_header_id|>", + "<|eom_id|>", # end of message + "<|eot_id|>", # end of turn + "<|python_tag|>", + ] + + [f"<|reserved_special_token_{i}|>" for i in range(3, 256 - 8)], + "3.2": [ + "<|begin_of_text|>", + "<|end_of_text|>", + "<|reserved_special_token_0|>", + "<|reserved_special_token_1|>", + "<|finetune_right_pad_id|>", + "<|reserved_special_token_2|>", + "<|start_header_id|>", + "<|end_header_id|>", + "<|eom_id|>", # end of message + "<|eot_id|>", # end of turn + "<|python_tag|>", + ] + + [f"<|reserved_special_token_{i}|>" for i in range(3, 256 - 8)], + "Guard-3": [ + "<|begin_of_text|>", + "<|end_of_text|>", + "<|reserved_special_token_0|>", + "<|reserved_special_token_1|>", + "<|finetune_right_pad_id|>", + "<|reserved_special_token_2|>", + "<|start_header_id|>", + "<|end_header_id|>", + "<|eom_id|>", # end of message + "<|eot_id|>", # end of turn + "<|python_tag|>", + ] + + [f"<|reserved_special_token_{i}|>" for i in range(3, 256 - 8)], +} + + +def is_llama_3(version): + return version in ["3", "3.1", "3.2", "Guard-3"] def compute_intermediate_size(n, ffn_dim_multiplier=1, multiple_of=256): @@ -116,11 +191,9 @@ def write_model( vocab_size=None, num_shards=None, instruct=False, + push_to_hub=False, ): - os.makedirs(model_path, exist_ok=True) - tmp_model_path = os.path.join(model_path, "tmp") - os.makedirs(tmp_model_path, exist_ok=True) - + print("Converting the model.") params = read_json(os.path.join(input_base_path, "params.json")) num_shards = NUM_SHARDS[model_size] if num_shards is None else num_shards params = params.get("model", params) @@ -131,7 +204,7 @@ def write_model( dims_per_head = dim // n_heads base = params.get("rope_theta", 10000.0) inv_freq = 1.0 / (base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head)) - if base > 10000.0 and float(llama_version) < 3: + if base > 10000.0 and not is_llama_3(llama_version): max_position_embeddings = 16384 else: max_position_embeddings = CONTEXT_LENGTH_FOR_VERSION[llama_version] @@ -149,163 +222,183 @@ def write_model( def permute(w, n_heads, dim1=dim, dim2=dim): return w.view(n_heads, dim1 // n_heads // 2, 2, dim2).transpose(1, 2).reshape(dim1, dim2) - print(f"Fetching all parameters from the checkpoint at {input_base_path}.") - # Load weights - if num_shards == 1: - # Not sharded - # (The sharded implementation would also work, but this is simpler.) 
- loaded = torch.load(os.path.join(input_base_path, "consolidated.00.pth"), map_location="cpu") - else: - # Sharded - checkpoint_list = sorted([file for file in os.listdir(input_base_path) if file.endswith(".pth")]) - print("Loading in order:", checkpoint_list) - loaded = [torch.load(os.path.join(input_base_path, file), map_location="cpu") for file in checkpoint_list] - param_count = 0 - index_dict = {"weight_map": {}} - for layer_i in range(n_layers): - filename = f"pytorch_model-{layer_i + 1}-of-{n_layers + 1}.bin" + with tempfile.TemporaryDirectory() as tmp_model_path: + print(f"Fetching all parameters from the checkpoint at {input_base_path}.") + # Load weights if num_shards == 1: - # Unsharded - state_dict = { - f"model.layers.{layer_i}.self_attn.q_proj.weight": permute( - loaded[f"layers.{layer_i}.attention.wq.weight"], n_heads=n_heads - ), - f"model.layers.{layer_i}.self_attn.k_proj.weight": permute( - loaded[f"layers.{layer_i}.attention.wk.weight"], - n_heads=num_key_value_heads, - dim1=key_value_dim, - ), - f"model.layers.{layer_i}.self_attn.v_proj.weight": loaded[f"layers.{layer_i}.attention.wv.weight"], - f"model.layers.{layer_i}.self_attn.o_proj.weight": loaded[f"layers.{layer_i}.attention.wo.weight"], - f"model.layers.{layer_i}.mlp.gate_proj.weight": loaded[f"layers.{layer_i}.feed_forward.w1.weight"], - f"model.layers.{layer_i}.mlp.down_proj.weight": loaded[f"layers.{layer_i}.feed_forward.w2.weight"], - f"model.layers.{layer_i}.mlp.up_proj.weight": loaded[f"layers.{layer_i}.feed_forward.w3.weight"], - f"model.layers.{layer_i}.input_layernorm.weight": loaded[f"layers.{layer_i}.attention_norm.weight"], - f"model.layers.{layer_i}.post_attention_layernorm.weight": loaded[f"layers.{layer_i}.ffn_norm.weight"], - } + # Not sharded + # (The sharded implementation would also work, but this is simpler.) + loaded = torch.load(os.path.join(input_base_path, "consolidated.00.pth"), map_location="cpu") else: # Sharded - # Note that attention.w{q,k,v,o}, feed_fordward.w[1,2,3], attention_norm.weight and ffn_norm.weight share - # the same storage object, saving attention_norm and ffn_norm will save other weights too, which is - # redundant as other weights will be stitched from multiple shards. To avoid that, they are cloned. 
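A quick sanity check (not part of the patch) on the `DEFAULT_LLAMA_SPECIAL_TOKENS` tables added earlier in this file: every Llama 3 variant pads out to exactly 256 reserved/control tokens, which sit after the 128 000-entry BPE vocabulary, consistent with `bos_token_id=128000` and the eos ids set further down in `write_model`.

```python
for version, tokens in DEFAULT_LLAMA_SPECIAL_TOKENS.items():
    assert len(tokens) == 256, (version, len(tokens))
    assert len(set(tokens)) == 256  # no duplicates between named and reserved tokens
```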
- - state_dict = { - f"model.layers.{layer_i}.input_layernorm.weight": loaded[0][ - f"layers.{layer_i}.attention_norm.weight" - ].clone(), - f"model.layers.{layer_i}.post_attention_layernorm.weight": loaded[0][ - f"layers.{layer_i}.ffn_norm.weight" - ].clone(), - } - state_dict[f"model.layers.{layer_i}.self_attn.q_proj.weight"] = permute( - torch.cat( - [ - loaded[i][f"layers.{layer_i}.attention.wq.weight"].view(n_heads_per_shard, dims_per_head, dim) - for i in range(len(loaded)) + checkpoint_list = sorted([file for file in os.listdir(input_base_path) if file.endswith(".pth")]) + print("Loading in order:", checkpoint_list) + loaded = [torch.load(os.path.join(input_base_path, file), map_location="cpu") for file in checkpoint_list] + param_count = 0 + index_dict = {"weight_map": {}} + for layer_i in range(n_layers): + filename = f"pytorch_model-{layer_i + 1}-of-{n_layers + 1}.bin" + if num_shards == 1: + # Unsharded + state_dict = { + f"model.layers.{layer_i}.self_attn.q_proj.weight": permute( + loaded[f"layers.{layer_i}.attention.wq.weight"], n_heads=n_heads + ), + f"model.layers.{layer_i}.self_attn.k_proj.weight": permute( + loaded[f"layers.{layer_i}.attention.wk.weight"], + n_heads=num_key_value_heads, + dim1=key_value_dim, + ), + f"model.layers.{layer_i}.self_attn.v_proj.weight": loaded[f"layers.{layer_i}.attention.wv.weight"], + f"model.layers.{layer_i}.self_attn.o_proj.weight": loaded[f"layers.{layer_i}.attention.wo.weight"], + f"model.layers.{layer_i}.mlp.gate_proj.weight": loaded[f"layers.{layer_i}.feed_forward.w1.weight"], + f"model.layers.{layer_i}.mlp.down_proj.weight": loaded[f"layers.{layer_i}.feed_forward.w2.weight"], + f"model.layers.{layer_i}.mlp.up_proj.weight": loaded[f"layers.{layer_i}.feed_forward.w3.weight"], + f"model.layers.{layer_i}.input_layernorm.weight": loaded[ + f"layers.{layer_i}.attention_norm.weight" ], - dim=0, - ).reshape(dim, dim), - n_heads=n_heads, - ) - state_dict[f"model.layers.{layer_i}.self_attn.k_proj.weight"] = permute( - torch.cat( + f"model.layers.{layer_i}.post_attention_layernorm.weight": loaded[ + f"layers.{layer_i}.ffn_norm.weight" + ], + } + else: + # Sharded + # Note that attention.w{q,k,v,o}, feed_fordward.w[1,2,3], attention_norm.weight and ffn_norm.weight share + # the same storage object, saving attention_norm and ffn_norm will save other weights too, which is + # redundant as other weights will be stitched from multiple shards. To avoid that, they are cloned. 
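The `permute` helper defined above re-orders the interleaved rotary layout of the original q/k projection weights into the half-split layout used by the Transformers Llama implementation. It is a pure re-ordering, as the round-trip below illustrates; `unpermute` is a hypothetical inverse written here for illustration only.

```python
import torch

def permute(w, n_heads, dim1, dim2):  # same as the helper above, without the defaults
    return w.view(n_heads, dim1 // n_heads // 2, 2, dim2).transpose(1, 2).reshape(dim1, dim2)

def unpermute(w, n_heads, dim1, dim2):  # hypothetical inverse, not part of the script
    return w.view(n_heads, 2, dim1 // n_heads // 2, dim2).transpose(1, 2).reshape(dim1, dim2)

w = torch.randn(16, 16)  # e.g. dim=16, n_heads=4
assert torch.equal(unpermute(permute(w, 4, 16, 16), 4, 16, 16), w)
```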
+ + state_dict = { + f"model.layers.{layer_i}.input_layernorm.weight": loaded[0][ + f"layers.{layer_i}.attention_norm.weight" + ].clone(), + f"model.layers.{layer_i}.post_attention_layernorm.weight": loaded[0][ + f"layers.{layer_i}.ffn_norm.weight" + ].clone(), + } + state_dict[f"model.layers.{layer_i}.self_attn.q_proj.weight"] = permute( + torch.cat( + [ + loaded[i][f"layers.{layer_i}.attention.wq.weight"].view( + n_heads_per_shard, dims_per_head, dim + ) + for i in range(len(loaded)) + ], + dim=0, + ).reshape(dim, dim), + n_heads=n_heads, + ) + state_dict[f"model.layers.{layer_i}.self_attn.k_proj.weight"] = permute( + torch.cat( + [ + loaded[i][f"layers.{layer_i}.attention.wk.weight"].view( + num_key_value_heads_per_shard, dims_per_head, dim + ) + for i in range(len(loaded)) + ], + dim=0, + ).reshape(key_value_dim, dim), + num_key_value_heads, + key_value_dim, + dim, + ) + state_dict[f"model.layers.{layer_i}.self_attn.v_proj.weight"] = torch.cat( [ - loaded[i][f"layers.{layer_i}.attention.wk.weight"].view( + loaded[i][f"layers.{layer_i}.attention.wv.weight"].view( num_key_value_heads_per_shard, dims_per_head, dim ) for i in range(len(loaded)) ], dim=0, - ).reshape(key_value_dim, dim), - num_key_value_heads, - key_value_dim, - dim, - ) - state_dict[f"model.layers.{layer_i}.self_attn.v_proj.weight"] = torch.cat( - [ - loaded[i][f"layers.{layer_i}.attention.wv.weight"].view( - num_key_value_heads_per_shard, dims_per_head, dim - ) - for i in range(len(loaded)) - ], - dim=0, - ).reshape(key_value_dim, dim) - - state_dict[f"model.layers.{layer_i}.self_attn.o_proj.weight"] = torch.cat( - [loaded[i][f"layers.{layer_i}.attention.wo.weight"] for i in range(len(loaded))], dim=1 - ) - state_dict[f"model.layers.{layer_i}.mlp.gate_proj.weight"] = torch.cat( - [loaded[i][f"layers.{layer_i}.feed_forward.w1.weight"] for i in range(len(loaded))], dim=0 - ) - state_dict[f"model.layers.{layer_i}.mlp.down_proj.weight"] = torch.cat( - [loaded[i][f"layers.{layer_i}.feed_forward.w2.weight"] for i in range(len(loaded))], dim=1 - ) - state_dict[f"model.layers.{layer_i}.mlp.up_proj.weight"] = torch.cat( - [loaded[i][f"layers.{layer_i}.feed_forward.w3.weight"] for i in range(len(loaded))], dim=0 - ) + ).reshape(key_value_dim, dim) + + state_dict[f"model.layers.{layer_i}.self_attn.o_proj.weight"] = torch.cat( + [loaded[i][f"layers.{layer_i}.attention.wo.weight"] for i in range(len(loaded))], dim=1 + ) + state_dict[f"model.layers.{layer_i}.mlp.gate_proj.weight"] = torch.cat( + [loaded[i][f"layers.{layer_i}.feed_forward.w1.weight"] for i in range(len(loaded))], dim=0 + ) + state_dict[f"model.layers.{layer_i}.mlp.down_proj.weight"] = torch.cat( + [loaded[i][f"layers.{layer_i}.feed_forward.w2.weight"] for i in range(len(loaded))], dim=1 + ) + state_dict[f"model.layers.{layer_i}.mlp.up_proj.weight"] = torch.cat( + [loaded[i][f"layers.{layer_i}.feed_forward.w3.weight"] for i in range(len(loaded))], dim=0 + ) + + state_dict[f"model.layers.{layer_i}.self_attn.rotary_emb.inv_freq"] = inv_freq + for k, v in state_dict.items(): + index_dict["weight_map"][k] = filename + param_count += v.numel() + torch.save(state_dict, os.path.join(tmp_model_path, filename)) + + filename = f"pytorch_model-{n_layers + 1}-of-{n_layers + 1}.bin" + if num_shards == 1: + # Unsharded + state_dict = { + "model.embed_tokens.weight": loaded["tok_embeddings.weight"], + "model.norm.weight": loaded["norm.weight"], + "lm_head.weight": loaded["output.weight"], + } + else: + concat_dim = 0 if is_llama_3(llama_version) else 1 + state_dict = { + 
"model.norm.weight": loaded[0]["norm.weight"], + "model.embed_tokens.weight": torch.cat( + [loaded[i]["tok_embeddings.weight"] for i in range(len(loaded))], dim=concat_dim + ), + "lm_head.weight": torch.cat([loaded[i]["output.weight"] for i in range(len(loaded))], dim=0), + } - state_dict[f"model.layers.{layer_i}.self_attn.rotary_emb.inv_freq"] = inv_freq for k, v in state_dict.items(): index_dict["weight_map"][k] = filename param_count += v.numel() torch.save(state_dict, os.path.join(tmp_model_path, filename)) - filename = f"pytorch_model-{n_layers + 1}-of-{n_layers + 1}.bin" - if num_shards == 1: - # Unsharded - state_dict = { - "model.embed_tokens.weight": loaded["tok_embeddings.weight"], - "model.norm.weight": loaded["norm.weight"], - "lm_head.weight": loaded["output.weight"], - } - else: - concat_dim = 0 if llama_version in ["3", "3.1"] else 1 - state_dict = { - "model.norm.weight": loaded[0]["norm.weight"], - "model.embed_tokens.weight": torch.cat( - [loaded[i]["tok_embeddings.weight"] for i in range(len(loaded))], dim=concat_dim - ), - "lm_head.weight": torch.cat([loaded[i]["output.weight"] for i in range(len(loaded))], dim=0), - } - - for k, v in state_dict.items(): - index_dict["weight_map"][k] = filename - param_count += v.numel() - torch.save(state_dict, os.path.join(tmp_model_path, filename)) - - # Write configs - index_dict["metadata"] = {"total_size": param_count * 2} - write_json(index_dict, os.path.join(tmp_model_path, "pytorch_model.bin.index.json")) - ffn_dim_multiplier = params["ffn_dim_multiplier"] if "ffn_dim_multiplier" in params else 1 - multiple_of = params["multiple_of"] if "multiple_of" in params else 256 + # Write configs + index_dict["metadata"] = {"total_size": param_count * 2} + write_json(index_dict, os.path.join(tmp_model_path, "pytorch_model.bin.index.json")) + ffn_dim_multiplier = params["ffn_dim_multiplier"] if "ffn_dim_multiplier" in params else 1 + multiple_of = params["multiple_of"] if "multiple_of" in params else 256 - if llama_version in ["3", "3.1"]: - bos_token_id = 128000 + if is_llama_3(llama_version): + bos_token_id = 128000 - if instruct: - eos_token_id = [128001, 128008, 128009] + if instruct: + eos_token_id = [128001, 128008, 128009] + else: + eos_token_id = 128001 else: - eos_token_id = 128001 - else: - bos_token_id = 1 - eos_token_id = 2 - - config = LlamaConfig( - hidden_size=dim, - intermediate_size=compute_intermediate_size(dim, ffn_dim_multiplier, multiple_of), - num_attention_heads=params["n_heads"], - num_hidden_layers=params["n_layers"], - rms_norm_eps=params["norm_eps"], - num_key_value_heads=num_key_value_heads, - vocab_size=vocab_size, - rope_theta=base, - max_position_embeddings=max_position_embeddings, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - ) - config.save_pretrained(tmp_model_path) + bos_token_id = 1 + eos_token_id = 2 + + if llama_version in ["3.1", "3.2", "Guard-3"]: + rope_scaling = { + "factor": 32.0 if llama_version == "3.2" else 8.0, + "low_freq_factor": 1.0, + "high_freq_factor": 4.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3", + } + else: + rope_scaling = None + + config = LlamaConfig( + hidden_size=dim, + intermediate_size=compute_intermediate_size(dim, ffn_dim_multiplier, multiple_of), + num_attention_heads=params["n_heads"], + num_hidden_layers=params["n_layers"], + rms_norm_eps=params["norm_eps"], + num_key_value_heads=num_key_value_heads, + vocab_size=vocab_size, + rope_theta=base, + rope_scaling=rope_scaling, + max_position_embeddings=max_position_embeddings, + 
bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=True if llama_version in ["3.2"] else False, + ) + + config.save_pretrained(tmp_model_path) - if instruct: generation_config = GenerationConfig( do_sample=True, temperature=0.6, @@ -315,96 +408,117 @@ def permute(w, n_heads, dim1=dim, dim2=dim): ) generation_config.save_pretrained(tmp_model_path) - # Make space so we can load the model properly now. - del state_dict - del loaded - gc.collect() + # Make space so we can load the model properly now. + del state_dict + del loaded + gc.collect() + + print("Loading the checkpoint in a Llama model.") + model = LlamaForCausalLM.from_pretrained(tmp_model_path, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True) - print("Loading the checkpoint in a Llama model.") - model = LlamaForCausalLM.from_pretrained(tmp_model_path, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True) - # Avoid saving this as part of the config. - del model.config._name_or_path - model.config.torch_dtype = torch.float16 - print("Saving in the Transformers format.") - model.save_pretrained(model_path, safe_serialization=safe_serialization) - shutil.rmtree(tmp_model_path, ignore_errors=True) + # Avoid saving this as part of the config. + del model.config._name_or_path + model.config.torch_dtype = torch.float16 + + print("Saving in the Transformers format.") + if push_to_hub: + print("Pushing to the hub.") + model.push_to_hub(model_path, safe_serialization=safe_serialization, private=True, use_temp_dir=True) + else: + print("Saving to disk.") + model.save_pretrained(model_path, safe_serialization=safe_serialization) class Llama3Converter(TikTokenConverter): - def __init__(self, vocab_file, special_tokens=None, instruct=False, model_max_length=None, **kwargs): + def __init__(self, vocab_file, special_tokens=None, instruct=False, llama_version="3.2", **kwargs): super().__init__(vocab_file, additional_special_tokens=special_tokens, **kwargs) tokenizer = self.converted() - chat_template = ( - "{% set loop_messages = messages %}" - "{% for message in loop_messages %}" - "{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}" - "{% if loop.index0 == 0 %}" - "{% set content = bos_token + content %}" - "{% endif %}" - "{{ content }}" - "{% endfor %}" - "{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}" - ) - self.tokenizer = PreTrainedTokenizerFast( + # References for chat templates in instruct models + templates_for_version = { + "2": ("meta-llama/Llama-2-7b-chat-hf", "f5db02db724555f92da89c216ac04704f23d4590"), + "3": ("meta-llama/Meta-Llama-3-8B-Instruct", "5f0b02c75b57c5855da9ae460ce51323ea669d8a"), + "3.1": ("meta-llama/Llama-3.1-8B-Instruct", "0e9e39f249a16976918f6564b8830bc894c89659"), + "3.2": ("meta-llama/Llama-3.2-1B-Instruct", "e9f8effbab1cbdc515c11ee6e098e3d5a9f51e14"), + "Guard-3": ("meta-llama/Llama-Guard-3-1B", "acf7aafa60f0410f8f42b1fa35e077d705892029"), + } + + # Add chat_template only if instruct is True. + # Prevents a null chat_template, which triggers + # a parsing warning in the Hub. 
+ additional_kwargs = {} + if instruct or llama_version in ["Guard-3"]: + model_id, revision = templates_for_version.get(llama_version, (None, None)) + if model_id is not None: + from transformers import AutoTokenizer + + t = AutoTokenizer.from_pretrained(model_id, revision=revision) + additional_kwargs["chat_template"] = t.chat_template + + self.converted_tokenizer = PreTrainedTokenizerFast( tokenizer_object=tokenizer, bos_token="<|begin_of_text|>", eos_token="<|end_of_text|>" if not instruct else "<|eot_id|>", - chat_template=chat_template if instruct else None, model_input_names=["input_ids", "attention_mask"], - model_max_length=model_max_length, + model_max_length=CONTEXT_LENGTH_FOR_VERSION[llama_version], + clean_up_tokenization_spaces=True, + **additional_kwargs, + ) + self.update_post_processor(self.converted_tokenizer) + # finer special_tokens_map.json + self.converted_tokenizer._bos_token = BOS_ADDED_TOKEN + self.converted_tokenizer._eos_token = EOT_ADDED_TOKEN if instruct else EOS_ADDED_TOKEN + + # We can't do this while building the tokenizer because we have no easy access to the bos token id + def update_post_processor(self, tokenizer): + tokenizer._tokenizer.post_processor = processors.Sequence( + [ + processors.ByteLevel(trim_offsets=False), + processors.TemplateProcessing( + single="<|begin_of_text|> $A", + pair="<|begin_of_text|>:0 $A:0 <|begin_of_text|>:1 $B:1", + special_tokens=[ + ("<|begin_of_text|>", tokenizer.convert_tokens_to_ids("<|begin_of_text|>")), + ], + ), + ] ) -def write_tokenizer(tokenizer_path, input_tokenizer_path, llama_version="2", special_tokens=None, instruct=False): +def write_tokenizer( + tokenizer_path, input_tokenizer_path, llama_version="2", special_tokens=None, instruct=False, push_to_hub=False +): + print("Converting the tokenizer.") tokenizer_class = LlamaTokenizer if LlamaTokenizerFast is None else LlamaTokenizerFast - if llama_version in ["3", "3.1"]: + if is_llama_3(llama_version): tokenizer = Llama3Converter( - input_tokenizer_path, special_tokens, instruct, model_max_length=CONTEXT_LENGTH_FOR_VERSION[llama_version] - ).tokenizer + input_tokenizer_path, + special_tokens, + instruct, + llama_version, + ).converted_tokenizer else: - tokenizer = tokenizer_class(input_tokenizer_path) - print(f"Saving a {tokenizer_class.__name__} to {tokenizer_path}.") - tokenizer.save_pretrained(tokenizer_path) - return tokenizer - + try: + tokenizer = tokenizer_class(input_tokenizer_path) + except Exception: + raise ValueError( + "Failed to instantiate tokenizer. Please, make sure you have sentencepiece and protobuf installed." 
+ ) -DEFAULT_LLAMA_SPECIAL_TOKENS = { - "3": [ - "<|begin_of_text|>", - "<|end_of_text|>", - "<|reserved_special_token_0|>", - "<|reserved_special_token_1|>", - "<|reserved_special_token_2|>", - "<|reserved_special_token_3|>", - "<|start_header_id|>", - "<|end_header_id|>", - "<|reserved_special_token_4|>", - "<|eot_id|>", # end of turn - ] - + [f"<|reserved_special_token_{i}|>" for i in range(5, 256 - 5)], - "3.1": [ - "<|begin_of_text|>", - "<|end_of_text|>", - "<|reserved_special_token_0|>", - "<|reserved_special_token_1|>", - "<|finetune_right_pad_id|>", - "<|reserved_special_token_2|>", - "<|start_header_id|>", - "<|end_header_id|>", - "<|eom_id|>", # end of message - "<|eot_id|>", # end of turn - "<|python_tag|>", - ] - + [f"<|reserved_special_token_{i}|>" for i in range(3, 256 - 8)], -} + if push_to_hub: + print(f"Pushing a {tokenizer_class.__name__} to the Hub repo - {tokenizer_path}.") + tokenizer.push_to_hub(tokenizer_path, private=True, use_temp_dir=True) + else: + print(f"Saving a {tokenizer_class.__name__} to {tokenizer_path}.") + tokenizer.save_pretrained(tokenizer_path) + return tokenizer def main(): parser = argparse.ArgumentParser() parser.add_argument( "--input_dir", - help="Location of LLaMA weights, which contains tokenizer.model and model folders", + help="Location of Llama weights, which contains tokenizer.model and model folders", ) parser.add_argument( "--model_size", @@ -416,12 +530,18 @@ def main(): help="Location to write HF model and tokenizer", ) parser.add_argument( - "--safe_serialization", default=True, type=bool, help="Whether or not to save using `safetensors`." + "--push_to_hub", + help="Whether or not to push the model to the hub at `output_dir` instead of saving it locally.", + action="store_true", + default=False, + ) + parser.add_argument( + "--safe_serialization", action="store_true", default=True, help="Whether or not to save using `safetensors`." ) # Different Llama versions used different default values for max_position_embeddings, hence the need to be able to specify which version is being used. parser.add_argument( "--llama_version", - choices=["1", "2", "3", "3.1"], + choices=["1", "2", "3", "3.1", "3.2", "Guard-3"], default="1", type=str, help="Version of the Llama model to convert. Currently supports Llama1 and Llama2. Controls the context size", @@ -440,9 +560,9 @@ def main(): ) parser.add_argument( "--instruct", + action="store_true", default=False, - type=bool, - help="Whether the model is an instruct model or not. Will affect special tokens for llama 3.1.", + help="Whether the model is an instruct model or not. 
Will affect special tokens and chat template.", ) args = parser.parse_args() if args.model_size is None and args.num_shards is None: @@ -459,8 +579,10 @@ def main(): llama_version=args.llama_version, special_tokens=args.special_tokens, instruct=args.instruct, + push_to_hub=args.push_to_hub, ) ) + if args.model_size != "tokenizer_only": write_model( model_path=args.output_dir, @@ -471,6 +593,7 @@ def main(): vocab_size=vocab_size, num_shards=args.num_shards, instruct=args.instruct, + push_to_hub=args.push_to_hub, ) From 6ba31a8a94bf7cfeaf59ffc3bc9e0b0cd3e25788 Mon Sep 17 00:00:00 2001 From: Zach Mueller Date: Thu, 17 Oct 2024 17:01:56 -0400 Subject: [PATCH 046/385] Enable users to use their own loss functions + deal with prefetching for grad accum (#34198) * bookmark * Bookmark * Bookmark * Actually implement * Pass in kwarg explicitly * Adjust for if we do or don't have labels * Bookmark fix for od * bookmark * Fin * closer * Negate accelerate grad accum div * Fixup not training long enough * Add in compute_loss to take full model output * Document * compute_loss -> compute_loss_fn * Add a test * Refactor * Refactor * Uncomment tests * Update tests/trainer/test_trainer.py Co-authored-by: Daniel Han --------- Co-authored-by: Daniel Han --- src/transformers/trainer.py | 290 ++++++++++++++++++++-------------- tests/trainer/test_trainer.py | 159 ++++++++++++++++++- 2 files changed, 325 insertions(+), 124 deletions(-) diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 7e4d1e5d267bb8..58a20f66f4e81b 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -340,12 +340,16 @@ class Trainer: The function may have zero argument, or a single one containing the optuna/Ray Tune/SigOpt trial object, to be able to choose different architectures according to hyper parameters (such as layer count, sizes of inner layers, dropout probabilities etc). + compute_loss_func (`Callable`, *optional*): + A function that accepts the raw model outputs, labels, and the number of items in the entire accumulated + batch (batch_size * gradient_accumulation_steps) and returns the loss. For example, here is one using + the loss function from `transformers` compute_metrics (`Callable[[EvalPrediction], Dict]`, *optional*): The function that will be used to compute metrics at evaluation. Must take a [`EvalPrediction`] and return a dictionary string to metric values. *Note* When passing TrainingArgs with `batch_eval_metrics` set to `True`, your compute_metrics function must take a boolean `compute_result` argument. This will be triggered after the last eval batch to signal that the function needs to calculate and return the global summary - statistics rather than accumulating the batch-level statistics. + statistics rather than accumulating the batch-level statistics callbacks (List of [`TrainerCallback`], *optional*): A list of callbacks to customize the training loop. Will add those to the list of default callbacks detailed in [here](callback). 
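A minimal sketch of a user-supplied `compute_loss_func` matching the call `self.compute_loss_func(outputs, labels, num_items_in_batch=num_items_in_batch)` introduced further down in this patch. The causal-LM shift and `ignore_index=-100` follow the `ForCausalLMLoss` helper added to `tests/trainer/test_trainer.py` below; the function name itself is illustrative, not part of the diff.

```python
import torch.nn.functional as F


def causal_lm_loss(outputs, labels, num_items_in_batch=None):
    # Shift so that tokens < n predict n, mirroring the ForCausalLMLoss test helper.
    logits = outputs["logits"].float()
    shift_logits = logits[..., :-1, :].contiguous().view(-1, logits.size(-1))
    shift_labels = labels[..., 1:].contiguous().view(-1).to(shift_logits.device)
    if num_items_in_batch is None:
        return F.cross_entropy(shift_logits, shift_labels, ignore_index=-100, reduction="mean")
    # Sum per-token losses, then normalize by the label-token count of the whole
    # accumulated batch so gradient accumulation matches full-batch training.
    loss = F.cross_entropy(shift_logits, shift_labels, ignore_index=-100, reduction="sum")
    return loss / num_items_in_batch
```

Such a callable is passed as `Trainer(model, args, ..., compute_loss_func=causal_lm_loss)`; the tests added below build the same kind of function via `functools.partial`.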
@@ -394,6 +398,7 @@ def __init__( Union[PreTrainedTokenizerBase, BaseImageProcessor, FeatureExtractionMixin, ProcessorMixin] ] = None, model_init: Optional[Callable[[], PreTrainedModel]] = None, + compute_loss_func: Optional[Callable] = None, compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None, callbacks: Optional[List[TrainerCallback]] = None, optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None), @@ -415,6 +420,7 @@ def __init__( f"You have set `args.eval_strategy` to {args.eval_strategy} but you didn't pass an `eval_dataset` to `Trainer`. Either set `args.eval_strategy` to `no` or pass an `eval_dataset`. " ) self.args = args + self.compute_loss_func = compute_loss_func # Seed must be set before instantiating the model when using model enable_full_determinism(self.args.seed) if self.args.full_determinism else set_seed(self.args.seed) @@ -2369,16 +2375,16 @@ def _inner_training_loop( total_batched_samples = 0 for epoch in range(epochs_trained, num_train_epochs): - epoch_iterator = train_dataloader - if hasattr(epoch_iterator, "set_epoch"): - epoch_iterator.set_epoch(epoch) + epoch_dataloader = train_dataloader + if hasattr(epoch_dataloader, "set_epoch"): + epoch_dataloader.set_epoch(epoch) # Reset the past mems state at the beginning of each epoch if necessary. if args.past_index >= 0: self._past = None steps_in_epoch = ( - len(epoch_iterator) + len(epoch_dataloader) if len_dataloader is not None else args.max_steps * args.gradient_accumulation_steps ) @@ -2390,142 +2396,154 @@ def _inner_training_loop( rng_to_sync = False steps_skipped = 0 if steps_trained_in_current_epoch > 0: - epoch_iterator = skip_first_batches(epoch_iterator, steps_trained_in_current_epoch) + epoch_dataloader = skip_first_batches(epoch_dataloader, steps_trained_in_current_epoch) steps_skipped = steps_trained_in_current_epoch steps_trained_in_current_epoch = 0 rng_to_sync = True step = -1 - for step, inputs in enumerate(epoch_iterator): - total_batched_samples += 1 - - if self.args.include_num_input_tokens_seen: - main_input_name = getattr(self.model, "main_input_name", "input_ids") - if main_input_name not in inputs: - logger.warning( - "Tried to track the number of tokens seen, however the current model is " - "not configured properly to know what item is the input. To fix this, add " - "a `main_input_name` attribute to the model class you are using." 
- ) + epoch_iterator = iter(epoch_dataloader) + # We chunkify the epoch iterator into gradient accumulation steps `n` batches + remainder = num_examples % args.gradient_accumulation_steps + num_items_in_batch = None + if remainder == 0: + remainder = args.gradient_accumulation_steps + update_step = -1 + total_updates = steps_in_epoch // args.gradient_accumulation_steps + 1 + for _ in range(total_updates): + update_step += 1 + num_batches = args.gradient_accumulation_steps if update_step != (total_updates - 1) else remainder + batch_samples, num_items_in_batch = self.get_batch_samples(epoch_iterator, num_batches) + for inputs in batch_samples: + step += 1 + total_batched_samples += 1 + # Since we perform prefetching, we need to manually set sync_gradients + if total_batched_samples % args.gradient_accumulation_steps != 0: + self.accelerator.gradient_state._set_sync_gradients(False) else: - self.state.num_input_tokens_seen += ( - torch.sum( - self.accelerator.gather( - torch.tensor( - inputs[main_input_name].numel(), device=self.args.device, dtype=torch.int64 - ) - ) + self.accelerator.gradient_state._set_sync_gradients(True) + + if self.args.include_num_input_tokens_seen: + main_input_name = getattr(self.model, "main_input_name", "input_ids") + if main_input_name not in inputs: + logger.warning( + "Tried to track the number of tokens seen, however the current model is " + "not configured properly to know what item is the input. To fix this, add " + "a `main_input_name` attribute to the model class you are using." ) - .cpu() - .item() - ) - if rng_to_sync: - self._load_rng_state(resume_from_checkpoint) - rng_to_sync = False - - # Skip past any already trained steps if resuming training - if steps_trained_in_current_epoch > 0: - steps_trained_in_current_epoch -= 1 - if steps_trained_progress_bar is not None: - steps_trained_progress_bar.update(1) - if steps_trained_in_current_epoch == 0: + else: + input_tokens = inputs[main_input_name].numel() + input_tokens = torch.tensor(input_tokens, device=self.args.device, dtype=torch.int64) + self.state.num_input_tokens_seen += self.accelerator.gather(input_tokens).cpu().item() + if rng_to_sync: self._load_rng_state(resume_from_checkpoint) - continue - elif steps_trained_progress_bar is not None: - steps_trained_progress_bar.close() - steps_trained_progress_bar = None - - if step % args.gradient_accumulation_steps == 0: - self.control = self.callback_handler.on_step_begin(args, self.state, self.control) - - with self.accelerator.accumulate(model): - tr_loss_step = self.training_step(model, inputs) - - if ( - args.logging_nan_inf_filter - and not is_torch_xla_available() - and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step)) - ): - # if loss is nan or inf simply add the average of previous logged losses - tr_loss = tr_loss + tr_loss / (1 + self.state.global_step - self._globalstep_last_logged) - else: - if tr_loss.device != tr_loss_step.device: - raise ValueError( - f"Calculated loss must be on the original device: {tr_loss.device} but device in use is {tr_loss_step.device}" - ) - tr_loss = tr_loss + tr_loss_step + rng_to_sync = False + + # Skip past any already trained steps if resuming training + if steps_trained_in_current_epoch > 0: + steps_trained_in_current_epoch -= 1 + if steps_trained_progress_bar is not None: + steps_trained_progress_bar.update(1) + if steps_trained_in_current_epoch == 0: + self._load_rng_state(resume_from_checkpoint) + continue + elif steps_trained_progress_bar is not None: + steps_trained_progress_bar.close() + 
steps_trained_progress_bar = None + + if step % args.gradient_accumulation_steps == 0: + self.control = self.callback_handler.on_step_begin(args, self.state, self.control) + + with self.accelerator.accumulate(model): + tr_loss_step = self.training_step(model, inputs, num_items_in_batch) + + if ( + args.logging_nan_inf_filter + and not is_torch_xla_available() + and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step)) + ): + # if loss is nan or inf simply add the average of previous logged losses + tr_loss = tr_loss + tr_loss / (1 + self.state.global_step - self._globalstep_last_logged) + else: + if tr_loss.device != tr_loss_step.device: + raise ValueError( + f"Calculated loss must be on the original device: {tr_loss.device} but device in use is {tr_loss_step.device}" + ) + tr_loss = tr_loss + tr_loss_step - self.current_flos += float(self.floating_point_ops(inputs)) + self.current_flos += float(self.floating_point_ops(inputs)) - is_last_step_and_steps_less_than_grad_acc = ( - steps_in_epoch <= args.gradient_accumulation_steps and (step + 1) == steps_in_epoch - ) + is_last_step_and_steps_less_than_grad_acc = ( + steps_in_epoch <= args.gradient_accumulation_steps and (step + 1) == steps_in_epoch + ) - if ( - total_batched_samples % args.gradient_accumulation_steps == 0 - or - # last step in epoch but step is always smaller than gradient_accumulation_steps - is_last_step_and_steps_less_than_grad_acc - ): - # the `or` condition of `is_last_step_and_steps_less_than_grad_acc` is not covered - # in accelerate. So, explicitly enable sync gradients to True in that case. - if is_last_step_and_steps_less_than_grad_acc: + if ( + (total_batched_samples) % args.gradient_accumulation_steps == 0 + or + # last step in epoch but step is always smaller than gradient_accumulation_steps + is_last_step_and_steps_less_than_grad_acc + ): + # Since we perform prefetching, we need to manually set sync_gradients to True self.accelerator.gradient_state._set_sync_gradients(True) - # Gradient clipping - if args.max_grad_norm is not None and args.max_grad_norm > 0: - # deepspeed does its own clipping - - if is_sagemaker_mp_enabled() and args.fp16: - _grad_norm = self.optimizer.clip_master_grads(args.max_grad_norm) - elif self.use_apex: - # Revert to normal clipping otherwise, handling Apex or full precision - _grad_norm = nn.utils.clip_grad_norm_( - amp.master_params(self.optimizer), - args.max_grad_norm, - ) - else: - _grad_norm = self.accelerator.clip_grad_norm_( - model.parameters(), - args.max_grad_norm, - ) - - if ( - is_accelerate_available() - and self.accelerator.distributed_type == DistributedType.DEEPSPEED - ): - grad_norm = model.get_global_grad_norm() - # In some cases the grad norm may not return a float - if hasattr(grad_norm, "item"): - grad_norm = grad_norm.item() - else: - grad_norm = _grad_norm + # Gradient clipping + if args.max_grad_norm is not None and args.max_grad_norm > 0: + # deepspeed does its own clipping + + if is_sagemaker_mp_enabled() and args.fp16: + _grad_norm = self.optimizer.clip_master_grads(args.max_grad_norm) + elif self.use_apex: + # Revert to normal clipping otherwise, handling Apex or full precision + _grad_norm = nn.utils.clip_grad_norm_( + amp.master_params(self.optimizer), + args.max_grad_norm, + ) + else: + _grad_norm = self.accelerator.clip_grad_norm_( + model.parameters(), + args.max_grad_norm, + ) - self.control = self.callback_handler.on_pre_optimizer_step(args, self.state, self.control) + if ( + is_accelerate_available() + and self.accelerator.distributed_type == 
DistributedType.DEEPSPEED + ): + grad_norm = model.get_global_grad_norm() + # In some cases the grad norm may not return a float + if hasattr(grad_norm, "item"): + grad_norm = grad_norm.item() + else: + grad_norm = _grad_norm - self.optimizer.step() + self.control = self.callback_handler.on_pre_optimizer_step(args, self.state, self.control) - self.control = self.callback_handler.on_optimizer_step(args, self.state, self.control) + self.optimizer.step() - optimizer_was_run = not self.accelerator.optimizer_step_was_skipped - if optimizer_was_run: - # Delay optimizer scheduling until metrics are generated - if not isinstance(self.lr_scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau): - self.lr_scheduler.step() + self.control = self.callback_handler.on_optimizer_step(args, self.state, self.control) - model.zero_grad() - self.state.global_step += 1 - self.state.epoch = epoch + (step + 1 + steps_skipped) / steps_in_epoch - self.control = self.callback_handler.on_step_end(args, self.state, self.control) + optimizer_was_run = not self.accelerator.optimizer_step_was_skipped + if optimizer_was_run: + # Delay optimizer scheduling until metrics are generated + if not isinstance(self.lr_scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau): + self.lr_scheduler.step() - self._maybe_log_save_evaluate(tr_loss, grad_norm, model, trial, epoch, ignore_keys_for_eval) - else: - self.control = self.callback_handler.on_substep_end(args, self.state, self.control) + model.zero_grad() + self.state.global_step += 1 + self.state.epoch = epoch + (step + 1 + steps_skipped) / steps_in_epoch + self.control = self.callback_handler.on_step_end(args, self.state, self.control) + self._maybe_log_save_evaluate(tr_loss, grad_norm, model, trial, epoch, ignore_keys_for_eval) + else: + self.control = self.callback_handler.on_substep_end(args, self.state, self.control) - if self.control.should_epoch_stop or self.control.should_training_stop: # PyTorch/XLA relies on the data loader to insert the mark_step for # each step. Since we are breaking the loop early, we need to manually # insert the mark_step here. + if self.control.should_epoch_stop or self.control.should_training_stop: + if is_torch_xla_available(): + xm.mark_step() + break + # We also need to break out of the nested loop + if self.control.should_epoch_stop or self.control.should_training_stop: if is_torch_xla_available(): xm.mark_step() break @@ -3514,7 +3532,9 @@ def autocast_smart_context_manager(self, cache_enabled: Optional[bool] = True): return ctx_manager - def training_step(self, model: nn.Module, inputs: Dict[str, Union[torch.Tensor, Any]]) -> torch.Tensor: + def training_step( + self, model: nn.Module, inputs: Dict[str, Union[torch.Tensor, Any]], num_items_in_batch=None + ) -> torch.Tensor: """ Perform a training step on a batch of inputs. 
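Why `num_items_in_batch` is threaded through `training_step` and into `compute_loss`: with token-level losses, averaging per-micro-batch means does not reproduce the full-batch mean when micro-batches contain different numbers of unpadded label tokens, which is exactly the mismatch the loss-alignment test below checks. A small numeric sketch with made-up token counts (Accelerate's own `gradient_accumulation_steps` scaling is left out):

```python
# Two micro-batches of one accumulated batch (hypothetical numbers).
summed_losses = [4.0, 9.0]  # per-micro-batch sum of token losses
label_tokens = [2, 3]       # non-ignored label tokens per micro-batch
num_items_in_batch = sum(label_tokens)  # 5

# Full-batch mean, i.e. what training without accumulation would compute:
full_batch_mean = sum(summed_losses) / num_items_in_batch  # 2.6

# Accumulating per-micro-batch means instead (reduction="mean" per step):
mean_of_means = sum(l / t for l, t in zip(summed_losses, label_tokens)) / 2  # 2.5

# Dividing each summed loss by num_items_in_batch recovers the full-batch value:
recovered = sum(l / num_items_in_batch for l in summed_losses)  # 2.6
```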
@@ -3542,7 +3562,7 @@ def training_step(self, model: nn.Module, inputs: Dict[str, Union[torch.Tensor, return loss_mb.reduce_mean().detach().to(self.args.device) with self.compute_loss_context_manager(): - loss = self.compute_loss(model, inputs) + loss = self.compute_loss(model, inputs, num_items_in_batch=num_items_in_batch) del inputs if ( @@ -3575,20 +3595,23 @@ def training_step(self, model: nn.Module, inputs: Dict[str, Union[torch.Tensor, with amp.scale_loss(loss, self.optimizer) as scaled_loss: scaled_loss.backward() else: + loss *= self.args.gradient_accumulation_steps self.accelerator.backward(loss, **kwargs) return loss.detach() / self.args.gradient_accumulation_steps - def compute_loss(self, model, inputs, return_outputs=False): + def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None): """ How the loss is computed by Trainer. By default, all models return the loss in the first element. Subclass and override for custom behavior. """ - if self.label_smoother is not None and "labels" in inputs: + if (self.label_smoother is not None or self.compute_loss_func is not None) and "labels" in inputs: labels = inputs.pop("labels") else: labels = None + # if num_items_in_batch is not None: + # inputs["num_items_in_batch"] = num_items_in_batch outputs = model(**inputs) # Save past state if it exists # TODO: this needs to be fixed and made cleaner later. @@ -3601,7 +3624,10 @@ def compute_loss(self, model, inputs, return_outputs=False): model_name = unwrapped_model.base_model.model._get_name() else: model_name = unwrapped_model._get_name() - if model_name in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values(): + # User-defined compute_loss function + if self.compute_loss_func is not None: + loss = self.compute_loss_func(outputs, labels, num_items_in_batch=num_items_in_batch) + elif model_name in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values(): loss = self.label_smoother(outputs, labels, shift_labels=True) else: loss = self.label_smoother(outputs, labels) @@ -4993,3 +5019,21 @@ def _fsdp_qlora_plugin_updates(self): fsdp_plugin.set_mixed_precision( self.model.hf_quantizer.quantization_config.bnb_4bit_quant_storage, override=True ) + + def get_batch_samples(self, epoch_iterator, num_batches): + batch_samples = [] + num_items_in_batch = None + for _ in range(num_batches): + try: + batch_samples += [next(epoch_iterator)] + except StopIteration: + break + if len(batch_samples) > 0 and "labels" in batch_samples[0]: + # For now we don't support object detection + try: + num_items_in_batch = sum( + [data_batch["labels"][..., 1:].ne(-100).sum().item() for data_batch in batch_samples] + ) + except TypeError: + pass + return batch_samples, num_items_in_batch diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index cbc93faf50e7a3..5c03355785d2b5 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -42,6 +42,7 @@ AutoImageProcessor, AutoProcessor, AutoTokenizer, + DataCollatorForLanguageModeling, IntervalStrategy, PretrainedConfig, TrainerCallback, @@ -49,6 +50,7 @@ get_polynomial_decay_schedule_with_warmup, is_torch_available, logging, + set_seed, ) from transformers.hyperparameter_search import ALL_HYPERPARAMETER_SEARCH_BACKENDS from transformers.testing_utils import ( @@ -153,6 +155,19 @@ PATH_SAMPLE_TEXT = f"{get_tests_dir()}/fixtures/sample_text.txt" +class StoreLossCallback(TrainerCallback): + """ + Simple callback to store the loss. 
+ """ + + def __init__(self): + self.losses = [] + + def on_log(self, args, state, control, logs=None, **kwargs): + if "loss" in logs: + self.losses.append(logs["loss"]) + + class MockCudaOOMCallback(TrainerCallback): """ Simple callback to simulate CUDA OOM error if @@ -168,6 +183,26 @@ def on_step_end(self, args, state, control, **kwargs): raise RuntimeError("CUDA out of memory.") +def ForCausalLMLoss(logits, labels, vocab_size, num_items_in_batch, disable_num_items_in_batch=False): + # Upcast to float if we need to compute the loss to avoid potential precision issues + logits = logits.float() + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + + # Flatten the tokens + shift_logits = shift_logits.view(-1, vocab_size) + shift_labels = shift_labels.view(-1) + # Enable model parallelism + shift_labels = shift_labels.to(shift_logits.device) + if num_items_in_batch is None or disable_num_items_in_batch: + loss = nn.functional.cross_entropy(shift_logits, shift_labels, ignore_index=-100, reduction="mean") + else: + loss = nn.functional.cross_entropy(shift_logits, shift_labels, ignore_index=-100, reduction="sum") + loss = loss / num_items_in_batch + return loss + + class RegressionDataset: def __init__(self, a=2, b=3, length=64, seed=42, label_names=None): np.random.seed(seed) @@ -438,6 +473,31 @@ def forward(self, input_x, labels=None, **kwargs): loss = nn.functional.mse_loss(y, labels) return (loss, y) + class BasicTextGenerationModel(nn.Module): + def __init__(self, vocab_size, hidden_size): + super().__init__() + self.embedding = nn.Embedding(vocab_size, hidden_size) + self.lstm = nn.LSTM(hidden_size, hidden_size, batch_first=True) + self.fc = nn.Linear(hidden_size, vocab_size) + + def forward(self, input_ids, **kwargs): + embedded = self.embedding(input_ids) + lstm_out, _ = self.lstm(embedded) + logits = self.fc(lstm_out) + return logits + + def create_dummy_dataset_for_text_generation(vocab_size, seq_length, num_samples): + import datasets + import numpy as np + + # Create random input sequences + input_ids = np.random.randint(0, vocab_size, (num_samples, seq_length)) + + # Create a datasets.Dataset + dataset = datasets.Dataset.from_dict({"input_ids": input_ids, "labels": input_ids}) + + return dataset + class TstLayer(nn.Module): def __init__(self, hidden_size): super().__init__() @@ -676,8 +736,105 @@ def test_model_init(self): trainer.train() self.check_trained_model(trainer.model, alternate_seed=True) + @slow + def test_gradient_accumulation_loss_alignment(self): + set_seed(42) + import datasets + + model_name = "distilgpt2" + dataset_name = "wikitext" + dataset_config = "wikitext-2-raw-v1" + dataset = datasets.load_dataset(dataset_name, dataset_config, split="train[:500]") + dataset = dataset.train_test_split(test_size=0.2) + tokenizer = AutoTokenizer.from_pretrained(model_name) + + def tokenize_function(examples): + return tokenizer(examples["text"]) + + tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=dataset["train"].column_names) + + tokenizer.pad_token = tokenizer.eos_token + data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False) + + model = AutoModelForCausalLM.from_pretrained(model_name) + + def compute_loss(logits, labels, vocab_size, num_items_in_batch, disable_num_items_in_batch=False): + return ForCausalLMLoss( + logits["logits"], labels, vocab_size, num_items_in_batch, disable_num_items_in_batch + ) + + loss_fn = partial(compute_loss, 
vocab_size=model.config.vocab_size, disable_num_items_in_batch=False) + + base_loss_callback = StoreLossCallback() + + args_kwargs = { + "report_to": "none", + "logging_steps": 1, + "max_steps": 20, + "learning_rate": 3e-4, + "disable_tqdm": True, + } + + args = TrainingArguments( + "./generation", + **args_kwargs, + ) + trainer = Trainer( + model, + args, + train_dataset=tokenized_dataset["train"], + callbacks=[base_loss_callback], + compute_loss_func=loss_fn, + data_collator=data_collator, + ) + trainer.train() + + grad_accum_loss_callback = StoreLossCallback() + args = TrainingArguments( + "./generation", + **args_kwargs, + gradient_accumulation_steps=2, + per_device_train_batch_size=4, + ) + set_seed(42) + model = AutoModelForCausalLM.from_pretrained(model_name) + trainer = Trainer( + model, + args, + train_dataset=tokenized_dataset["train"], + callbacks=[grad_accum_loss_callback], + compute_loss_func=loss_fn, + data_collator=data_collator, + ) + trainer.train() + + set_seed(42) + model = AutoModelForCausalLM.from_pretrained(model_name) + broken_loss_callback = StoreLossCallback() + loss_fn = partial(compute_loss, vocab_size=model.config.vocab_size, disable_num_items_in_batch=True) + trainer = Trainer( + model, + args, + train_dataset=tokenized_dataset["train"], + callbacks=[broken_loss_callback], + compute_loss_func=loss_fn, + data_collator=data_collator, + ) + trainer.train() + + # Calculate the difference between the base loss and the grad_accum loss + diff_truth = [base - grad for base, grad in zip(base_loss_callback.losses, grad_accum_loss_callback.losses)] + diff_broken = [base - grad for base, grad in zip(base_loss_callback.losses, broken_loss_callback.losses)] + # These should be quite close + for diff in diff_truth: + self.assertLess(abs(diff), 0.1, f"Difference {diff} is not within 0.1") + + # These should be very off + for diff in diff_broken: + self.assertGreater(abs(diff), 0.1, f"Difference {diff} is not greater than 0.1") + def test_gradient_accumulation(self): - # Training with half the batch size but accumulation steps as 2 should give the same results. + # Training with half the batch size but accumulation steps as 2 should give the same training losses. trainer = get_regression_trainer( gradient_accumulation_steps=2, per_device_train_batch_size=4, learning_rate=0.1 ) From b54109c7466f6e680156fbd30fa929e2e222d730 Mon Sep 17 00:00:00 2001 From: Arthur <48595927+ArthurZucker@users.noreply.github.com> Date: Thu, 17 Oct 2024 23:38:35 +0200 Subject: [PATCH 047/385] Fix-red-ci (#34230) * fix copies, skip fx for llama * styke * re-fix copies * last? 
* style --- .../models/mistral/modeling_mistral.py | 22 +++++-------------- .../models/mixtral/modeling_mixtral.py | 22 +++++-------------- .../models/qwen2/modeling_qwen2.py | 22 +++++-------------- .../models/qwen2_moe/modeling_qwen2_moe.py | 22 +++++-------------- tests/models/llama/test_modeling_llama.py | 4 ++++ 5 files changed, 24 insertions(+), 68 deletions(-) diff --git a/src/transformers/models/mistral/modeling_mistral.py b/src/transformers/models/mistral/modeling_mistral.py index 15eea1ae1f502b..82d087b23cdd2a 100644 --- a/src/transformers/models/mistral/modeling_mistral.py +++ b/src/transformers/models/mistral/modeling_mistral.py @@ -1358,6 +1358,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, + **kwargs, ) -> Union[Tuple, QuestionAnsweringModelOutput]: r""" start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): @@ -1389,29 +1390,16 @@ def forward( start_logits = start_logits.squeeze(-1).contiguous() end_logits = end_logits.squeeze(-1).contiguous() - total_loss = None + loss = None if start_positions is not None and end_positions is not None: - # If we are on multi-GPU, split add a dimension - if len(start_positions.size()) > 1: - start_positions = start_positions.squeeze(-1).to(start_logits.device) - if len(end_positions.size()) > 1: - end_positions = end_positions.squeeze(-1).to(end_logits.device) - # sometimes the start/end positions are outside our model inputs, we ignore these terms - ignored_index = start_logits.size(1) - start_positions = start_positions.clamp(0, ignored_index) - end_positions = end_positions.clamp(0, ignored_index) - - loss_fct = CrossEntropyLoss(ignore_index=ignored_index) - start_loss = loss_fct(start_logits, start_positions) - end_loss = loss_fct(end_logits, end_positions) - total_loss = (start_loss + end_loss) / 2 + loss = self.loss_function(start_logits, end_logits, start_positions, end_positions, **kwargs) if not return_dict: output = (start_logits, end_logits) + outputs[2:] - return ((total_loss,) + output) if total_loss is not None else output + return ((loss,) + output) if loss is not None else output return QuestionAnsweringModelOutput( - loss=total_loss, + loss=loss, start_logits=start_logits, end_logits=end_logits, hidden_states=outputs.hidden_states, diff --git a/src/transformers/models/mixtral/modeling_mixtral.py b/src/transformers/models/mixtral/modeling_mixtral.py index 7fbfb90cd322b5..7bf7e3ccd7ca28 100644 --- a/src/transformers/models/mixtral/modeling_mixtral.py +++ b/src/transformers/models/mixtral/modeling_mixtral.py @@ -1584,6 +1584,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, + **kwargs, ) -> Union[Tuple, QuestionAnsweringModelOutput]: r""" start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): @@ -1615,29 +1616,16 @@ def forward( start_logits = start_logits.squeeze(-1).contiguous() end_logits = end_logits.squeeze(-1).contiguous() - total_loss = None + loss = None if start_positions is not None and end_positions is not None: - # If we are on multi-GPU, split add a dimension - if len(start_positions.size()) > 1: - start_positions = start_positions.squeeze(-1).to(start_logits.device) - if len(end_positions.size()) > 1: - end_positions = end_positions.squeeze(-1).to(end_logits.device) - # sometimes the start/end positions are outside our model inputs, we ignore these terms - ignored_index = start_logits.size(1) 
- start_positions = start_positions.clamp(0, ignored_index) - end_positions = end_positions.clamp(0, ignored_index) - - loss_fct = CrossEntropyLoss(ignore_index=ignored_index) - start_loss = loss_fct(start_logits, start_positions) - end_loss = loss_fct(end_logits, end_positions) - total_loss = (start_loss + end_loss) / 2 + loss = self.loss_function(start_logits, end_logits, start_positions, end_positions, **kwargs) if not return_dict: output = (start_logits, end_logits) + outputs[2:] - return ((total_loss,) + output) if total_loss is not None else output + return ((loss,) + output) if loss is not None else output return QuestionAnsweringModelOutput( - loss=total_loss, + loss=loss, start_logits=start_logits, end_logits=end_logits, hidden_states=outputs.hidden_states, diff --git a/src/transformers/models/qwen2/modeling_qwen2.py b/src/transformers/models/qwen2/modeling_qwen2.py index d6f7cd94288a77..1941bca17add08 100644 --- a/src/transformers/models/qwen2/modeling_qwen2.py +++ b/src/transformers/models/qwen2/modeling_qwen2.py @@ -1465,6 +1465,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, + **kwargs, ) -> Union[Tuple, QuestionAnsweringModelOutput]: r""" start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): @@ -1496,29 +1497,16 @@ def forward( start_logits = start_logits.squeeze(-1).contiguous() end_logits = end_logits.squeeze(-1).contiguous() - total_loss = None + loss = None if start_positions is not None and end_positions is not None: - # If we are on multi-GPU, split add a dimension - if len(start_positions.size()) > 1: - start_positions = start_positions.squeeze(-1).to(start_logits.device) - if len(end_positions.size()) > 1: - end_positions = end_positions.squeeze(-1).to(end_logits.device) - # sometimes the start/end positions are outside our model inputs, we ignore these terms - ignored_index = start_logits.size(1) - start_positions = start_positions.clamp(0, ignored_index) - end_positions = end_positions.clamp(0, ignored_index) - - loss_fct = CrossEntropyLoss(ignore_index=ignored_index) - start_loss = loss_fct(start_logits, start_positions) - end_loss = loss_fct(end_logits, end_positions) - total_loss = (start_loss + end_loss) / 2 + loss = self.loss_function(start_logits, end_logits, start_positions, end_positions, **kwargs) if not return_dict: output = (start_logits, end_logits) + outputs[2:] - return ((total_loss,) + output) if total_loss is not None else output + return ((loss,) + output) if loss is not None else output return QuestionAnsweringModelOutput( - loss=total_loss, + loss=loss, start_logits=start_logits, end_logits=end_logits, hidden_states=outputs.hidden_states, diff --git a/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py b/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py index 5b0441e02cfbea..efeb13f90287ba 100644 --- a/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py +++ b/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py @@ -1650,6 +1650,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, + **kwargs, ) -> Union[Tuple, QuestionAnsweringModelOutput]: r""" start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): @@ -1681,29 +1682,16 @@ def forward( start_logits = start_logits.squeeze(-1).contiguous() end_logits = end_logits.squeeze(-1).contiguous() - total_loss = None + loss = None if start_positions is not None and 
end_positions is not None: - # If we are on multi-GPU, split add a dimension - if len(start_positions.size()) > 1: - start_positions = start_positions.squeeze(-1).to(start_logits.device) - if len(end_positions.size()) > 1: - end_positions = end_positions.squeeze(-1).to(end_logits.device) - # sometimes the start/end positions are outside our model inputs, we ignore these terms - ignored_index = start_logits.size(1) - start_positions = start_positions.clamp(0, ignored_index) - end_positions = end_positions.clamp(0, ignored_index) - - loss_fct = CrossEntropyLoss(ignore_index=ignored_index) - start_loss = loss_fct(start_logits, start_positions) - end_loss = loss_fct(end_logits, end_positions) - total_loss = (start_loss + end_loss) / 2 + loss = self.loss_function(start_logits, end_logits, start_positions, end_positions, **kwargs) if not return_dict: output = (start_logits, end_logits) + outputs[2:] - return ((total_loss,) + output) if total_loss is not None else output + return ((loss,) + output) if loss is not None else output return QuestionAnsweringModelOutput( - loss=total_loss, + loss=loss, start_logits=start_logits, end_logits=end_logits, hidden_states=outputs.hidden_states, diff --git a/tests/models/llama/test_modeling_llama.py b/tests/models/llama/test_modeling_llama.py index fe521ea410913c..bf7ca7848951c8 100644 --- a/tests/models/llama/test_modeling_llama.py +++ b/tests/models/llama/test_modeling_llama.py @@ -712,6 +712,10 @@ def test_eager_matches_sdpa_generate(self): msg=f"\n{tokenizer.batch_decode(res_eager)} \nvs\n{tokenizer.batch_decode(res_sdpa)}", ) + @unittest.skip("Broken by the loss update will fix soon @ArthurZucker") + def test_torch_fx_output_loss(self, *args, **kwargs): + pass + @require_torch_gpu class LlamaIntegrationTest(unittest.TestCase): From 5a5b590d060ea59433b2f666453f3314d86f98b1 Mon Sep 17 00:00:00 2001 From: Raushan Turganbay Date: Fri, 18 Oct 2024 12:17:30 +0200 Subject: [PATCH 048/385] BLIP: fix input expansion logic (#34225) fix --- src/transformers/models/blip_2/processing_blip_2.py | 4 +++- .../models/instructblip/processing_instructblip.py | 4 +++- .../models/instructblipvideo/processing_instructblipvideo.py | 4 +++- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/blip_2/processing_blip_2.py b/src/transformers/models/blip_2/processing_blip_2.py index 606aadc1eab45f..fa6a99f71a4616 100644 --- a/src/transformers/models/blip_2/processing_blip_2.py +++ b/src/transformers/models/blip_2/processing_blip_2.py @@ -137,7 +137,9 @@ def __call__( # because BLIP expects image tokens to be at the beginning even before BOS token if self.num_query_tokens is not None: image_tokens = self.image_token.content * self.num_query_tokens - image_token_encoding = self.tokenizer([image_tokens], add_special_tokens=False, return_tensors=None) + image_token_encoding = self.tokenizer( + [image_tokens] * len(text), add_special_tokens=False, return_tensors=None + ) for k in _text_encoding: text_encoding[k] = [ img_encoding + txt_encoding diff --git a/src/transformers/models/instructblip/processing_instructblip.py b/src/transformers/models/instructblip/processing_instructblip.py index dc6c9deaf17781..05ff9871f4d731 100644 --- a/src/transformers/models/instructblip/processing_instructblip.py +++ b/src/transformers/models/instructblip/processing_instructblip.py @@ -131,7 +131,9 @@ def __call__( if self.num_query_tokens is not None and images is not None: text_encoding = {} image_tokens = self.image_token.content * self.num_query_tokens - 
image_token_encoding = self.tokenizer([image_tokens], add_special_tokens=False, return_tensors=None) + image_token_encoding = self.tokenizer( + [image_tokens] * len(text), add_special_tokens=False, return_tensors=None + ) for k in _text_encoding: text_encoding[k] = [ img_encoding + txt_encoding diff --git a/src/transformers/models/instructblipvideo/processing_instructblipvideo.py b/src/transformers/models/instructblipvideo/processing_instructblipvideo.py index 39bcc6a06c3595..3e96d279a42f8d 100644 --- a/src/transformers/models/instructblipvideo/processing_instructblipvideo.py +++ b/src/transformers/models/instructblipvideo/processing_instructblipvideo.py @@ -131,7 +131,9 @@ def __call__( video_tokens = ( self.video_token.content * self.num_query_tokens * 4 ) # InstrucBLIP works with 4 frames only - video_token_encoding = self.tokenizer([video_tokens], add_special_tokens=False, return_tensors=None) + video_token_encoding = self.tokenizer( + [video_tokens] * len(text), add_special_tokens=False, return_tensors=None + ) for k in _text_encoding: text_encoding[k] = [ img_encoding + txt_encoding From 0437d6cd03f24766fec93e950c74abdc0b6183e4 Mon Sep 17 00:00:00 2001 From: byi8220 Date: Fri, 18 Oct 2024 07:54:55 -0400 Subject: [PATCH 049/385] Fix broken test decorator `require_torch_up_to_2_accelerators` (#34201) * fix broken require_torch_up_to_2_accelerators * make style --- src/transformers/testing_utils.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py index 2fc22551d37f1b..7bb2d5049dccf8 100644 --- a/src/transformers/testing_utils.py +++ b/src/transformers/testing_utils.py @@ -827,8 +827,9 @@ def require_torch_up_to_2_accelerators(test_case): if not is_torch_available(): return unittest.skip(reason="test requires PyTorch")(test_case) - return unittest.skipUnless(backend_device_count(torch_device) < 3, "test requires 0 or 1 or 2 accelerators") - (test_case) + return unittest.skipUnless(backend_device_count(torch_device) < 3, "test requires 0 or 1 or 2 accelerators")( + test_case + ) def require_torch_xla(test_case): From e95ea479eebb6e01679907db910b5dc5eb64b3c7 Mon Sep 17 00:00:00 2001 From: Lysandre Debut Date: Fri, 18 Oct 2024 14:12:15 +0200 Subject: [PATCH 050/385] Informative 2 (#34154) * Informative * style * Informative 2 * Apply suggestions from code review Co-authored-by: lewtun --------- Co-authored-by: lewtun --- .../models/marian/convert_marian_to_pytorch.py | 2 +- ...rt_maskformer_original_pytorch_checkpoint_to_pytorch.py | 3 ++- .../models/mobilevitv2/convert_mlcvnets_to_pytorch.py | 2 +- .../convert_reformer_trax_checkpoint_to_pytorch.py | 7 ++++++- 4 files changed, 10 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/marian/convert_marian_to_pytorch.py b/src/transformers/models/marian/convert_marian_to_pytorch.py index 3938bc794571b4..4cc9b15ce4d518 100644 --- a/src/transformers/models/marian/convert_marian_to_pytorch.py +++ b/src/transformers/models/marian/convert_marian_to_pytorch.py @@ -704,7 +704,7 @@ def unzip(zip_path: str, dest_dir: str) -> None: parser.add_argument( "--src", type=str, - help="path to marian model sub dir. yaml.load will be used to load the configuration file, please be weary of which file you're loading.", + help="path to marian model sub dir. 
yaml.load will be used to load the configuration file, please be wary of which file you're loading.", default="en-de", ) parser.add_argument("--dest", type=str, default=None, help="Path to the output PyTorch model.") diff --git a/src/transformers/models/maskformer/convert_maskformer_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/maskformer/convert_maskformer_original_pytorch_checkpoint_to_pytorch.py index 999eee136afbe1..8b73c6824550bc 100644 --- a/src/transformers/models/maskformer/convert_maskformer_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/maskformer/convert_maskformer_original_pytorch_checkpoint_to_pytorch.py @@ -641,7 +641,8 @@ def get_name(checkpoint_file: Path): type=Path, help=( "A directory containing the model's checkpoints. The directory has to have the following structure:" - " //.pkl" + " //.pkl\n" + "Given the files are in the pickle format, please be wary of passing it files you trust." ), ) parser.add_argument( diff --git a/src/transformers/models/mobilevitv2/convert_mlcvnets_to_pytorch.py b/src/transformers/models/mobilevitv2/convert_mlcvnets_to_pytorch.py index e0aed91cd28977..7b2f53f8d77e74 100644 --- a/src/transformers/models/mobilevitv2/convert_mlcvnets_to_pytorch.py +++ b/src/transformers/models/mobilevitv2/convert_mlcvnets_to_pytorch.py @@ -318,7 +318,7 @@ def convert_mobilevitv2_checkpoint(task_name, checkpoint_path, orig_config_path, "--orig_config_path", required=True, type=str, - help="Path to the original config file. yaml.load will be used to load the file, please be weary of which file you're loading.", + help="Path to the original config file. yaml.load will be used to load the file, please be wary of which file you're loading.", ) parser.add_argument( "--pytorch_dump_folder_path", required=True, type=str, help="Path to the output PyTorch model directory." diff --git a/src/transformers/models/reformer/convert_reformer_trax_checkpoint_to_pytorch.py b/src/transformers/models/reformer/convert_reformer_trax_checkpoint_to_pytorch.py index ad6a0775817df7..7e287a47bfed42 100755 --- a/src/transformers/models/reformer/convert_reformer_trax_checkpoint_to_pytorch.py +++ b/src/transformers/models/reformer/convert_reformer_trax_checkpoint_to_pytorch.py @@ -202,7 +202,12 @@ def convert_trax_checkpoint_to_pytorch(trax_model_pkl_path, config_file, pytorch parser = argparse.ArgumentParser() # Required parameters parser.add_argument( - "--trax_model_pkl_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." 
+ "--trax_model_pkl_path", + default=None, + type=str, + required=True, + help="Path to the TensorFlow checkpoint path.\n" + "Given the files are in the pickle format, please be wary of passing it files you trust.", ) parser.add_argument( "--config_file", From 66047640076a60e27c031726ec489e0b79143150 Mon Sep 17 00:00:00 2001 From: Cyril Vallez Date: Fri, 18 Oct 2024 17:41:12 +0200 Subject: [PATCH 051/385] add Glm (#33823) * Create modular_glm.py * Update modular_glm.py * Finalize architecture without all attentions * Add all attentions modules * Finalize modular * Update given last version * Last update * Finalize model * Finalize converter * Update convert_glm_weights_to_hf.py * style * style * Create __init__.py * Aff all inits * Update convert_glm_weights_to_hf.py * Update convert_glm_weights_to_hf.py * Update convert_glm_weights_to_hf.py * Update convert_glm_weights_to_hf.py * Update convert_glm_weights_to_hf.py * Update convert_glm_weights_to_hf.py * Update convert_glm_weights_to_hf.py * Update convert_glm_weights_to_hf.py * Update convert_glm_weights_to_hf.py * Correct the rotary embeddings * Remove apply_residual_connection_post_layernorm (always false) * remove use_rms_norm (always true) * remove past_layer_norm (always true) * Update __init__.py * Update config and license * start adding tests and doc * Add doc + style * Update test_modeling_glm.py * Add dummies * Apply correct modeling * Refactor attention to follow llama * Update __init__.py * Update convert_glm_weights_to_hf.py * Correct bias * remove linear_bias and pdrop (never used) * apply modular * Simplify converter * remove dummies + style * add model_input_names * Add pretraining_tp to config for when eager attention is used * Update modular to remove all pretraining_tp * Update test_modeling_glm.py * Update the __all__ * Update __all__ * Update __init__.py * Update test_modeling_glm.py * add revisions * Add the correct repos and revisions * style * Update __init__.py * update exports * remove import of modular files * style * Apply Llama changes + refine converter * Update convert_glm_weights_to_hf.py * Update convert_glm_weights_to_hf.py * Update convert_glm_weights_to_hf.py * Update convert_glm_weights_to_hf.py * Update convert_glm_weights_to_hf.py * Update convert_glm_weights_to_hf.py * Update convert_glm_weights_to_hf.py * Update convert_glm_weights_to_hf.py * style * Use new modular converter * add pretrainedmodel to init * style * Update test_modeling_glm.py * Move config outside modular to please CI about docstrings * Add dummies to please CI * Update glm.md * Update glm.md --- docs/source/en/_toctree.yml | 2 + docs/source/en/index.md | 1 + docs/source/en/model_doc/glm.md | 99 ++ docs/source/en/perf_infer_gpu_one.md | 2 + src/transformers/__init__.py | 18 + src/transformers/models/__init__.py | 1 + .../models/auto/configuration_auto.py | 2 + src/transformers/models/auto/modeling_auto.py | 4 + .../models/auto/tokenization_auto.py | 1 + src/transformers/models/glm/__init__.py | 27 + .../models/glm/configuration_glm.py | 136 ++ .../models/glm/convert_glm_weights_to_hf.py | 174 +++ src/transformers/models/glm/modeling_glm.py | 1313 +++++++++++++++++ src/transformers/models/glm/modular_glm.py | 188 +++ src/transformers/utils/dummy_pt_objects.py | 35 + src/transformers/utils/import_utils.py | 7 + tests/models/glm/__init__.py | 0 tests/models/glm/test_modeling_glm.py | 955 ++++++++++++ tests/test_modeling_common.py | 14 +- 19 files changed, 2975 insertions(+), 4 deletions(-) create mode 100644 
docs/source/en/model_doc/glm.md create mode 100644 src/transformers/models/glm/__init__.py create mode 100644 src/transformers/models/glm/configuration_glm.py create mode 100644 src/transformers/models/glm/convert_glm_weights_to_hf.py create mode 100644 src/transformers/models/glm/modeling_glm.py create mode 100644 src/transformers/models/glm/modular_glm.py create mode 100644 tests/models/glm/__init__.py create mode 100644 tests/models/glm/test_modeling_glm.py diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 016d7279353d95..aa975fc9d9fe6b 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -414,6 +414,8 @@ title: Gemma - local: model_doc/gemma2 title: Gemma2 + - local: model_doc/glm + title: GLM - local: model_doc/openai-gpt title: GPT - local: model_doc/gpt_neo diff --git a/docs/source/en/index.md b/docs/source/en/index.md index bdea11a2456fef..ce0ffc7db0512f 100644 --- a/docs/source/en/index.md +++ b/docs/source/en/index.md @@ -150,6 +150,7 @@ Flax), PyTorch, and/or TensorFlow. | [Gemma](model_doc/gemma) | ✅ | ❌ | ✅ | | [Gemma2](model_doc/gemma2) | ✅ | ❌ | ❌ | | [GIT](model_doc/git) | ✅ | ❌ | ❌ | +| [GLM](model_doc/glm) | ✅ | ❌ | ❌ | | [GLPN](model_doc/glpn) | ✅ | ❌ | ❌ | | [GPT Neo](model_doc/gpt_neo) | ✅ | ❌ | ✅ | | [GPT NeoX](model_doc/gpt_neox) | ✅ | ❌ | ❌ | diff --git a/docs/source/en/model_doc/glm.md b/docs/source/en/model_doc/glm.md new file mode 100644 index 00000000000000..be0b367b62ec02 --- /dev/null +++ b/docs/source/en/model_doc/glm.md @@ -0,0 +1,99 @@ + + +# GLM + +## Overview + +The GLM Model was proposed +in [ChatGLM: A Family of Large Language Models from GLM-130B to GLM-4 All Tools](https://arxiv.org/html/2406.12793v1) +by GLM Team, THUDM & ZhipuAI. + +The abstract from the paper is the following: + +*We introduce ChatGLM, an evolving family of large language models that we have been developing over time. This report +primarily focuses on the GLM-4 language series, which includes GLM-4, GLM-4-Air, and GLM-4-9B. They represent our most +capable models that are trained with all the insights and lessons gained from the preceding three generations of +ChatGLM. To date, the GLM-4 models are pre-trained on ten trillions of tokens mostly in Chinese and English, along with +a small set of corpus from 24 languages, and aligned primarily for Chinese and English usage. The high-quality alignment +is achieved via a multi-stage post-training process, which involves supervised fine-tuning and learning from human +feedback. Evaluations show that GLM-4 1) closely rivals or outperforms GPT-4 in terms of general metrics such as MMLU, +GSM8K, MATH, BBH, GPQA, and HumanEval, 2) gets close to GPT-4-Turbo in instruction following as measured by IFEval, 3) +matches GPT-4 Turbo (128K) and Claude 3 for long context tasks, and 4) outperforms GPT-4 in Chinese alignments as +measured by AlignBench. The GLM-4 All Tools model is further aligned to understand user intent and autonomously decide +when and which tool(s) to use—including web browser, Python interpreter, text-to-image model, and user-defined +functions—to effectively complete complex tasks. In practical applications, it matches and even surpasses GPT-4 All +Tools in tasks like accessing online information via web browsing and solving math problems using Python interpreter. 
+Over the course, we have open-sourced a series of models, including ChatGLM-6B (three generations), GLM-4-9B (128K, 1M), +GLM-4V-9B, WebGLM, and CodeGeeX, attracting over 10 million downloads on Hugging face in the year 2023 alone.* + +Tips: + +- This model was contributed by [THUDM](https://huggingface.co/THUDM). The most recent code can be + found [here](https://github.com/thudm/GLM-4). + + +## Usage tips + +`GLM-4` can be found on the [Huggingface Hub](https://huggingface.co/collections/THUDM/glm-4-665fcf188c414b03c2f7e3b7) + +In the following, we demonstrate how to use `glm-4-9b-chat` for the inference. Note that we have used the ChatML format for dialog, in this demo we show how to leverage `apply_chat_template` for this purpose. + +```python +>>> from transformers import AutoModelForCausalLM, AutoTokenizer +>>> device = "cuda" # the device to load the model onto + +>>> model = AutoModelForCausalLM.from_pretrained("THUDM/glm-4-9b-chat", device_map="auto") +>>> tokenizer = AutoTokenizer.from_pretrained("THUDM/glm-4-9b-chat") + +>>> prompt = "Give me a short introduction to large language model." + +>>> messages = [{"role": "user", "content": prompt}] + +>>> text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) + +>>> model_inputs = tokenizer([text], return_tensors="pt").to(device) + +>>> generated_ids = model.generate(model_inputs.input_ids, max_new_tokens=512, do_sample=True) + +>>> generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)] + +>>> response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0] +``` + +## GlmConfig + +[[autodoc]] GlmConfig + +## GlmModel + +[[autodoc]] GlmModel + - forward + +## GlmForCausalLM + +[[autodoc]] GlmForCausalLM + - forward + +## GlmForSequenceClassification + +[[autodoc]] GlmForSequenceClassification + - forward + +## GlmForTokenClassification + +[[autodoc]] GlmForTokenClassification + - forward diff --git a/docs/source/en/perf_infer_gpu_one.md b/docs/source/en/perf_infer_gpu_one.md index 2f0e9deb841d4d..9c03d06d94ad48 100644 --- a/docs/source/en/perf_infer_gpu_one.md +++ b/docs/source/en/perf_infer_gpu_one.md @@ -42,6 +42,7 @@ FlashAttention-2 is currently supported for the following architectures: * [Chameleon](https://huggingface.co/docs/transformers/model_doc/chameleon#transformers.Chameleon) * [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPModel) * [Cohere](https://huggingface.co/docs/transformers/model_doc/cohere#transformers.CohereModel) +* [GLM](https://huggingface.co/docs/transformers/model_doc/glm#transformers.GLMModel) * [Dbrx](https://huggingface.co/docs/transformers/model_doc/dbrx#transformers.DbrxModel) * [DistilBert](https://huggingface.co/docs/transformers/model_doc/distilbert#transformers.DistilBertModel) * [Gemma](https://huggingface.co/docs/transformers/model_doc/gemma#transformers.GemmaModel) @@ -216,6 +217,7 @@ For now, Transformers supports SDPA inference and training for the following arc * [CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert#transformers.CamembertModel) * [Chameleon](https://huggingface.co/docs/transformers/model_doc/chameleon#transformers.Chameleon) * [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPModel) +* [GLM](https://huggingface.co/docs/transformers/model_doc/glm#transformers.GLMModel) * [Cohere](https://huggingface.co/docs/transformers/model_doc/cohere#transformers.CohereModel) * 
[data2vec_audio](https://huggingface.co/docs/transformers/main/en/model_doc/data2vec#transformers.Data2VecAudioModel) * [Dbrx](https://huggingface.co/docs/transformers/model_doc/dbrx#transformers.DbrxModel) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index e48b2599d4c298..a926a848c3b5f1 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -454,6 +454,7 @@ "GitProcessor", "GitVisionConfig", ], + "models.glm": ["GlmConfig"], "models.glpn": ["GLPNConfig"], "models.gpt2": [ "GPT2Config", @@ -2294,6 +2295,15 @@ "GitVisionModel", ] ) + _import_structure["models.glm"].extend( + [ + "GlmForCausalLM", + "GlmForSequenceClassification", + "GlmForTokenClassification", + "GlmModel", + "GlmPreTrainedModel", + ] + ) _import_structure["models.glpn"].extend( [ "GLPNForDepthEstimation", @@ -5304,6 +5314,7 @@ GitProcessor, GitVisionConfig, ) + from .models.glm import GlmConfig from .models.glpn import GLPNConfig from .models.gpt2 import ( GPT2Config, @@ -7024,6 +7035,13 @@ GitPreTrainedModel, GitVisionModel, ) + from .models.glm import ( + GlmForCausalLM, + GlmForSequenceClassification, + GlmForTokenClassification, + GlmModel, + GlmPreTrainedModel, + ) from .models.glpn import ( GLPNForDepthEstimation, GLPNModel, diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index 069c7f90564fce..9155f629e63f91 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -97,6 +97,7 @@ gemma, gemma2, git, + glm, glpn, gpt2, gpt_bigcode, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 05d6e717be23d2..48625ea3f346cd 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -114,6 +114,7 @@ ("gemma", "GemmaConfig"), ("gemma2", "Gemma2Config"), ("git", "GitConfig"), + ("glm", "GlmConfig"), ("glpn", "GLPNConfig"), ("gpt-sw3", "GPT2Config"), ("gpt2", "GPT2Config"), @@ -416,6 +417,7 @@ ("gemma", "Gemma"), ("gemma2", "Gemma2"), ("git", "GIT"), + ("glm", "GLM"), ("glpn", "GLPN"), ("gpt-sw3", "GPT-Sw3"), ("gpt2", "OpenAI GPT-2"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 5a98e761adc13b..67c539fca66496 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -111,6 +111,7 @@ ("gemma", "GemmaModel"), ("gemma2", "Gemma2Model"), ("git", "GitModel"), + ("glm", "GlmModel"), ("glpn", "GLPNModel"), ("gpt-sw3", "GPT2Model"), ("gpt2", "GPT2Model"), @@ -486,6 +487,7 @@ ("gemma", "GemmaForCausalLM"), ("gemma2", "Gemma2ForCausalLM"), ("git", "GitForCausalLM"), + ("glm", "GlmForCausalLM"), ("gpt-sw3", "GPT2LMHeadModel"), ("gpt2", "GPT2LMHeadModel"), ("gpt_bigcode", "GPTBigCodeForCausalLM"), @@ -941,6 +943,7 @@ ("funnel", "FunnelForSequenceClassification"), ("gemma", "GemmaForSequenceClassification"), ("gemma2", "Gemma2ForSequenceClassification"), + ("glm", "GlmForSequenceClassification"), ("gpt-sw3", "GPT2ForSequenceClassification"), ("gpt2", "GPT2ForSequenceClassification"), ("gpt_bigcode", "GPTBigCodeForSequenceClassification"), @@ -1131,6 +1134,7 @@ ("funnel", "FunnelForTokenClassification"), ("gemma", "GemmaForTokenClassification"), ("gemma2", "Gemma2ForTokenClassification"), + ("glm", "GlmForTokenClassification"), ("gpt-sw3", "GPT2ForTokenClassification"), ("gpt2", "GPT2ForTokenClassification"), ("gpt_bigcode", "GPTBigCodeForTokenClassification"), diff 
--git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index 63549202969ab9..7674ea51a53377 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -204,6 +204,7 @@ ), ), ("git", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)), + ("glm", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)), ("gpt-sw3", ("GPTSw3Tokenizer" if is_sentencepiece_available() else None, None)), ("gpt2", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)), ("gpt_bigcode", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)), diff --git a/src/transformers/models/glm/__init__.py b/src/transformers/models/glm/__init__.py new file mode 100644 index 00000000000000..0636c800beea6b --- /dev/null +++ b/src/transformers/models/glm/__init__.py @@ -0,0 +1,27 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_glm import * + from .modeling_glm import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/src/transformers/models/glm/configuration_glm.py b/src/transformers/models/glm/configuration_glm.py new file mode 100644 index 00000000000000..85d32a7c691a18 --- /dev/null +++ b/src/transformers/models/glm/configuration_glm.py @@ -0,0 +1,136 @@ +# coding=utf-8 +# Copyright 2024 The GLM & ZhipuAI team and HuggingFace Inc. team. All rights reserved. +# +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ...configuration_utils import PretrainedConfig + + +class GlmConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`GlmModel`]. It is used to instantiate an Glm + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the Glm-4-9b-chat. + e.g. [THUDM/glm-4-9b-chat](https://huggingface.co/THUDM/glm-4-9b-chat) + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. 
+    Args:
+        vocab_size (`int`, *optional*, defaults to 151552):
+            Vocabulary size of the Glm model. Defines the number of different tokens that can be represented by the
+            `input_ids` passed when calling [`GlmModel`].
+        hidden_size (`int`, *optional*, defaults to 4096):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 13696):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 40):
+            Number of hidden layers in the Transformer decoder.
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        num_key_value_heads (`int`, *optional*, defaults to 2):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA), otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details, check out [this
+            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
+            `num_attention_heads`.
+        head_dim (`int`, *optional*, defaults to 128):
+            The attention head dimension.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) used in the decoder MLP.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        max_position_embeddings (`int`, *optional*, defaults to 131072):
+            The maximum sequence length that this model might ever be used with.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1.5625e-07):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether to tie weight embeddings.
+        rope_theta (`float`, *optional*, defaults to 10000.0):
+            The base period of the RoPE embeddings.
+        pad_token_id (`int`, *optional*, defaults to 151329):
+            Padding token id.
+        eos_token_id (`int` | `list`, *optional*, defaults to `[151329, 151336, 151338]`):
+            End of stream token id.
+        bos_token_id (`int`, *optional*):
+            Beginning of stream token id.
+        attention_bias (`bool`, *optional*, defaults to `True`):
+            Whether to use a bias in the query, key, value and output projection layers during self-attention.
+ ```python + >>> from transformers import GlmModel, GlmConfig + >>> # Initializing a Glm glm-4-9b-chat style configuration + >>> configuration = GlmConfig() + >>> # Initializing a model from the glm-4-9b-chat style configuration + >>> model = GlmModel(configuration) + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "glm" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=151552, + hidden_size=4096, + intermediate_size=13696, + num_hidden_layers=40, + num_attention_heads=32, + num_key_value_heads=2, + head_dim=128, + hidden_act="silu", + attention_dropout=0.0, + max_position_embeddings=131072, + initializer_range=0.02, + rms_norm_eps=0.00000015625, + use_cache=True, + tie_word_embeddings=False, + rope_theta=10000.0, + pad_token_id=151329, + eos_token_id=[151329, 151336, 151338], + bos_token_id=None, + attention_bias=True, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.head_dim = head_dim + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + self.rope_theta = rope_theta + self.attention_bias = attention_bias + self.attention_dropout = attention_dropout + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + + +__all__ = ["GlmConfig"] diff --git a/src/transformers/models/glm/convert_glm_weights_to_hf.py b/src/transformers/models/glm/convert_glm_weights_to_hf.py new file mode 100644 index 00000000000000..3878ce0d25814a --- /dev/null +++ b/src/transformers/models/glm/convert_glm_weights_to_hf.py @@ -0,0 +1,174 @@ +import argparse +import json +import os +import re + +import torch +from safetensors.torch import load_file +from tokenizers import processors + +from transformers import GlmConfig, GlmForCausalLM, PreTrainedTokenizerFast + + +# fmt: off +# `None` means we drop the key +STATE_DICT_MAPPING = { + # CausalLM keys + r"transformer.output_layer.weight": r"lm_head.weight", + + # Model keys + r"transformer.embedding.word_embeddings.weight": r"model.embed_tokens.weight", + r"transformer.rotary_pos_emb.inv_freq": None, + r"transformer.encoder.final_layernorm.weight": r"model.norm.weight", + + # Layers keys + r"transformer.encoder.layers.(\d+).input_layernorm.weight": r"model.layers.\1.input_layernorm.weight", + r"transformer.encoder.layers.(\d+).post_attention_layernorm.weight": r"model.layers.\1.post_attention_layernorm.weight", + + # Attention keys + r"transformer.encoder.layers.(\d+).self_attention.dense.weight": r"model.layers.\1.self_attn.o_proj.weight", + # qkv_proj will later be split in q|k|v|_proj + r"transformer.encoder.layers.(\d+).self_attention.query_key_value.(weight|bias)": r"model.layers.\1.self_attn.qkv_proj.\2", + + # MLP keys + r"transformer.encoder.layers.(\d+).mlp.dense_h_to_4h.weight": r"model.layers.\1.mlp.gate_up_proj.weight", + r"transformer.encoder.layers.(\d+).mlp.dense_4h_to_h.weight": r"model.layers.\1.mlp.down_proj.weight", +} +# fmt: on + + +def merge_safetensors(input_dir: str): + all_files = [os.path.join(input_dir, x) for x in os.listdir(input_dir) if x.endswith(".safetensors")] + all_files 
= sorted(all_files, key=lambda x: int(x.rsplit("-", 3)[1])) + + all_weights = {} + for file in all_files: + tensors = load_file(file) + all_weights.update(tensors) + + return all_weights + + +def map_old_key_to_new(old_key): + for pattern, replacement in STATE_DICT_MAPPING.items(): + if replacement is None: + if re.fullmatch(pattern, old_key): + return None + else: + new_key, n_replace = re.subn(pattern, replacement, old_key) + # Early exit of the loop + if n_replace > 0: + return new_key + + raise ValueError(f"Key: {old_key} could not be mapped (check the mapping).") + + +def convert_state_dict(original_state_dict: dict, config: GlmConfig): + new_dict = {} + + head_dim = config.hidden_size // config.num_attention_heads + query_size = config.num_attention_heads * head_dim + kv_size = config.num_key_value_heads * head_dim + + for old_key, value in original_state_dict.items(): + new_key = map_old_key_to_new(old_key) + if new_key is None: + continue + + if "qkv_proj." in new_key: + q_proj, k_proj, v_proj = ( + value[:query_size, ...], + value[query_size : query_size + kv_size, ...], + value[query_size + kv_size :, ...], + ) + new_dict[new_key.replace("qkv_proj.", "q_proj.")] = q_proj + new_dict[new_key.replace("qkv_proj.", "k_proj.")] = k_proj + new_dict[new_key.replace("qkv_proj.", "v_proj.")] = v_proj + else: + new_dict[new_key] = value + return new_dict + + +def convert_config(original_config: dict): + key_mapping = { + "vocab_size": "padded_vocab_size", + "intermediate_size": "ffn_hidden_size", + "num_hidden_layers": "num_layers", + "max_position_embeddings": "seq_length", + "rms_norm_eps": "layernorm_epsilon", + "head_dim": "kv_channels", + "attention_bias": "add_qkv_bias", + } + similar_keys_to_keep = [ + "num_attention_heads" "hidden_size", + "attention_dropout", + "use_cache", + "eos_token_id", + "pad_token_id", + "tie_word_embeddings", + ] + new_config_kwargs = {k: original_config[v] for k, v in key_mapping.items()} + new_config_kwargs.update({k: v for k, v in original_config.items() if k in similar_keys_to_keep}) + new_config_kwargs["num_key_value_heads"] = ( + new_config_kwargs["num_attention_heads"] + if not original_config["multi_query_attention"] + else original_config["multi_query_group_num"] + ) + new_config_kwargs["rope_theta"] = 10000.0 * getattr(original_config, "rope_ratio", 1) + + new_config = GlmConfig(**new_config_kwargs) + return new_config + + +def convert_glm_tokenizer(input_dir): + fast_tok = PreTrainedTokenizerFast.from_pretrained(input_dir, model_input_names=["input_ids", "attention_mask"]) + # Add the two tokens automatically with post processor + fast_tok._tokenizer.post_processor = processors.Sequence( + [ + processors.ByteLevel(trim_offsets=False), + processors.TemplateProcessing( + single="[gMASK]:0 :0 $A:0", + pair="[gMASK]:0 :0 $A:0 $B:1", + special_tokens=[("[gMASK]", 151331), ("", 151333)], + ), + ], + ) + + return fast_tok + + +def convert_glm_model(input_dir, output_dir): + # Load and convert config + with open(os.path.join(input_dir, "config.json")) as f: + original_config = json.load(f) + config = convert_config(original_config) + config.save_pretrained(output_dir) + + # Load and convert weights + original_state_dict = merge_safetensors(input_dir) + new_dict = convert_state_dict(original_state_dict, config) + with torch.device("meta"): + model = GlmForCausalLM(config) + model.load_state_dict(new_dict, strict=True, assign=True) + model.save_pretrained(output_dir) + + # Load and convert tokenizer + tokenizer = convert_glm_tokenizer(input_dir) + 
tokenizer.save_pretrained(output_dir) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "input_dir", + type=str, + help="Location of the local folder copied from the Hub.", + ) + parser.add_argument( + "output_dir", + type=str, + help="Location to write HF model and tokenizer", + ) + + args = parser.parse_args() + convert_glm_model(args.input_dir, args.output_dir) diff --git a/src/transformers/models/glm/modeling_glm.py b/src/transformers/models/glm/modeling_glm.py new file mode 100644 index 00000000000000..9815dbc78992ed --- /dev/null +++ b/src/transformers/models/glm/modeling_glm.py @@ -0,0 +1,1313 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/glm/modular_glm.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_glm.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# coding=utf-8 +# Copyright 2024 The GLM & ZhipuAI team and HuggingFace Inc. team. All rights reserved. +# +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import math +from typing import List, Optional, Tuple, Union + +import torch +import torch.nn as nn +import torch.utils.checkpoint +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from ...activations import ACT2FN +from ...cache_utils import Cache, DynamicCache, StaticCache +from ...generation import GenerationMixin +from ...modeling_attn_mask_utils import AttentionMaskConverter +from ...modeling_outputs import ( + BaseModelOutputWithPast, + CausalLMOutputWithPast, + SequenceClassifierOutputWithPast, + TokenClassifierOutput, +) +from ...modeling_utils import PreTrainedModel +from ...utils import ( + add_start_docstrings, + add_start_docstrings_to_model_forward, + is_flash_attn_2_available, + is_flash_attn_greater_or_equal_2_10, + logging, + replace_return_docstrings, +) +from .configuration_glm import GlmConfig + + +if is_flash_attn_2_available(): + from ...modeling_flash_attention_utils import _flash_attention_forward + +from ...modeling_flash_attention_utils import _flash_attention_forward + + +class GlmRMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + GlmRMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + return self.weight * hidden_states.to(input_dtype) + + def extra_repr(self): + return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}" + + +class GlmRotaryEmbedding(nn.Module): + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): + 
super().__init__() + + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + + inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float() / self.dim)) + self.register_buffer("inv_freq", tensor=inv_freq, persistent=False) + + @torch.no_grad() + def forward(self, x, position_ids, seq_len=None): + # x: [bs, num_attention_heads, seq_len, head_size] + self.inv_freq.to(x.device) + inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1) + position_ids_expanded = position_ids[:, None, :].float() + # Force float32 since bfloat16 loses precision on long contexts + # See https://github.com/huggingface/transformers/pull/29285 + device_type = x.device.type + device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu" + with torch.autocast(device_type=device_type, enabled=False): + freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + cos = emb.cos() + sin = emb.sin() + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) + + +class GlmMLP(nn.Module): + def __init__(self, config): + super().__init__() + + self.config = config + self.gate_up_proj = nn.Linear(config.hidden_size, 2 * config.intermediate_size, bias=False) + self.down_proj = nn.Linear(config.intermediate_size, config.hidden_size, bias=False) + + self.activation_fn = ACT2FN[config.hidden_act] + + def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor: + up_states = self.gate_up_proj(hidden_states) + + gate, up_states = up_states.chunk(2, dim=-1) + up_states = up_states * self.activation_fn(gate) + + return self.down_proj(up_states) + + +logger = logging.get_logger(__name__) + + +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., 0::2] + x2 = x[..., 1::2] + return torch.stack((-x2, x1), dim=-1).flatten(-2) + + +def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`, *optional*): + Deprecated and unused. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. 
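+
+    Note:
+        This GLM variant applies the rotary embedding in interleaved (even/odd channel) form and only to the first
+        half of each head dimension; the second half of each query/key head is left unrotated and concatenated back
+        after the rotation, as implemented below.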
+ """ + cos = cos.unsqueeze(unsqueeze_dim) + sin = sin.unsqueeze(unsqueeze_dim) + + # Interleave them instead of usual shape + cos = cos[..., : cos.shape[-1] // 2].repeat_interleave(2, dim=-1) + sin = sin[..., : sin.shape[-1] // 2].repeat_interleave(2, dim=-1) + + # Keep half for later concatenation + q, q_pass = q[..., : q.shape[-1] // 2], q[..., q.shape[-1] // 2 :] + k, k_pass = k[..., : k.shape[-1] // 2], k[..., k.shape[-1] // 2 :] + + # Apply rotary embeddings on the first half + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + + # Concatenate back to full shape + q_embed = torch.cat([q_embed, q_pass], dim=-1) + k_embed = torch.cat([k_embed, k_pass], dim=-1) + return q_embed, k_embed + + +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +class GlmAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config: GlmConfig, layer_idx: Optional[int] = None): + super().__init__() + self.config = config + self.layer_idx = layer_idx + if layer_idx is None: + logger.warning_once( + f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will " + "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` " + "when creating this class." + ) + + self.attention_dropout = config.attention_dropout + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.hidden_size // self.num_heads + self.num_key_value_heads = config.num_key_value_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.is_causal = True + self.scaling = 1 / math.sqrt(self.head_dim) + + if (self.head_dim * self.num_heads) != self.hidden_size: + raise ValueError( + f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" + f" and `num_heads`: {self.num_heads})." 
+ ) + + self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias) + self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias) + self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias) + self.o_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=False) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + cache_position: Optional[torch.LongTensor] = None, + position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.45 + **kwargs, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + cos, sin = position_embeddings + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) + + if past_key_value is not None: + # sin and cos are specific to RoPE models; cache_position needed for the static cache + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) * self.scaling + + if attention_mask is not None: # no matter the length, we just slice it + causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] + attn_weights = attn_weights + causal_mask + + # upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) + attn_output = torch.matmul(attn_weights, value_states) + + if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + + attn_output = attn_output.view(bsz, q_len, -1) + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + +class GlmFlashAttention2(GlmAttention): + """ + Glm flash attention module. This module inherits from `GlmAttention` as the weights of the module stays + untouched. The only required change would be on the forward pass where it needs to correctly call the public API of + flash attention and deal with padding tokens in case the input contains any of them. + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. 
+ # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. + # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). + self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + cache_position: Optional[torch.LongTensor] = None, + position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.45 + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + output_attentions = False + + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + # Flash attention requires the input to have the shape + # batch_size x seq_length x head_dim x hidden_dim + # therefore we just need to keep the original shape + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + cos, sin = position_embeddings + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) + + if past_key_value is not None: + # sin and cos are specific to RoPE models; cache_position needed for the static cache + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache + # to be able to avoid many of these transpose/reshape/view. + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + dropout_rate = self.attention_dropout if self.training else 0.0 + + # In PEFT, usually we cast the layer norms in float32 for training stability reasons + # therefore the input hidden states gets silently casted in float32. Hence, we need + # cast them back in the correct dtype just to be sure everything works as expected. + # This might slowdown training & inference so it is recommended to not cast the LayerNorms + # in fp32. (GlmRMSNorm handles it correctly) + + input_dtype = query_states.dtype + if input_dtype == torch.float32: + if torch.is_autocast_enabled(): + target_dtype = torch.get_autocast_gpu_dtype() + # Handle the case where the model is quantized + elif hasattr(self.config, "_pre_quantization_dtype"): + target_dtype = self.config._pre_quantization_dtype + else: + target_dtype = self.q_proj.weight.dtype + + logger.warning_once( + f"The input hidden states seems to be silently casted in float32, this might be related to" + f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" + f" {target_dtype}." 
+ ) + + query_states = query_states.to(target_dtype) + key_states = key_states.to(target_dtype) + value_states = value_states.to(target_dtype) + + attn_output = _flash_attention_forward( + query_states, + key_states, + value_states, + attention_mask, + q_len, + position_ids=position_ids, + dropout=dropout_rate, + softmax_scale=self.scaling, + sliding_window=getattr(self, "sliding_window", None), + use_top_left_mask=self._flash_attn_uses_top_left_mask, + is_causal=self.is_causal, + ) + + attn_output = attn_output.reshape(bsz, q_len, -1).contiguous() + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + +class GlmSdpaAttention(GlmAttention): + """ + Glm attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from + `GlmAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to + SDPA API. + """ + + # Adapted from GlmAttention.forward + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + cache_position: Optional[torch.LongTensor] = None, + position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.45 + **kwargs, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if output_attentions: + # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. + logger.warning_once( + "GlmModel is using GlmSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " + 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' 
+ ) + return super().forward( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + position_embeddings=position_embeddings, + ) + + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + cos, sin = position_embeddings + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) + + if past_key_value is not None: + # sin and cos are specific to RoPE models; cache_position needed for the static cache + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + causal_mask = attention_mask + if attention_mask is not None: + causal_mask = causal_mask[:, :, :, : key_states.shape[-2]] + + # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, + # Reference: https://github.com/pytorch/pytorch/issues/112577. + if query_states.device.type == "cuda" and causal_mask is not None: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + + # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment + # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling. 
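+        # Note: `is_causal` can only be True when no explicit mask is provided and more than one query token is
+        # processed; during cached decoding q_len == 1, so a causal pattern would be incorrect and must stay disabled.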
+ is_causal = True if causal_mask is None and q_len > 1 else False + + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=causal_mask, + dropout_p=self.attention_dropout if self.training else 0.0, + is_causal=is_causal, + scale=self.scaling, + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.view(bsz, q_len, -1) + + attn_output = self.o_proj(attn_output) + + return attn_output, None, past_key_value + + +GLM_ATTENTION_CLASSES = { + "eager": GlmAttention, + "flash_attention_2": GlmFlashAttention2, + "sdpa": GlmSdpaAttention, +} + + +class GlmDecoderLayer(nn.Module): + def __init__(self, config: GlmConfig, layer_idx: Optional[int] = None): + super().__init__() + self.hidden_size = config.hidden_size + + self.self_attn = GLM_ATTENTION_CLASSES[config._attn_implementation](config=config, layer_idx=layer_idx) + + self.mlp = GlmMLP(config) + self.input_layernorm = GlmRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = GlmRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + cache_position: Optional[torch.LongTensor] = None, + position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46 + **kwargs, + ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`, *optional*): + attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1, + query_sequence_length, key_sequence_length)` if default attention is used. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states + cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): + Indices depicting the position of the input sequence tokens in the sequence + position_embeddings (`Tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*): + Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`, + with `head_dim` being the embedding dimension of each attention head. 
+ kwargs (`dict`, *optional*): + Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code + into the model + """ + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + position_embeddings=position_embeddings, + **kwargs, + ) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +GLM_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`GlmConfig`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + + +@add_start_docstrings( + "The bare Glm Model outputting raw hidden-states without any specific head on top.", + GLM_START_DOCSTRING, +) +class GlmPreTrainedModel(PreTrainedModel): + config_class = GlmConfig + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["GlmDecoderLayer"] + _skip_keys_device_placement = ["past_key_values"] + _supports_flash_attn_2 = True + _supports_sdpa = True + _supports_cache_class = True + _supports_quantized_cache = True + _supports_static_cache = True + + def _init_weights(self, module): + std = self.config.initializer_range + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + +_CONFIG_FOR_DOC = "GlmConfig" + + +GLM_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. 
+ + [What are attention masks?](../glossary#attention-mask) + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + If `past_key_values` is used, optionally only the last `input_ids` have to be input (see + `past_key_values`). + + If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] + and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more + information on the default strategy. + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.n_positions - 1]`. + + [What are position IDs?](../glossary#position-ids) + past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*): + Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values` + returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. + + Two formats are allowed: + - a [`~cache_utils.Cache`] instance, see our + [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache); + - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of + shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy + cache format. + + The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the + legacy cache format will be returned. + + If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't + have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` + of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): + Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`, + this tensor is not affected by padding. It is used to update the cache in the correct position and to infer + the complete sequence length. 
+""" + + +@add_start_docstrings( + "The bare Glm Model outputting raw hidden-states without any specific head on top.", + GLM_START_DOCSTRING, +) +class GlmModel(GlmPreTrainedModel): + """ + Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`GlmDecoderLayer`] + + Args: + config: GlmConfig + """ + + def __init__(self, config: GlmConfig): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + self.layers = nn.ModuleList( + [GlmDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self.norm = GlmRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.rotary_emb = GlmRotaryEmbedding( + dim=config.head_dim // 2, max_position_embeddings=config.max_position_embeddings, base=config.rope_theta + ) + self.gradient_checkpointing = False + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + @add_start_docstrings_to_model_forward(GLM_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + ) -> Union[Tuple, BaseModelOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + + if self.gradient_checkpointing and self.training and use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`." + ) + use_cache = False + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + # kept for BC (non `Cache` `past_key_values` inputs) + return_legacy_cache = False + if use_cache and not isinstance(past_key_values, Cache): + return_legacy_cache = True + if past_key_values is None: + past_key_values = DynamicCache() + else: + past_key_values = DynamicCache.from_legacy_cache(past_key_values) + logger.warning_once( + "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and " + "will be removed in v4.47. 
Please convert your cache or use an appropriate `Cache` class " + "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)" + ) + + if cache_position is None: + past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + cache_position = torch.arange( + past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device + ) + if position_ids is None: + position_ids = cache_position.unsqueeze(0) + + causal_mask = self._update_causal_mask( + attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions + ) + hidden_states = inputs_embeds + + # create position embeddings to be shared across the decoder layers + position_embeddings = self.rotary_emb(hidden_states, position_ids) + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = None + + for decoder_layer in self.layers: + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + decoder_layer.__call__, + hidden_states, + causal_mask, + position_ids, + past_key_values, + output_attentions, + use_cache, + cache_position, + position_embeddings, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=causal_mask, + position_ids=position_ids, + past_key_value=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + position_embeddings=position_embeddings, + ) + + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache = layer_outputs[2 if output_attentions else 1] + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = next_decoder_cache if use_cache else None + if return_legacy_cache: + next_cache = next_cache.to_legacy_cache() + + if not return_dict: + return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + def _update_causal_mask( + self, + attention_mask: torch.Tensor, + input_tensor: torch.Tensor, + cache_position: torch.Tensor, + past_key_values: Cache, + output_attentions: bool, + ): + if self.config._attn_implementation == "flash_attention_2": + if attention_mask is not None and 0.0 in attention_mask: + return attention_mask + return None + + # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in + # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail + # to infer the attention mask. 
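+        # Note: the early return below (returning `None`, i.e. no explicit mask) is only taken on the SDPA path with
+        # a non-static cache and `output_attentions=False`, where SDPA's own causal handling is sufficient.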
+ past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + using_static_cache = isinstance(past_key_values, StaticCache) + + # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward + if self.config._attn_implementation == "sdpa" and not using_static_cache and not output_attentions: + if AttentionMaskConverter._ignore_causal_mask_sdpa( + attention_mask, + inputs_embeds=input_tensor, + past_key_values_length=past_seen_tokens, + is_training=self.training, + ): + return None + + dtype, device = input_tensor.dtype, input_tensor.device + sequence_length = input_tensor.shape[1] + if using_static_cache: + target_length = past_key_values.get_max_cache_shape() + else: + target_length = ( + attention_mask.shape[-1] + if isinstance(attention_mask, torch.Tensor) + else past_seen_tokens + sequence_length + 1 + ) + + # In case the provided `attention` mask is 2D, we generate a causal mask here (4D). + causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position( + attention_mask, + sequence_length=sequence_length, + target_length=target_length, + dtype=dtype, + device=device, + cache_position=cache_position, + batch_size=input_tensor.shape[0], + ) + + if ( + self.config._attn_implementation == "sdpa" + and attention_mask is not None + and attention_mask.device.type == "cuda" + and not output_attentions + ): + # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when + # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path. + # Details: https://github.com/pytorch/pytorch/issues/110213 + min_dtype = torch.finfo(dtype).min + causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype) + + return causal_mask + + @staticmethod + def _prepare_4d_causal_attention_mask_with_cache_position( + attention_mask: torch.Tensor, + sequence_length: int, + target_length: int, + dtype: torch.dtype, + device: torch.device, + cache_position: torch.Tensor, + batch_size: int, + ): + """ + Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape + `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing. + + Args: + attention_mask (`torch.Tensor`): + A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape + `(batch_size, 1, query_length, key_value_length)`. + sequence_length (`int`): + The sequence length being processed. + target_length (`int`): + The target length: when generating with static cache, the mask should be as long as the static cache, + to account for the 0 padding, the part of the cache that is not filled yet. + dtype (`torch.dtype`): + The dtype to use for the 4D attention mask. + device (`torch.device`): + The device to plcae the 4D attention mask on. + cache_position (`torch.Tensor`): + Indices depicting the position of the input sequence tokens in the sequence. + batch_size (`torch.Tensor`): + Batch size. + """ + if attention_mask is not None and attention_mask.dim() == 4: + # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing. 
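+            # "Inverted form" here means an additive mask: 0.0 where attention is allowed and a large negative value
+            # (typically `torch.finfo(dtype).min`) where it is masked.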
+ causal_mask = attention_mask + else: + min_dtype = torch.finfo(dtype).min + causal_mask = torch.full( + (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device + ) + if sequence_length != 1: + causal_mask = torch.triu(causal_mask, diagonal=1) + causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1) + causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1) + if attention_mask is not None: + causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit + mask_length = attention_mask.shape[-1] + padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :] + padding_mask = padding_mask == 0 + causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill( + padding_mask, min_dtype + ) + + return causal_mask + + +class GlmForCausalLM(GlmPreTrainedModel, GenerationMixin): + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config: GlmConfig): + super().__init__(config) + self.model = GlmModel(config) + self.vocab_size = config.vocab_size + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def set_decoder(self, decoder): + self.model = decoder + + def get_decoder(self): + return self.model + + @add_start_docstrings_to_model_forward(GLM_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + num_logits_to_keep: int = 0, + ) -> Union[Tuple, CausalLMOutputWithPast]: + r""" + Args: + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + num_logits_to_keep (`int`, *optional*): + Calculate logits for the last `num_logits_to_keep` tokens. If `0`, calculate logits for all + `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that + token can save memory, which becomes pretty significant for long sequences or large vocabulary size. + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, GlmForCausalLM + + >>> model = GlmForCausalLM.from_pretrained("google/glm-7b") + >>> tokenizer = AutoTokenizer.from_pretrained("google/glm-7b") + + >>> prompt = "What is your favorite condiment?" 
+ >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "What is your favorite condiment?" + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + cache_position=cache_position, + ) + + hidden_states = outputs[0] + # Only compute necessary logits, and do not upcast them to float if we are not computing the loss + logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :]) + + loss = None + if labels is not None: + # Upcast to float if we need to compute the loss to avoid potential precision issues + logits = logits.float() + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss() + shift_logits = shift_logits.view(-1, self.config.vocab_size) + shift_labels = shift_labels.view(-1) + # Enable model parallelism + shift_labels = shift_labels.to(shift_logits.device) + loss = loss_fct(shift_logits, shift_labels) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + The Glm Model transformer with a sequence classification head on top (linear layer). + + [`GlmForSequenceClassification`] uses the last token in order to do the classification, as other causal models + (e.g. GPT-2) do. + + Since it does classification on the last token, it requires to know the position of the last token. If a + `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If + no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the + padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in + each row of the batch). 
+ """, + GLM_START_DOCSTRING, +) +class GlmForSequenceClassification(GlmPreTrainedModel): + def __init__(self, config: GlmConfig): + super().__init__(config) + self.num_labels = config.num_labels + self.model = GlmModel(config) + self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + @add_start_docstrings_to_model_forward(GLM_INPUTS_DOCSTRING) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, SequenceClassifierOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.model( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = transformer_outputs[0] + logits = self.score(hidden_states) + + if input_ids is not None: + batch_size = input_ids.shape[0] + else: + batch_size = inputs_embeds.shape[0] + + if self.config.pad_token_id is None and batch_size != 1: + raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") + if self.config.pad_token_id is None: + sequence_lengths = -1 + else: + if input_ids is not None: + # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility + sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1 + sequence_lengths = sequence_lengths % input_ids.shape[-1] + sequence_lengths = sequence_lengths.to(logits.device) + else: + sequence_lengths = -1 + + pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] + + loss = None + if labels is not None: + labels = labels.to(logits.device) + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(pooled_logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) + 
elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(pooled_logits, labels) + if not return_dict: + output = (pooled_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutputWithPast( + loss=loss, + logits=pooled_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) + + +@add_start_docstrings( + """ + The Glm Model transformer with a token classification head on top (a linear layer on top of the hidden-states + output) e.g. for Named-Entity-Recognition (NER) tasks. + """, + GLM_START_DOCSTRING, +) +class GlmForTokenClassification(GlmPreTrainedModel): + def __init__(self, config: GlmConfig): + super().__init__(config) + self.num_labels = config.num_labels + self.model = GlmModel(config) + if getattr(config, "classifier_dropout", None) is not None: + classifier_dropout = config.classifier_dropout + elif getattr(config, "hidden_dropout", None) is not None: + classifier_dropout = config.hidden_dropout + else: + classifier_dropout = 0.1 + self.dropout = nn.Dropout(classifier_dropout) + self.score = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + @add_start_docstrings_to_model_forward(GLM_INPUTS_DOCSTRING) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, TokenClassifierOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.model( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = outputs[0] + sequence_output = self.dropout(sequence_output) + logits = self.score(sequence_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +__all__ = [ + "GlmPreTrainedModel", + "GlmModel", + "GlmForCausalLM", + "GlmForSequenceClassification", + "GlmForTokenClassification", +] diff --git a/src/transformers/models/glm/modular_glm.py b/src/transformers/models/glm/modular_glm.py new file mode 100644 index 00000000000000..55bf89d1c56b28 --- /dev/null +++ b/src/transformers/models/glm/modular_glm.py @@ -0,0 +1,188 @@ +# coding=utf-8 +# Copyright 2024 The GLM & ZhipuAI team and HuggingFace Inc. team. All rights reserved. +# +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import math +from typing import Optional + +import torch +import torch.nn as nn +import torch.utils.checkpoint + +from ...utils import logging +from ..gemma.modeling_gemma import ( + GemmaForCausalLM, + GemmaForSequenceClassification, + GemmaForTokenClassification, +) +from ..granite.modeling_granite import ( + GraniteAttention, + GraniteFlashAttention2, + GraniteSdpaAttention, +) +from ..llama.modeling_llama import ( + LlamaDecoderLayer, + LlamaModel, + LlamaPreTrainedModel, +) +from ..phi3.modeling_phi3 import ( + Phi3MLP, + Phi3RMSNorm, + Phi3RotaryEmbedding, +) +from .configuration_glm import GlmConfig + + +logger = logging.get_logger(__name__) + + +class GlmRMSNorm(Phi3RMSNorm): + pass + + +class GlmRotaryEmbedding(Phi3RotaryEmbedding): + pass + + +class GlmMLP(Phi3MLP): + pass + + +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., 0::2] + x2 = x[..., 1::2] + return torch.stack((-x2, x1), dim=-1).flatten(-2) + + +def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`, *optional*): + Deprecated and unused. 
+ unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. + """ + cos = cos.unsqueeze(unsqueeze_dim) + sin = sin.unsqueeze(unsqueeze_dim) + + # Interleave them instead of usual shape + cos = cos[..., : cos.shape[-1] // 2].repeat_interleave(2, dim=-1) + sin = sin[..., : sin.shape[-1] // 2].repeat_interleave(2, dim=-1) + + # Keep half for later concatenation + q, q_pass = q[..., : q.shape[-1] // 2], q[..., q.shape[-1] // 2 :] + k, k_pass = k[..., : k.shape[-1] // 2], k[..., k.shape[-1] // 2 :] + + # Apply rotary embeddings on the first half + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + + # Concatenate back to full shape + q_embed = torch.cat([q_embed, q_pass], dim=-1) + k_embed = torch.cat([k_embed, k_pass], dim=-1) + return q_embed, k_embed + + +class GlmAttention(GraniteAttention): + def __init__(self, config: GlmConfig, layer_idx: Optional[int] = None): + super().__init__(config, layer_idx) + self.o_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=False) + self.scaling = 1 / math.sqrt(self.head_dim) + + +class GlmFlashAttention2(GlmAttention, GraniteFlashAttention2): + pass + + +class GlmSdpaAttention(GraniteSdpaAttention): + pass + + +GLM_ATTENTION_CLASSES = { + "eager": GlmAttention, + "flash_attention_2": GlmFlashAttention2, + "sdpa": GlmSdpaAttention, +} + + +class GlmDecoderLayer(LlamaDecoderLayer): + def __init__(self, config: GlmConfig, layer_idx: Optional[int] = None): + super().__init__() + + self.mlp = GlmMLP(config) + self.input_layernorm = GlmRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = GlmRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + +class GlmPreTrainedModel(LlamaPreTrainedModel): + pass + + +class GlmModel(GlmPreTrainedModel, LlamaModel): + def __init__(self, config: GlmConfig): + super().__init__(config) + self.layers = nn.ModuleList( + [GlmDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self.norm = GlmRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.rotary_emb = GlmRotaryEmbedding( + dim=config.head_dim // 2, max_position_embeddings=config.max_position_embeddings, base=config.rope_theta + ) + self.gradient_checkpointing = False + + # Initialize weights and apply final processing + self.post_init() + + +class GlmForCausalLM(GemmaForCausalLM): + def __init__(self, config: GlmConfig): + super().__init__(config) + self.model = GlmModel(config) + self.post_init() + + +class GlmForSequenceClassification(GemmaForSequenceClassification): + def __init__(self, config: GlmConfig): + super().__init__(config) + self.model = GlmModel(config) + self.post_init() + + +class GlmForTokenClassification(GemmaForTokenClassification): + def __init__(self, config: GlmConfig): + super().__init__(config) + self.model = GlmModel(config) 
+ self.post_init() + + +__all__ = [ + "GlmPreTrainedModel", + "GlmModel", + "GlmForCausalLM", + "GlmForSequenceClassification", + "GlmForTokenClassification", +] diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index d7570c57c62f36..e109ea659c74e0 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -4368,6 +4368,41 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +class GlmForCausalLM(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class GlmForSequenceClassification(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class GlmForTokenClassification(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class GlmModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class GlmPreTrainedModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + class GLPNForDepthEstimation(metaclass=DummyObject): _backends = ["torch"] diff --git a/src/transformers/utils/import_utils.py b/src/transformers/utils/import_utils.py index 2f0cfe1d6dcec8..ed95e4b5570e68 100755 --- a/src/transformers/utils/import_utils.py +++ b/src/transformers/utils/import_utils.py @@ -1943,6 +1943,13 @@ def create_import_structure_from_path(module_path): if "__init__.py" in adjacent_modules: adjacent_modules.remove("__init__.py") + # Modular files should not be imported + def find_substring(substring, list_): + return any(substring in x for x in list_) + + if find_substring("modular_", adjacent_modules) and find_substring("modeling_", adjacent_modules): + adjacent_modules = [module for module in adjacent_modules if "modular_" not in module] + module_requirements = {} for module_name in adjacent_modules: # Only modules ending in `.py` are accepted here. diff --git a/tests/models/glm/__init__.py b/tests/models/glm/__init__.py new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/tests/models/glm/test_modeling_glm.py b/tests/models/glm/test_modeling_glm.py new file mode 100644 index 00000000000000..f703ccd5096d41 --- /dev/null +++ b/tests/models/glm/test_modeling_glm.py @@ -0,0 +1,955 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Testing suite for the PyTorch Glm model.""" + +import inspect +import tempfile +import unittest + +import numpy as np +import pytest +from parameterized import parameterized + +from transformers import AutoModelForCausalLM, AutoTokenizer, GlmConfig, is_torch_available +from transformers.testing_utils import ( + is_flaky, + require_flash_attn, + require_torch, + require_torch_accelerator, + require_torch_gpu, + require_torch_sdpa, + slow, + torch_device, +) +from transformers.utils import is_torch_bf16_available_on_device, is_torch_fp16_available_on_device + +from ...generation.test_utils import GenerationTesterMixin +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, ids_tensor +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + + from transformers import ( + GlmForCausalLM, + GlmForSequenceClassification, + GlmForTokenClassification, + GlmModel, + ) + + +@require_torch +class GlmModelTester: + config_class = GlmConfig + if is_torch_available(): + model_class = GlmModel + for_causal_lm_class = GlmForCausalLM + for_sequence_class = GlmForSequenceClassification + for_token_class = GlmForTokenClassification + + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=False, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=2, + num_attention_heads=4, + num_key_value_heads=2, + intermediate_size=37, + hidden_act="silu", + attention_dropout=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + pad_token_id=0, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.num_key_value_heads = num_key_value_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.attention_dropout = attention_dropout + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_labels = num_labels + self.num_choices = num_choices + self.pad_token_id = pad_token_id + self.scope = scope + self.head_dim = self.hidden_size // self.num_attention_heads + + # Copied from tests.models.mistral.test_modeling_mistral.MistralModelTester.prepare_config_and_inputs + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = torch.tril(torch.ones_like(input_ids).to(torch_device)) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = self.get_config() + + return 
config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + + def get_config(self): + return self.config_class( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + num_key_value_heads=self.num_key_value_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + attention_dropout=self.attention_dropout, + max_position_embeddings=self.max_position_embeddings, + initializer_range=self.initializer_range, + pad_token_id=self.pad_token_id, + head_dim=self.head_dim, + ) + + def create_and_check_model( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = self.model_class(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask) + result = model(input_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + def create_and_check_model_as_decoder( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ): + model = self.model_class(config) + model.to(torch_device) + model.eval() + result = model( + input_ids, + attention_mask=input_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + ) + result = model( + input_ids, + attention_mask=input_mask, + encoder_hidden_states=encoder_hidden_states, + ) + result = model(input_ids, attention_mask=input_mask) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + def create_and_check_for_causal_lm( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ): + model = self.for_causal_lm_class(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_decoder_model_past_large_inputs( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ): + model = self.for_causal_lm_class(config=config) + model.to(torch_device) + model.eval() + + # first forward pass + outputs = model( + input_ids, + attention_mask=input_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + use_cache=True, + ) + past_key_values = outputs.past_key_values + + # create hypothetical multiple next token and extent to next_input_ids + next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) + next_mask = ids_tensor((self.batch_size, 3), vocab_size=2) + + # append to next input_ids and + next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) + next_attention_mask = torch.cat([input_mask, next_mask], dim=-1) + + output_from_no_past = model( + next_input_ids, + attention_mask=next_attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_hidden_states=True, + )["hidden_states"][0] + output_from_past = model( + next_tokens, + attention_mask=next_attention_mask, + encoder_hidden_states=encoder_hidden_states, + 
encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + output_hidden_states=True, + )["hidden_states"][0] + + # select random slice + random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() + output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach() + output_from_past_slice = output_from_past[:, :, random_slice_idx].detach() + + self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) + + # test that outputs are equal for slice + self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) + + # Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.prepare_config_and_inputs_for_common with Llama->Glm + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} + return config, inputs_dict + + +@require_torch +class GlmModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = ( + (GlmModel, GlmForCausalLM, GlmForSequenceClassification, GlmForTokenClassification) + if is_torch_available() + else () + ) + all_generative_model_classes = (GlmForCausalLM,) if is_torch_available() else () + pipeline_model_mapping = ( + { + "feature-extraction": GlmModel, + "text-classification": GlmForSequenceClassification, + "token-classification": GlmForTokenClassification, + "text-generation": GlmForCausalLM, + } + if is_torch_available() + else {} + ) + test_headmasking = False + test_pruning = False + + # used in `test_torch_compile` + _torch_compile_test_ckpt = "THUDM/glm-4-9b" + _torch_compile_test_revision = "refs/pr/15" + + def setUp(self): + self.model_tester = GlmModelTester(self) + self.config_tester = ConfigTester(self, config_class=GlmConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_model_various_embeddings(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + for type in ["absolute", "relative_key", "relative_key_query"]: + config_and_inputs[0].position_embedding_type = type + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_Glm_sequence_classification_model(self): + config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() + print(config) + config.num_labels = 3 + input_ids = input_dict["input_ids"] + attention_mask = input_ids.ne(1).to(torch_device) + sequence_labels = ids_tensor([self.model_tester.batch_size], self.model_tester.type_sequence_label_size) + model = self.model_tester.for_sequence_class(config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels) + self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels)) + + def test_Glm_sequence_classification_model_for_single_label(self): + config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.num_labels = 3 + config.problem_type = "single_label_classification" + input_ids = input_dict["input_ids"] + attention_mask = input_ids.ne(1).to(torch_device) + sequence_labels = 
ids_tensor([self.model_tester.batch_size], self.model_tester.type_sequence_label_size) + model = self.model_tester.for_sequence_class(config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels) + self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels)) + + def test_Glm_sequence_classification_model_for_multi_label(self): + config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.num_labels = 3 + config.problem_type = "multi_label_classification" + input_ids = input_dict["input_ids"] + attention_mask = input_ids.ne(1).to(torch_device) + sequence_labels = ids_tensor( + [self.model_tester.batch_size, config.num_labels], self.model_tester.type_sequence_label_size + ).to(torch.float) + model = self.model_tester.for_sequence_class(config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels) + self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels)) + + def test_Glm_token_classification_model(self): + config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.num_labels = 3 + input_ids = input_dict["input_ids"] + attention_mask = input_ids.ne(1).to(torch_device) + token_labels = ids_tensor([self.model_tester.batch_size, self.model_tester.seq_length], config.num_labels) + model = self.model_tester.for_token_class(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=attention_mask, labels=token_labels) + self.assertEqual( + result.logits.shape, + (self.model_tester.batch_size, self.model_tester.seq_length, self.model_tester.num_labels), + ) + + @unittest.skip(reason="Glm uses GQA on all models so the KV cache is a non standard format") + def test_past_key_values_format(self): + pass + + @is_flaky() + def test_custom_4d_attention_mask(self): + """Overwrite the common test to use atol=1e-3 instead of 1e-4. 
Can still rarely fail, thus flaky.""" + for model_class in self.all_generative_model_classes: + if not model_class._supports_static_cache: + self.skipTest(f"{model_class.__name__} is not guaranteed to work with custom 4D attention masks") + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + if getattr(config, "sliding_window", 0) is not None and getattr(config, "sliding_window", 0) > 0: + self.skipTest(f"{model_class.__name__} with sliding window attention is not supported by this test") + model = model_class(config).to(device=torch_device, dtype=torch.float32) + + ( + input_ids, + position_ids, + input_ids_shared_prefix, + mask_shared_prefix, + position_ids_shared_prefix, + ) = self._get_custom_4d_mask_test_data() + + logits = model.forward(input_ids, position_ids=position_ids).logits + # logits.shape == torch.Size([3, 4, ...]) + + logits_shared_prefix = model( + input_ids_shared_prefix, + attention_mask=mask_shared_prefix, + position_ids=position_ids_shared_prefix, + )[0] + # logits_shared_prefix.shape == torch.Size([1, 6, ...]) + + out_last_tokens = logits[:, -1, :] # last tokens in each batch line + out_shared_prefix_last_tokens = logits_shared_prefix[0, -3:, :] # last three tokens + + # comparing softmax-normalized logits: + normalized_0 = torch.nn.functional.softmax(out_last_tokens) + normalized_1 = torch.nn.functional.softmax(out_shared_prefix_last_tokens) + print(torch.abs(normalized_0 - normalized_1).max()) + + torch.testing.assert_close(normalized_0, normalized_1, rtol=1e-3, atol=1e-3) + + @require_flash_attn + @require_torch_gpu + @pytest.mark.flash_attn_test + @slow + def test_flash_attn_2_generate_padding_right(self): + """Overwrite the common test as the test is flaky on tiny models.""" + model = GlmForCausalLM.from_pretrained( + "THUDM/glm-4-9b", + device_map={"": 0}, + torch_dtype=torch.bfloat16, + revision="refs/pr/15", + ) + + tokenizer = AutoTokenizer.from_pretrained("THUDM/glm-4-9b", revision="refs/pr/15") + tokenizer.padding_side = "right" + + texts = ["hi", "Hello this is a very long sentence"] + inputs = tokenizer(texts, return_tensors="pt", padding=True).to(0) + + output_native = model.generate(**inputs, max_new_tokens=15, do_sample=False) + output_native = tokenizer.batch_decode(output_native) + + model = GlmForCausalLM.from_pretrained( + "THUDM/glm-4-9b", + device_map={"": 0}, + attn_implementation="flash_attention_2", + torch_dtype=torch.bfloat16, + revision="refs/pr/15", + ) + + output_fa_2 = model.generate(**inputs, max_new_tokens=15, do_sample=False) + output_fa_2 = tokenizer.batch_decode(output_fa_2) + + self.assertListEqual(output_native, output_fa_2) + + @parameterized.expand([("float16",), ("bfloat16",), ("float32",)]) + @require_torch_sdpa + @slow + @is_flaky + def test_eager_matches_sdpa_inference(self, torch_dtype: str): + """Overwrite to add flakyness: some cases can sometimes fail""" + if torch_dtype == "float16" and not is_torch_fp16_available_on_device(torch_device): + self.skipTest(f"float16 not supported on {torch_device} (on the specific device currently used)") + + if torch_dtype == "bfloat16" and not is_torch_bf16_available_on_device(torch_device): + self.skipTest( + f"bfloat16 not supported on {torch_device} (on the specific device currently used, e.g. Nvidia T4 GPU)" + ) + + # Not sure whether it's fine to put torch.XXX in a decorator if torch is not available so hacking it here instead. 
+ if torch_dtype == "float16": + torch_dtype = torch.float16 + elif torch_dtype == "bfloat16": + torch_dtype = torch.bfloat16 + elif torch_dtype == "float32": + torch_dtype = torch.float32 + + atols = { + ("cpu", False, torch.float32): 1e-6, + ("cpu", False, torch.bfloat16): 1e-2, + ("cpu", True, torch.float32): 1e-6, + ("cpu", True, torch.bfloat16): 1e-2, + ("cuda", False, torch.float32): 1e-6, + ("cuda", False, torch.bfloat16): 1e-2, + ("cuda", False, torch.float16): 5e-3, + ("cuda", True, torch.float32): 1e-6, + ("cuda", True, torch.bfloat16): 1e-2, + ("cuda", True, torch.float16): 5e-3, + } + rtols = { + ("cpu", False, torch.float32): 1e-4, + ("cpu", False, torch.bfloat16): 1e-2, + ("cpu", True, torch.float32): 1e-4, + ("cpu", True, torch.bfloat16): 1e-2, + ("cuda", False, torch.float32): 1e-4, + ("cuda", False, torch.bfloat16): 1e-2, + ("cuda", False, torch.float16): 5e-3, + ("cuda", True, torch.float32): 1e-4, + ("cuda", True, torch.bfloat16): 3e-2, + ("cuda", True, torch.float16): 5e-3, + } + + def get_mean_reldiff(failcase, x, ref, atol, rtol): + return f"{failcase}: mean relative difference: {((x - ref).abs() / (ref.abs() + 1e-12)).mean():.3e}, torch atol = {atol}, torch rtol = {rtol}" + + for model_class in self.all_model_classes: + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + model = model_class(config) + # FIXME: we deactivate boolean mask for models using "use_mask_token" in their constructors. + # These models support masking only in the case `use_mask_token=True`. Otherwise they cannot consume an input mask. + # This means that the class needs to be instantiated much later, after `use_mask` is set, which means a significant refactor of the code. + # However masking there is not done at any layers that matters (i.e self-attention), therefore we can safely deactivate it. 
+ deactivate_mask = "use_mask_token" in inspect.signature(model_class).parameters + + is_encoder_decoder = model.config.is_encoder_decoder + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + model_sdpa = model_class.from_pretrained(tmpdirname, torch_dtype=torch_dtype) + model_sdpa = model_sdpa.eval().to(torch_device) + + self.assertTrue(model_sdpa.config._attn_implementation == "sdpa") + + model_eager = model_class.from_pretrained( + tmpdirname, + torch_dtype=torch_dtype, + attn_implementation="eager", + ) + model_eager = model_eager.eval().to(torch_device) + + self.assertTrue(model_eager.config._attn_implementation == "eager") + + for name, submodule in model_eager.named_modules(): + class_name = submodule.__class__.__name__ + if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name: + raise ValueError("The eager model should not have SDPA attention layers") + + has_sdpa = False + for name, submodule in model_sdpa.named_modules(): + class_name = submodule.__class__.__name__ + if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name: + has_sdpa = True + break + if not has_sdpa and model_sdpa.config.model_type != "falcon": + raise ValueError("The SDPA model should have SDPA attention layers") + + # We use these for loops instead of parameterized.expand just for the interest of avoiding loading/saving 16 times the model, + # but it would be nicer to have an efficient way to use parameterized.expand + fail_cases = [] + for padding_side in ["left", "right"]: + for use_mask in [False, True]: + for output_attentions in [True, False]: + can_output_attn = "output_attentions" in inspect.signature(model_sdpa.forward).parameters + if not (self.has_attentions and can_output_attn) and output_attentions: + continue + for batch_size in [1, 5]: + dummy_input = inputs_dict[model.main_input_name] + + if dummy_input.dtype in [torch.float32, torch.bfloat16, torch.float16]: + dummy_input = dummy_input.to(torch_dtype) + + dummy_input = dummy_input[:batch_size] + if dummy_input.shape[0] != batch_size: + if dummy_input.dtype in [torch.float32, torch.bfloat16, torch.float16]: + extension = torch.rand( + batch_size - dummy_input.shape[0], + *dummy_input.shape[1:], + dtype=torch_dtype, + device=torch_device, + ) + dummy_input = torch.cat((dummy_input, extension), dim=0).to(torch_device) + else: + extension = torch.randint( + high=5, + size=(batch_size - dummy_input.shape[0], *dummy_input.shape[1:]), + dtype=dummy_input.dtype, + device=torch_device, + ) + dummy_input = torch.cat((dummy_input, extension), dim=0).to(torch_device) + + if not use_mask: + dummy_attention_mask = None + else: + dummy_attention_mask = inputs_dict.get("attention_mask", None) + if dummy_attention_mask is None: + if is_encoder_decoder: + seqlen = inputs_dict.get("decoder_input_ids", dummy_input).shape[-1] + else: + seqlen = dummy_input.shape[-1] + dummy_attention_mask = ( + torch.ones(batch_size, seqlen).to(torch.int64).to(torch_device) + ) + + dummy_attention_mask = dummy_attention_mask[:batch_size] + if dummy_attention_mask.shape[0] != batch_size: + extension = torch.ones( + batch_size - dummy_attention_mask.shape[0], + *dummy_attention_mask.shape[1:], + dtype=dummy_attention_mask.dtype, + device=torch_device, + ) + dummy_attention_mask = torch.cat((dummy_attention_mask, extension), dim=0) + dummy_attention_mask = dummy_attention_mask.to(torch_device) + + dummy_attention_mask[:] = 1 + if padding_side == "left": + dummy_attention_mask[-1, :-1] = 1 + dummy_attention_mask[-1, -4:] 
= 0 + elif padding_side == "right": + dummy_attention_mask[-1, 1:] = 1 + dummy_attention_mask[-1, :3] = 0 + + for enable_kernels in [False, True]: + failcase = f"padding_side={padding_side}, use_mask={use_mask}, batch_size={batch_size}, enable_kernels={enable_kernels}" + if is_encoder_decoder: + decoder_input_ids = inputs_dict.get("decoder_input_ids", dummy_input)[ + :batch_size + ] + if decoder_input_ids.shape[0] != batch_size: + extension = torch.ones( + batch_size - decoder_input_ids.shape[0], + *decoder_input_ids.shape[1:], + dtype=decoder_input_ids.dtype, + device=torch_device, + ) + decoder_input_ids = torch.cat((decoder_input_ids, extension), dim=0) + decoder_input_ids = decoder_input_ids.to(torch_device) + + # TODO: never an `attention_mask` arg here? + processed_inputs = { + model.main_input_name: dummy_input, + "decoder_input_ids": decoder_input_ids, + "decoder_attention_mask": dummy_attention_mask, + "output_hidden_states": True, + } + else: + processed_inputs = { + model.main_input_name: dummy_input, + "output_hidden_states": True, + } + + # Otherwise fails for e.g. WhisperEncoderModel + if "attention_mask" in inspect.signature(model_eager.forward).parameters: + processed_inputs["attention_mask"] = dummy_attention_mask + + if ( + self.has_attentions + and "output_attentions" in inspect.signature(model_sdpa.forward).parameters + ): + processed_inputs["output_attentions"] = output_attentions + if not deactivate_mask and ( + "bool_masked_pos" in inspect.signature(model_eager.forward).parameters + ): + dummy_mask = torch.ones((self.model_tester.num_masks,)) + + # In case of additional token (like class) we define a custom `mask_length` + if hasattr(self.model_tester, "mask_length"): + mask_length = self.model_tester.mask_length - dummy_mask.size(0) + else: + mask_length = self.model_tester.seq_length - dummy_mask.size(0) + dummy_mask = torch.cat([dummy_mask, torch.zeros(mask_length)]) + dummy_bool_masked_pos = dummy_mask.expand(batch_size, -1).bool() + processed_inputs["bool_masked_pos"] = dummy_bool_masked_pos.to(torch_device) + + if "noise" in inspect.signature(model_eager.forward).parameters: + np.random.seed(2) + num_patches = int( + (self.model_tester.image_size // self.model_tester.patch_size) ** 2 + ) + noise = np.random.uniform(size=(batch_size, num_patches)) + processed_inputs["noise"] = torch.from_numpy(noise) + + # TODO: test gradients as well (& for FA2 as well!) + with torch.no_grad(): + with torch.backends.cuda.sdp_kernel( + enable_flash=enable_kernels, + enable_math=True, + enable_mem_efficient=enable_kernels, + ): + prepared_inputs = self._prepare_for_class(processed_inputs, model_class) + outputs_eager = model_eager(**prepared_inputs) + outputs_sdpa = model_sdpa(**prepared_inputs) + + logits_eager = ( + outputs_eager.hidden_states[-1] + if not is_encoder_decoder + else outputs_eager.decoder_hidden_states[-1] + ) + logits_sdpa = ( + outputs_sdpa.hidden_states[-1] + if not is_encoder_decoder + else outputs_sdpa.decoder_hidden_states[-1] + ) + + if torch_device in ["cpu", "cuda"]: + atol = atols[torch_device, enable_kernels, torch_dtype] + rtol = rtols[torch_device, enable_kernels, torch_dtype] + else: + atol = 1e-7 + rtol = 1e-4 + + # Masked tokens output slightly deviates - we don't mind that. 
+ if use_mask: + if padding_side == "left": + sub_sdpa = logits_sdpa[:-1] + sub_eager = logits_eager[:-1] + if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): + fail_cases.append( + get_mean_reldiff(failcase, sub_sdpa, sub_eager, atol, rtol) + ) + + sub_sdpa = logits_sdpa[-1, :-4] + sub_eager = logits_eager[-1, :-4] + if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): + fail_cases.append( + get_mean_reldiff(failcase, sub_sdpa, sub_eager, atol, rtol) + ) + + # Testing the padding tokens is not really meaningful but anyway + # sub_sdpa = logits_sdpa[-1, -4:] + # sub_eager = logits_eager[-1, -4:] + # if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): + # fail_cases.append(get_mean_reldiff(failcase, sub_sdpa, sub_eager, 4e-2, 4e-2)) + elif padding_side == "right": + sub_sdpa = logits_sdpa[:-1] + sub_eager = logits_eager[:-1] + if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): + fail_cases.append( + get_mean_reldiff(failcase, sub_sdpa, sub_eager, atol, rtol) + ) + + sub_sdpa = logits_sdpa[-1, 3:] + sub_eager = logits_eager[-1, 3:] + if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): + fail_cases.append( + get_mean_reldiff(failcase, sub_sdpa, sub_eager, atol, rtol) + ) + + # Testing the padding tokens is not really meaningful but anyway + # sub_sdpa = logits_sdpa[-1, :3] + # sub_eager = logits_eager[-1, :3] + # if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): + # fail_cases.append(get_mean_reldiff(failcase, sub_sdpa, sub_eager, 4e-2, 4e-2)) + + else: + if not torch.allclose(logits_sdpa, logits_eager, atol=atol, rtol=rtol): + fail_cases.append( + get_mean_reldiff(failcase, logits_sdpa, logits_eager, atol, rtol) + ) + + self.assertTrue(len(fail_cases) == 0, "\n".join(fail_cases)) + + @require_torch_sdpa + @slow + @is_flaky() + def test_eager_matches_sdpa_generate(self): + """Overwrite to add flakyness: outputs sometimes start to diverge after some tokens""" + + max_new_tokens = 30 + + for model_class in self.all_generative_model_classes: + if not model_class._supports_sdpa: + self.skipTest(f"{model_class.__name__} does not support SDPA") + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + dummy_input = inputs_dict[model_class.main_input_name] + if dummy_input.dtype in [torch.float32, torch.bfloat16]: + dummy_input = dummy_input.to(torch.float16) + + # make sure that all models have enough positions for generation + if hasattr(config, "max_position_embeddings"): + config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1 + + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + + dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) + + model_sdpa = model_class.from_pretrained( + tmpdirname, + torch_dtype=torch.float16, + low_cpu_mem_usage=True, + ).to(torch_device) + + self.assertTrue(model_sdpa.config._attn_implementation == "sdpa") + + model_eager = model_class.from_pretrained( + tmpdirname, + torch_dtype=torch.float16, + low_cpu_mem_usage=True, + attn_implementation="eager", + ).to(torch_device) + + self.assertTrue(model_eager.config._attn_implementation == "eager") + + for name, submodule in model_eager.named_modules(): + class_name = submodule.__class__.__name__ + if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name: + raise ValueError("The eager model should not have SDPA attention layers") + + has_sdpa = False + for name, submodule in 
model_sdpa.named_modules(): + class_name = submodule.__class__.__name__ + if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name: + has_sdpa = True + break + if not has_sdpa: + raise ValueError("The SDPA model should have SDPA attention layers") + + # Just test that a large cache works as expected + res_eager = model_eager.generate( + dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=max_new_tokens, do_sample=False + ) + + res_sdpa = model_sdpa.generate( + dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=max_new_tokens, do_sample=False + ) + + self.assertTrue(torch.allclose(res_eager, res_sdpa)) + + +@slow +@require_torch_accelerator +class GlmIntegrationTest(unittest.TestCase): + input_text = ["Hello I am doing", "Hi today"] + model_id = "THUDM/glm-4-9b" + revision = "refs/pr/15" + # This variable is used to determine which CUDA device are we using for our runners (A10 or T4) + # Depending on the hardware we get different logits / generations + cuda_compute_capability_major_version = None + + @classmethod + def setUpClass(cls): + if is_torch_available() and torch.cuda.is_available(): + # 8 is for A100 / A10 and 7 for T4 + cls.cuda_compute_capability_major_version = torch.cuda.get_device_capability()[0] + + def test_model_9b_fp16(self): + EXPECTED_TEXTS = [ + "Hello I am doing a project on the history of the internetSolution:\n\nStep 1: Introduction\nThe history of the", + "Hi today I am going to show you how to make a simple and easy to make a DIY paper flower.", + ] + + model = AutoModelForCausalLM.from_pretrained( + self.model_id, low_cpu_mem_usage=True, torch_dtype=torch.float16, revision=self.revision + ).to(torch_device) + + tokenizer = AutoTokenizer.from_pretrained(self.model_id, revision=self.revision) + inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device) + + output = model.generate(**inputs, max_new_tokens=20, do_sample=False) + output_text = tokenizer.batch_decode(output, skip_special_tokens=True) + + self.assertEqual(output_text, EXPECTED_TEXTS) + + def test_model_9b_bf16(self): + EXPECTED_TEXTS = [ + "Hello I am doing a project on the history of the internetSolution:\n\nStep 1: Introduction\nThe history of the", + "Hi today I am going to show you how to make a simple and easy to make a DIY paper flower.", + ] + + model = AutoModelForCausalLM.from_pretrained( + self.model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16, revision=self.revision + ).to(torch_device) + + tokenizer = AutoTokenizer.from_pretrained(self.model_id, revision=self.revision) + inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device) + + output = model.generate(**inputs, max_new_tokens=20, do_sample=False) + output_text = tokenizer.batch_decode(output, skip_special_tokens=True) + + self.assertEqual(output_text, EXPECTED_TEXTS) + + def test_model_9b_eager(self): + EXPECTED_TEXTS = [ + "Hello I am doing a project on the history of the internetSolution:\n\nStep 1: Introduction\nThe history of the", + "Hi today I am going to show you how to make a simple and easy to make a DIY paper flower.", + ] + + model = AutoModelForCausalLM.from_pretrained( + self.model_id, + low_cpu_mem_usage=True, + torch_dtype=torch.bfloat16, + attn_implementation="eager", + revision=self.revision, + ) + model.to(torch_device) + + tokenizer = AutoTokenizer.from_pretrained(self.model_id, revision=self.revision) + inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device) + + output = 
model.generate(**inputs, max_new_tokens=20, do_sample=False) + output_text = tokenizer.batch_decode(output, skip_special_tokens=True) + + self.assertEqual(output_text, EXPECTED_TEXTS) + + @require_torch_sdpa + def test_model_9b_sdpa(self): + EXPECTED_TEXTS = [ + "Hello I am doing a project on the history of the internetSolution:\n\nStep 1: Introduction\nThe history of the", + "Hi today I am going to show you how to make a simple and easy to make a DIY paper flower.", + ] + + model = AutoModelForCausalLM.from_pretrained( + self.model_id, + low_cpu_mem_usage=True, + torch_dtype=torch.bfloat16, + attn_implementation="sdpa", + revision=self.revision, + ) + model.to(torch_device) + + tokenizer = AutoTokenizer.from_pretrained(self.model_id, revision=self.revision) + inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device) + + output = model.generate(**inputs, max_new_tokens=20, do_sample=False) + output_text = tokenizer.batch_decode(output, skip_special_tokens=True) + + self.assertEqual(output_text, EXPECTED_TEXTS) + + @require_flash_attn + @pytest.mark.flash_attn_test + def test_model_9b_flash_attn(self): + EXPECTED_TEXTS = [ + "Hello I am doing a project on the history of the internetSolution:\n\nStep 1: Introduction\nThe history of the", + "Hi today I am going to show you how to make a simple and easy to make a DIY paper flower.", + ] + + model = AutoModelForCausalLM.from_pretrained( + self.model_id, + low_cpu_mem_usage=True, + torch_dtype=torch.bfloat16, + attn_implementation="flash_attention_2", + revision=self.revision, + ) + model.to(torch_device) + + tokenizer = AutoTokenizer.from_pretrained(self.model_id, revision=self.revision) + inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device) + + output = model.generate(**inputs, max_new_tokens=20, do_sample=False) + output_text = tokenizer.batch_decode(output, skip_special_tokens=True) + + self.assertEqual(output_text, EXPECTED_TEXTS) diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index da33bbb48c5a36..104923957568aa 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -4938,14 +4938,17 @@ def test_torch_compile(self): if not hasattr(self, "_torch_compile_test_ckpt"): self.skipTest(f"{self.__class__.__name__} doesn't have the attribute `_torch_compile_test_ckpt`.") ckpt = self._torch_compile_test_ckpt + revision = "main" if not hasattr(self, "_torch_compile_test_revision") else self._torch_compile_test_revision os.environ["TOKENIZERS_PARALLELISM"] = "false" batch_size = 1 n_iter = 3 - tokenizer = AutoTokenizer.from_pretrained(ckpt) - model = AutoModelForCausalLM.from_pretrained(ckpt, torch_dtype=torch.float16).to(torch_device) + tokenizer = AutoTokenizer.from_pretrained(ckpt, revision=revision) + model = AutoModelForCausalLM.from_pretrained(ckpt, torch_dtype=torch.float16, revision=revision).to( + torch_device + ) model.generation_config.max_new_tokens = 4 @@ -5013,11 +5016,14 @@ def test_compile_cuda_graph_time(self): if not hasattr(self, "_torch_compile_test_ckpt"): self.skipTest(f"{self.__class__.__name__} doesn't have the attribute `_torch_compile_test_ckpt`.") ckpt = self._torch_compile_test_ckpt + revision = "main" if not hasattr(self, "_torch_compile_test_revision") else self._torch_compile_test_revision os.environ["TOKENIZERS_PARALLELISM"] = "false" - tokenizer = AutoTokenizer.from_pretrained(ckpt) - model = AutoModelForCausalLM.from_pretrained(ckpt, torch_dtype=torch.float16).to(torch_device) + tokenizer = 
AutoTokenizer.from_pretrained(ckpt, revision=revision) + model = AutoModelForCausalLM.from_pretrained(ckpt, torch_dtype=torch.float16, revision=revision).to( + torch_device + ) cache_implementation = "static" if model.config.model_type == "gemma2": From e46e3bc173f472160ee5f6813520806df4762c4a Mon Sep 17 00:00:00 2001 From: Matt Date: Fri, 18 Oct 2024 16:54:58 +0100 Subject: [PATCH 052/385] Fix UDOP dtype issue (#34180) * Trigger UDOP tests * Try forcing dtype in LayoutLMV3 * Do checks to see where uint8 is getting in * Do checks to see where uint8 is getting in * Found it! * Add .astype(np.float32) * Remove forced check, make fixup * Checking where exactly the uint8 creeps in * More checking on the uint8 issues * Manually upcast in rescale() * Remove UDOP trigger --- src/transformers/image_transforms.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/image_transforms.py b/src/transformers/image_transforms.py index 4fef6012012f36..81e8d9185623aa 100644 --- a/src/transformers/image_transforms.py +++ b/src/transformers/image_transforms.py @@ -123,11 +123,11 @@ def rescale( if not isinstance(image, np.ndarray): raise TypeError(f"Input image must be of type np.ndarray, got {type(image)}") - rescaled_image = image * scale + rescaled_image = image.astype(np.float64) * scale # Numpy type promotion has changed, so always upcast first if data_format is not None: rescaled_image = to_channel_dimension_format(rescaled_image, data_format, input_data_format) - rescaled_image = rescaled_image.astype(dtype) + rescaled_image = rescaled_image.astype(dtype) # Finally downcast to the desired dtype at the end return rescaled_image From 816f4424964c1a1631e303b663fc3d68f731e923 Mon Sep 17 00:00:00 2001 From: Matthew Hoffman Date: Fri, 18 Oct 2024 09:15:26 -0700 Subject: [PATCH 053/385] Only cast logits to float when computing loss (#34147) * Only cast logits to float when computing loss Some misses from #31292 and #33902 * Move logits.float() into existing if labels is not None branch --- src/transformers/models/chameleon/modeling_chameleon.py | 3 ++- src/transformers/models/granite/modeling_granite.py | 3 ++- src/transformers/models/granitemoe/modeling_granitemoe.py | 3 ++- src/transformers/models/idefics3/modeling_idefics3.py | 3 ++- src/transformers/models/paligemma/modeling_paligemma.py | 3 ++- src/transformers/models/phimoe/modeling_phimoe.py | 8 +------- src/transformers/models/qwen2_vl/modeling_qwen2_vl.py | 3 ++- .../models/recurrent_gemma/modeling_recurrent_gemma.py | 3 ++- src/transformers/models/zamba/modeling_zamba.py | 8 +------- 9 files changed, 16 insertions(+), 21 deletions(-) diff --git a/src/transformers/models/chameleon/modeling_chameleon.py b/src/transformers/models/chameleon/modeling_chameleon.py index d0b964a7a6f484..797908277930cf 100644 --- a/src/transformers/models/chameleon/modeling_chameleon.py +++ b/src/transformers/models/chameleon/modeling_chameleon.py @@ -1602,7 +1602,6 @@ def forward( hidden_states = outputs[0] logits = self.lm_head(hidden_states) - logits = logits.float() # Disallow image tokens which does not include special begin-image and end-image tokens image_tokens = self.model.vocabulary_mapping.image_tokens @@ -1610,6 +1609,8 @@ def forward( loss = None if labels is not None: + # Upcast to float if we need to compute the loss to avoid potential precision issues + logits = logits.float() # Shift so that tokens < n predict n shift_logits = logits[..., :-1, :].contiguous() shift_labels = labels[..., 1:].contiguous() diff --git 
a/src/transformers/models/granite/modeling_granite.py b/src/transformers/models/granite/modeling_granite.py index bb8c157df30c89..50c5b538af306c 100644 --- a/src/transformers/models/granite/modeling_granite.py +++ b/src/transformers/models/granite/modeling_granite.py @@ -1101,10 +1101,11 @@ def forward( hidden_states = outputs[0] logits = self.lm_head(hidden_states) logits = logits / self.config.logits_scaling - logits = logits.float() loss = None if labels is not None: + # Upcast to float if we need to compute the loss to avoid potential precision issues + logits = logits.float() # Shift so that tokens < n predict n shift_logits = logits[..., :-1, :].contiguous() shift_labels = labels[..., 1:].contiguous() diff --git a/src/transformers/models/granitemoe/modeling_granitemoe.py b/src/transformers/models/granitemoe/modeling_granitemoe.py index f3e2d67734a703..07b42822621a3e 100644 --- a/src/transformers/models/granitemoe/modeling_granitemoe.py +++ b/src/transformers/models/granitemoe/modeling_granitemoe.py @@ -1345,10 +1345,11 @@ def forward( hidden_states = outputs[0] logits = self.lm_head(hidden_states) logits = logits / self.config.logits_scaling - logits = logits.float() loss = None if labels is not None: + # Upcast to float if we need to compute the loss to avoid potential precision issues + logits = logits.float() # Shift so that tokens < n predict n shift_logits = logits[..., :-1, :].contiguous() shift_labels = labels[..., 1:].contiguous() diff --git a/src/transformers/models/idefics3/modeling_idefics3.py b/src/transformers/models/idefics3/modeling_idefics3.py index fb9f0a7c58fa5a..748eda8c026377 100644 --- a/src/transformers/models/idefics3/modeling_idefics3.py +++ b/src/transformers/models/idefics3/modeling_idefics3.py @@ -1210,10 +1210,11 @@ def forward( hidden_states = outputs[0] logits = self.lm_head(hidden_states) - logits = logits.float() loss = None if labels is not None: + # Upcast to float if we need to compute the loss to avoid potential precision issues + logits = logits.float() labels = labels.to(logits.device) # Shift so that tokens < n predict n if attention_mask is not None: diff --git a/src/transformers/models/paligemma/modeling_paligemma.py b/src/transformers/models/paligemma/modeling_paligemma.py index 0eb2d50e0ad4c4..1607261eaac673 100644 --- a/src/transformers/models/paligemma/modeling_paligemma.py +++ b/src/transformers/models/paligemma/modeling_paligemma.py @@ -526,9 +526,10 @@ def forward( ) logits = outputs.logits - logits = logits.float() loss = None if labels is not None: + # Upcast to float if we need to compute the loss to avoid potential precision issues + logits = logits.float() shift_logits = logits[..., :-1, :] shift_labels = labels[..., 1:] if attention_mask is not None: diff --git a/src/transformers/models/phimoe/modeling_phimoe.py b/src/transformers/models/phimoe/modeling_phimoe.py index 9e24e59c64c2fe..e96eae799cda88 100644 --- a/src/transformers/models/phimoe/modeling_phimoe.py +++ b/src/transformers/models/phimoe/modeling_phimoe.py @@ -38,7 +38,6 @@ add_start_docstrings, add_start_docstrings_to_model_forward, is_flash_attn_2_available, - is_torchdynamo_compiling, logging, replace_return_docstrings, ) @@ -1463,13 +1462,8 @@ def forward( ) hidden_states = outputs[0] - if labels is None and not is_torchdynamo_compiling(): - logger.warning_once( - "Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)" - ) # Only compute necessary logits, and do not upcast them to 
float if we are not computing the loss - # TODO: remove the float() operation in v4.46 - logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :]).float() + logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :]) loss = None if labels is not None: diff --git a/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py b/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py index 5464b40546498a..f4cb84a2444eb6 100644 --- a/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py +++ b/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py @@ -1760,10 +1760,11 @@ def forward( hidden_states = outputs[0] logits = self.lm_head(hidden_states) - logits = logits.float() loss = None if labels is not None: + # Upcast to float if we need to compute the loss to avoid potential precision issues + logits = logits.float() # Shift so that tokens < n predict n shift_logits = logits[..., :-1, :].contiguous() shift_labels = labels[..., 1:].contiguous() diff --git a/src/transformers/models/recurrent_gemma/modeling_recurrent_gemma.py b/src/transformers/models/recurrent_gemma/modeling_recurrent_gemma.py index 17744188d40178..d3164b17fe130c 100644 --- a/src/transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +++ b/src/transformers/models/recurrent_gemma/modeling_recurrent_gemma.py @@ -870,9 +870,10 @@ def forward( cap = self.config.logits_soft_cap logits = nn.functional.tanh(logits / cap) * cap - logits = logits.float() loss = None if labels is not None: + # Upcast to float if we need to compute the loss to avoid potential precision issues + logits = logits.float() # Shift so that tokens < n predict n shift_logits = logits[..., :-1, :].contiguous() shift_labels = labels[..., 1:].contiguous() diff --git a/src/transformers/models/zamba/modeling_zamba.py b/src/transformers/models/zamba/modeling_zamba.py index 8a61c15e30a0a9..921d07f287dca5 100644 --- a/src/transformers/models/zamba/modeling_zamba.py +++ b/src/transformers/models/zamba/modeling_zamba.py @@ -51,7 +51,6 @@ from ...utils.import_utils import ( is_causal_conv1d_available, is_mamba_ssm_available, - is_torchdynamo_compiling, ) from .configuration_zamba import ZambaConfig @@ -1473,13 +1472,8 @@ def forward( ) hidden_states = outputs[0] - if labels is None and not is_torchdynamo_compiling(): - logger.warning_once( - "Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)" - ) # Only compute necessary logits, and do not upcast them to float if we are not computing the loss - # TODO: remove the float() operation in v4.46 - logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :]).float() + logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :]) loss = None if labels is not None: From ca541bd4f4d932f486a4116deba833b4ffaebd15 Mon Sep 17 00:00:00 2001 From: Raushan Turganbay Date: Mon, 21 Oct 2024 10:00:14 +0200 Subject: [PATCH 054/385] Generation tests: don't rely on main input name (#34228) * don't rely on main input name * update --- tests/generation/test_utils.py | 69 ++++++++++++------- .../models/reformer/test_modeling_reformer.py | 6 +- .../test_modeling_speech_to_text.py | 8 --- 3 files changed, 47 insertions(+), 36 deletions(-) diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py index 6766fa22b9b8a0..996d95eb80ff9b 100644 --- a/tests/generation/test_utils.py +++ b/tests/generation/test_utils.py @@ -410,7 +410,6 @@ def _contrastive_generate( def test_greedy_generate(self): for model_class in 
self.all_generative_model_classes: config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[model_class.main_input_name] model = model_class(config).to(torch_device).eval() output_generate = self._greedy_generate(model=model, inputs_dict=inputs_dict) @@ -418,7 +417,7 @@ def test_greedy_generate(self): if model.config.is_encoder_decoder: self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + 1) else: - self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + main_input.shape[-1]) + self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + inputs_dict["input_ids"].shape[-1]) @pytest.mark.generate def test_greedy_generate_dict_outputs(self): @@ -444,7 +443,9 @@ def test_greedy_generate_dict_outputs(self): # Retrocompatibility check self.assertIsInstance(output_generate, GreedySearchEncoderDecoderOutput) else: - self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + main_input.shape[-1]) + self.assertTrue( + output_generate.sequences.shape[-1] == self.max_new_tokens + inputs_dict["input_ids"].shape[-1] + ) self.assertIsInstance(output_generate, GenerateDecoderOnlyOutput) # Retrocompatibility check self.assertIsInstance(output_generate, GreedySearchDecoderOnlyOutput) @@ -478,7 +479,9 @@ def test_greedy_generate_dict_outputs_use_cache(self): if model.config.is_encoder_decoder: self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + 1) else: - self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + main_input.shape[-1]) + self.assertTrue( + output_generate.sequences.shape[-1] == self.max_new_tokens + inputs_dict["input_ids"].shape[-1] + ) self._check_outputs(output_generate, main_input, model.config, use_cache=True) @@ -486,7 +489,6 @@ def test_greedy_generate_dict_outputs_use_cache(self): def test_sample_generate(self): for model_class in self.all_generative_model_classes: config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[model_class.main_input_name] model = model_class(config).to(torch_device).eval() output_generate = self._sample_generate(model=model, inputs_dict=inputs_dict, num_return_sequences=1) @@ -494,7 +496,7 @@ def test_sample_generate(self): if model.config.is_encoder_decoder: self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + 1) else: - self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + main_input.shape[-1]) + self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + inputs_dict["input_ids"].shape[-1]) @pytest.mark.generate def test_sample_generate_dict_output(self): @@ -521,7 +523,9 @@ def test_sample_generate_dict_output(self): # Retrocompatibility check self.assertIsInstance(output_generate, SampleEncoderDecoderOutput) else: - self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + main_input.shape[-1]) + self.assertTrue( + output_generate.sequences.shape[-1] == self.max_new_tokens + inputs_dict["input_ids"].shape[-1] + ) self.assertIsInstance(output_generate, GenerateDecoderOnlyOutput) # Retrocompatibility check self.assertIsInstance(output_generate, SampleDecoderOnlyOutput) @@ -532,7 +536,6 @@ def test_sample_generate_dict_output(self): def test_beam_search_generate(self): for model_class in self.all_generative_model_classes: config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[model_class.main_input_name] model = model_class(config).to(torch_device).eval() @@ -542,7 +545,7 @@ def 
test_beam_search_generate(self): if model.config.is_encoder_decoder: self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + 1) else: - self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + main_input.shape[-1]) + self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + inputs_dict["input_ids"].shape[-1]) @pytest.mark.generate def test_beam_search_generate_dict_output(self): @@ -569,7 +572,9 @@ def test_beam_search_generate_dict_output(self): # Retrocompatibility check self.assertIsInstance(output_generate, BeamSearchEncoderDecoderOutput) else: - self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + main_input.shape[-1]) + self.assertTrue( + output_generate.sequences.shape[-1] == self.max_new_tokens + inputs_dict["input_ids"].shape[-1] + ) self.assertIsInstance(output_generate, GenerateBeamDecoderOnlyOutput) # Retrocompatibility check self.assertIsInstance(output_generate, BeamSearchDecoderOnlyOutput) @@ -609,7 +614,9 @@ def test_beam_search_generate_dict_outputs_use_cache(self): if model.config.is_encoder_decoder: self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + 1) else: - self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + main_input.shape[-1]) + self.assertTrue( + output_generate.sequences.shape[-1] == self.max_new_tokens + inputs_dict["input_ids"].shape[-1] + ) self._check_outputs( output_generate, @@ -647,7 +654,6 @@ def test_model_parallel_beam_search(self): def test_beam_sample_generate(self): for model_class in self.all_generative_model_classes: config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[model_class.main_input_name] model = model_class(config).to(torch_device).eval() beam_kwargs = self._get_beam_kwargs() @@ -660,7 +666,7 @@ def test_beam_sample_generate(self): if model.config.is_encoder_decoder: self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + 1) else: - self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + main_input.shape[-1]) + self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + inputs_dict["input_ids"].shape[-1]) # for VLMs inputs embeds won't match input ids unless images are encoded and merged with ids properly # no quick fix available, since obtaining image embeddings step is very model-specific @@ -712,7 +718,9 @@ def test_beam_sample_generate_dict_output(self): # Retrocompatibility check self.assertIsInstance(output_generate, BeamSampleEncoderDecoderOutput) else: - self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + main_input.shape[-1]) + self.assertTrue( + output_generate.sequences.shape[-1] == self.max_new_tokens + inputs_dict["input_ids"].shape[-1] + ) self.assertIsInstance(output_generate, GenerateBeamDecoderOnlyOutput) # Retrocompatibility check self.assertIsInstance(output_generate, BeamSampleDecoderOnlyOutput) @@ -746,7 +754,6 @@ def test_generate_without_input_ids(self): def test_group_beam_search_generate(self): for model_class in self.all_generative_model_classes: config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[model_class.main_input_name] model = model_class(config).to(torch_device).eval() # check `generate()` and `group_beam_search()` are equal @@ -759,7 +766,7 @@ def test_group_beam_search_generate(self): if model.config.is_encoder_decoder: self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + 1) else: - self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + 
main_input.shape[-1]) + self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + inputs_dict["input_ids"].shape[-1]) # check `group_beam_search` for higher than 1 `num_return_sequences` num_return_sequences = 2 @@ -772,7 +779,7 @@ def test_group_beam_search_generate(self): if model.config.is_encoder_decoder: self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + 1) else: - self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + main_input.shape[-1]) + self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + inputs_dict["input_ids"].shape[-1]) @pytest.mark.generate def test_group_beam_search_generate_dict_output(self): @@ -799,7 +806,9 @@ def test_group_beam_search_generate_dict_output(self): # Retrocompatibility check self.assertIsInstance(output_generate, BeamSearchEncoderDecoderOutput) else: - self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + main_input.shape[-1]) + self.assertTrue( + output_generate.sequences.shape[-1] == self.max_new_tokens + inputs_dict["input_ids"].shape[-1] + ) self.assertIsInstance(output_generate, GenerateBeamDecoderOnlyOutput) # Retrocompatibility check self.assertIsInstance(output_generate, BeamSearchDecoderOnlyOutput) @@ -814,7 +823,6 @@ def test_group_beam_search_generate_dict_output(self): def test_constrained_beam_search_generate(self): for model_class in self.all_generative_model_classes: config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[model_class.main_input_name] model = model_class(config).to(torch_device).eval() @@ -838,7 +846,7 @@ def test_constrained_beam_search_generate(self): if model.config.is_encoder_decoder: self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + 1) else: - self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + main_input.shape[-1]) + self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + inputs_dict["input_ids"].shape[-1]) for generation_output in output_generate: self._check_sequence_inside_sequence(force_tokens, generation_output) @@ -862,7 +870,7 @@ def test_constrained_beam_search_generate(self): if model.config.is_encoder_decoder: self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + 1) else: - self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + main_input.shape[-1]) + self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + inputs_dict["input_ids"].shape[-1]) for generation_output in output_generate: self._check_sequence_inside_sequence(force_tokens, generation_output) @@ -903,7 +911,9 @@ def test_constrained_beam_search_generate_dict_output(self): # Retrocompatibility check self.assertIsInstance(output_generate, BeamSearchEncoderDecoderOutput) else: - self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + main_input.shape[-1]) + self.assertTrue( + output_generate.sequences.shape[-1] == self.max_new_tokens + inputs_dict["input_ids"].shape[-1] + ) self.assertIsInstance(output_generate, GenerateBeamDecoderOnlyOutput) # Retrocompatibility check self.assertIsInstance(output_generate, BeamSearchDecoderOnlyOutput) @@ -923,7 +933,6 @@ def test_contrastive_generate(self): self.skipTest(reason="Won't fix: old model with different cache format") config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[model_class.main_input_name] # NOTE: contrastive search only works with cache on at the moment. 
if not hasattr(config, "use_cache"): @@ -940,7 +949,7 @@ def test_contrastive_generate(self): if model.config.is_encoder_decoder: self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + 1) else: - self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + main_input.shape[-1]) + self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + inputs_dict["input_ids"].shape[-1]) @pytest.mark.generate def test_contrastive_generate_dict_outputs_use_cache(self): @@ -975,7 +984,9 @@ def test_contrastive_generate_dict_outputs_use_cache(self): if model.config.is_encoder_decoder: self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + 1) else: - self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + main_input.shape[-1]) + self.assertTrue( + output_generate.sequences.shape[-1] == self.max_new_tokens + inputs_dict["input_ids"].shape[-1] + ) self._check_outputs(output_generate, main_input, model.config, use_cache=True) @@ -2035,8 +2046,14 @@ def test_inherits_generation_mixin(self): self.assertTrue("GenerationMixin" in str(model_class.__bases__)) def _check_outputs(self, output, main_input, config, use_cache=False, num_return_sequences=1): + # we can be sure what is batch size from main input but seq length depends on model type and whether input is text/audio/image + # so we infer actual text seq length from model_tester, same was as it is done in `test_modeling_common.py` tests` batch_size = main_input.shape[0] - seq_length = main_input.shape[-1] + + seq_length = getattr(self.model_tester, "seq_length", None) + seq_length = getattr(self.model_tester, "encoder_seq_length", seq_length) + seq_length = getattr(self.model_tester, "text_seq_length", seq_length) + config = config.text_config if hasattr(config, "text_config") else config num_sequences_in_output = batch_size * num_return_sequences diff --git a/tests/models/reformer/test_modeling_reformer.py b/tests/models/reformer/test_modeling_reformer.py index 774831791fe5aa..25b28477a145ec 100644 --- a/tests/models/reformer/test_modeling_reformer.py +++ b/tests/models/reformer/test_modeling_reformer.py @@ -53,6 +53,7 @@ def __init__( parent, batch_size=13, seq_length=32, + text_seq_length=None, is_training=True, is_decoder=True, use_input_mask=True, @@ -128,6 +129,7 @@ def __init__( self.attn_layers = attn_layers self.pad_token_id = pad_token_id self.hash_seed = hash_seed + self.text_seq_length = text_seq_length or seq_length attn_chunk_length = local_attn_chunk_length if local_attn_chunk_length is not None else lsh_attn_chunk_length num_chunks_after = local_num_chunks_after if local_num_chunks_after is not None else lsh_num_chunks_after @@ -608,7 +610,7 @@ class ReformerLocalAttnModelTest(ReformerTesterMixin, GenerationTesterMixin, Mod test_sequence_classification_problem_types = True def setUp(self): - self.model_tester = ReformerModelTester(self) + self.model_tester = ReformerModelTester(self, text_seq_length=16) self.config_tester = ConfigTester(self, config_class=ReformerConfig, hidden_size=37) @slow @@ -689,7 +691,7 @@ def prepare_config_and_inputs_for_generate(self, *args, **kwargs): # decreasing the seq_length in tester causes errors for "training_tests", those need exactly max seq length # NOTE: seq_length has to be multiple of 4, otherwise it fails for other tests original_sequence_length = self.model_tester.seq_length - self.model_tester.seq_length = 16 + self.model_tester.seq_length = self.model_tester.text_seq_length test_inputs = 
super().prepare_config_and_inputs_for_generate(*args, **kwargs) self.model_tester.seq_length = original_sequence_length return test_inputs diff --git a/tests/models/speech_to_text/test_modeling_speech_to_text.py b/tests/models/speech_to_text/test_modeling_speech_to_text.py index 50446d4628af8c..253cda7e49cb14 100644 --- a/tests/models/speech_to_text/test_modeling_speech_to_text.py +++ b/tests/models/speech_to_text/test_modeling_speech_to_text.py @@ -618,14 +618,6 @@ def test_resize_embeddings_untied(self): def test_generate_without_input_ids(self): pass - def _check_outputs(self, output, main_input, config, use_cache=False, num_return_sequences=1): - # In this model, the index of `batch_size` and `sequence_length`` in `main_input` is different: they are the - # first two dimensions of the tensor. - main_input = main_input[:, :, 0] - super()._check_outputs( - output, main_input, config, use_cache=use_cache, num_return_sequences=num_return_sequences - ) - def _create_and_check_torchscript(self, config, inputs_dict): if not self.test_torchscript: self.skipTest(reason="test_torchscript is set to False") From 24bdc94da5d915afc8c766f8da8acdd509d808ff Mon Sep 17 00:00:00 2001 From: Yoni Gozlan <74535834+yonigozlan@users.noreply.github.com> Date: Mon, 21 Oct 2024 08:55:27 -0400 Subject: [PATCH 055/385] Change Paligemma import logging to work with modular (#34211) * change import logging * fix CI --- src/transformers/models/glm/modeling_glm.py | 41 +++---------------- .../models/paligemma/processing_paligemma.py | 4 +- 2 files changed, 7 insertions(+), 38 deletions(-) diff --git a/src/transformers/models/glm/modeling_glm.py b/src/transformers/models/glm/modeling_glm.py index 9815dbc78992ed..a458c02a6feda7 100644 --- a/src/transformers/models/glm/modeling_glm.py +++ b/src/transformers/models/glm/modeling_glm.py @@ -25,7 +25,6 @@ import torch import torch.nn as nn import torch.utils.checkpoint -from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from ...activations import ACT2FN from ...cache_utils import Cache, DynamicCache, StaticCache @@ -921,6 +920,7 @@ def _prepare_4d_causal_attention_mask_with_cache_position( device: torch.device, cache_position: torch.Tensor, batch_size: int, + **kwargs, ): """ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape @@ -1071,18 +1071,7 @@ def forward( loss = None if labels is not None: - # Upcast to float if we need to compute the loss to avoid potential precision issues - logits = logits.float() - # Shift so that tokens < n predict n - shift_logits = logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - loss_fct = CrossEntropyLoss() - shift_logits = shift_logits.view(-1, self.config.vocab_size) - shift_labels = shift_labels.view(-1) - # Enable model parallelism - shift_labels = shift_labels.to(shift_logits.device) - loss = loss_fct(shift_logits, shift_labels) + loss = self.loss_function(logits, labels, self.vocab_size) if not return_dict: output = (logits,) + outputs[1:] @@ -1186,27 +1175,8 @@ def forward( loss = None if labels is not None: - labels = labels.to(logits.device) - if self.config.problem_type is None: - if self.num_labels == 1: - self.config.problem_type = "regression" - elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): - self.config.problem_type = "single_label_classification" - else: - self.config.problem_type = "multi_label_classification" - - if self.config.problem_type == "regression": - 
loss_fct = MSELoss() - if self.num_labels == 1: - loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) - else: - loss = loss_fct(pooled_logits, labels) - elif self.config.problem_type == "single_label_classification": - loss_fct = CrossEntropyLoss() - loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) - elif self.config.problem_type == "multi_label_classification": - loss_fct = BCEWithLogitsLoss() - loss = loss_fct(pooled_logits, labels) + loss = self.loss_function(logits=logits, labels=labels, pooled_logits=pooled_logits, config=self.config) + if not return_dict: output = (pooled_logits,) + transformer_outputs[1:] return ((loss,) + output) if loss is not None else output @@ -1289,8 +1259,7 @@ def forward( loss = None if labels is not None: - loss_fct = CrossEntropyLoss() - loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + loss = self.loss_function(logits, labels, self.config) if not return_dict: output = (logits,) + outputs[2:] diff --git a/src/transformers/models/paligemma/processing_paligemma.py b/src/transformers/models/paligemma/processing_paligemma.py index 6bc95dc2fb8c65..77103a4eabbaf0 100644 --- a/src/transformers/models/paligemma/processing_paligemma.py +++ b/src/transformers/models/paligemma/processing_paligemma.py @@ -16,7 +16,6 @@ Processor class for PaliGemma. """ -import logging from typing import List, Optional, Union from ...feature_extraction_utils import BatchFeature @@ -34,9 +33,10 @@ PreTokenizedInput, TextInput, ) +from ...utils import logging -logger = logging.getLogger(__name__) +logger = logging.get_logger(__name__) IMAGE_TOKEN = "<image>" EXTRA_TOKENS = [f"<loc{i:0>4}>" for i in range(1024)] + [f"<seg{i:0>3}>" for i in range(128)] From a4122813d1309385f303f60b59a1f9944fada761 Mon Sep 17 00:00:00 2001 From: Yoni Gozlan <74535834+yonigozlan@users.noreply.github.com> Date: Mon, 21 Oct 2024 09:05:05 -0400 Subject: [PATCH 056/385] Add DetrImageProcessorFast (#34063) * add fully functionning image_processing_detr_fast * Create tensors on the correct device * fix copies * fix doc * add tests equivalence cpu gpu * fix doc en * add relative imports and copied from * Fix copies and nit --- docs/source/en/model_doc/detr.md | 9 + docs/source/ja/model_doc/detr.md | 9 + src/transformers/__init__.py | 4 +- src/transformers/image_transforms.py | 5 +- .../models/auto/image_processing_auto.py | 2 +- src/transformers/models/detr/__init__.py | 2 + .../models/detr/image_processing_detr_fast.py | 1546 +++++++++++++++++ src/transformers/utils/__init__.py | 1 + .../utils/dummy_vision_objects.py | 7 + src/transformers/utils/import_utils.py | 10 +- .../test_image_processing_conditional_detr.py | 457 ++--- .../test_image_processing_deformable_detr.py | 457 ++--- .../models/detr/test_image_processing_detr.py | 891 ++++++---- .../test_image_processing_grounding_dino.py | 457 ++--- .../yolos/test_image_processing_yolos.py | 89 +- tests/test_image_processing_common.py | 2 +- 16 files changed, 2840 insertions(+), 1108 deletions(-) create mode 100644 src/transformers/models/detr/image_processing_detr_fast.py diff --git a/docs/source/en/model_doc/detr.md b/docs/source/en/model_doc/detr.md index 0aeaf8e7693773..43c6e6d17e2f70 100644 --- a/docs/source/en/model_doc/detr.md +++ b/docs/source/en/model_doc/detr.md @@ -181,6 +181,15 @@ If you're interested in submitting a resource to be included here, please feel f - post_process_instance_segmentation - post_process_panoptic_segmentation +## DetrImageProcessorFast + +[[autodoc]] DetrImageProcessorFast + - preprocess + - 
post_process_object_detection + - post_process_semantic_segmentation + - post_process_instance_segmentation + - post_process_panoptic_segmentation + ## DetrFeatureExtractor [[autodoc]] DetrFeatureExtractor diff --git a/docs/source/ja/model_doc/detr.md b/docs/source/ja/model_doc/detr.md index 1b9e64eb5486ee..3342b123a01b83 100644 --- a/docs/source/ja/model_doc/detr.md +++ b/docs/source/ja/model_doc/detr.md @@ -184,6 +184,15 @@ DETR の使用を開始するのに役立つ公式 Hugging Face およびコミ - post_process_instance_segmentation - post_process_panoptic_segmentation +## DetrImageProcessorFast + +[[autodoc]] DetrImageProcessorFast + - preprocess + - post_process_object_detection + - post_process_semantic_segmentation + - post_process_instance_segmentation + - post_process_panoptic_segmentation + ## DetrFeatureExtractor [[autodoc]] DetrFeatureExtractor diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index a926a848c3b5f1..7f408859c539b0 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -1191,7 +1191,7 @@ _import_structure["models.deprecated.efficientformer"].append("EfficientFormerImageProcessor") _import_structure["models.deprecated.tvlt"].append("TvltImageProcessor") _import_structure["models.deprecated.vit_hybrid"].extend(["ViTHybridImageProcessor"]) - _import_structure["models.detr"].extend(["DetrFeatureExtractor", "DetrImageProcessor"]) + _import_structure["models.detr"].extend(["DetrFeatureExtractor", "DetrImageProcessor", "DetrImageProcessorFast"]) _import_structure["models.donut"].extend(["DonutFeatureExtractor", "DonutImageProcessor"]) _import_structure["models.dpt"].extend(["DPTFeatureExtractor", "DPTImageProcessor"]) _import_structure["models.efficientnet"].append("EfficientNetImageProcessor") @@ -6090,7 +6090,7 @@ from .models.deprecated.efficientformer import EfficientFormerImageProcessor from .models.deprecated.tvlt import TvltImageProcessor from .models.deprecated.vit_hybrid import ViTHybridImageProcessor - from .models.detr import DetrFeatureExtractor, DetrImageProcessor + from .models.detr import DetrFeatureExtractor, DetrImageProcessor, DetrImageProcessorFast from .models.donut import DonutFeatureExtractor, DonutImageProcessor from .models.dpt import DPTFeatureExtractor, DPTImageProcessor from .models.efficientnet import EfficientNetImageProcessor diff --git a/src/transformers/image_transforms.py b/src/transformers/image_transforms.py index 81e8d9185623aa..e7d3a5abb7a8db 100644 --- a/src/transformers/image_transforms.py +++ b/src/transformers/image_transforms.py @@ -32,6 +32,7 @@ is_tf_available, is_torch_available, is_torchvision_available, + is_torchvision_v2_available, is_vision_available, requires_backends, ) @@ -51,7 +52,9 @@ if is_flax_available(): import jax.numpy as jnp -if is_torchvision_available(): +if is_torchvision_v2_available(): + from torchvision.transforms.v2 import functional as F +elif is_torchvision_available(): from torchvision.transforms import functional as F diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py index ef40798484ef41..d181afeb2d4d0d 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -72,7 +72,7 @@ ("deit", ("DeiTImageProcessor",)), ("depth_anything", ("DPTImageProcessor",)), ("deta", ("DetaImageProcessor",)), - ("detr", ("DetrImageProcessor",)), + ("detr", ("DetrImageProcessor", "DetrImageProcessorFast")), ("dinat", ("ViTImageProcessor", "ViTImageProcessorFast")), 
("dinov2", ("BitImageProcessor",)), ("donut-swin", ("DonutImageProcessor",)), diff --git a/src/transformers/models/detr/__init__.py b/src/transformers/models/detr/__init__.py index 422fe98230be45..cc6687ff8bb4b4 100644 --- a/src/transformers/models/detr/__init__.py +++ b/src/transformers/models/detr/__init__.py @@ -27,6 +27,7 @@ else: _import_structure["feature_extraction_detr"] = ["DetrFeatureExtractor"] _import_structure["image_processing_detr"] = ["DetrImageProcessor"] + _import_structure["image_processing_detr_fast"] = ["DetrImageProcessorFast"] try: if not is_torch_available(): @@ -53,6 +54,7 @@ else: from .feature_extraction_detr import DetrFeatureExtractor from .image_processing_detr import DetrImageProcessor + from .image_processing_detr_fast import DetrImageProcessorFast try: if not is_torch_available(): diff --git a/src/transformers/models/detr/image_processing_detr_fast.py b/src/transformers/models/detr/image_processing_detr_fast.py new file mode 100644 index 00000000000000..97940ab3132dda --- /dev/null +++ b/src/transformers/models/detr/image_processing_detr_fast.py @@ -0,0 +1,1546 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Fast Image processor class for DETR.""" + +import functools +import io +import pathlib +from collections import defaultdict +from typing import Any, Dict, List, Optional, Set, Tuple, Union + +from ...image_processing_utils import BatchFeature, get_size_dict +from ...image_processing_utils_fast import BaseImageProcessorFast, SizeDict +from ...image_transforms import ( + center_to_corners_format, + corners_to_center_format, + id_to_rgb, +) +from ...image_utils import ( + IMAGENET_DEFAULT_MEAN, + IMAGENET_DEFAULT_STD, + AnnotationFormat, + AnnotationType, + ChannelDimension, + ImageInput, + ImageType, + PILImageResampling, + get_image_size, + get_image_type, + infer_channel_dimension_format, + make_list_of_images, + pil_torch_interpolation_mapping, + validate_annotations, + validate_kwargs, +) +from ...utils import ( + TensorType, + is_torch_available, + is_torchvision_available, + is_torchvision_v2_available, + is_vision_available, + logging, +) +from .image_processing_detr import ( + compute_segments, + convert_segmentation_to_rle, + get_size_with_aspect_ratio, + max_across_indices, + remove_low_and_no_objects, +) + + +if is_torch_available(): + import torch + from torch import nn + +if is_vision_available(): + import PIL + + +if is_torchvision_available(): + from torchvision.io import read_image + + from ...image_utils import pil_torch_interpolation_mapping + + if is_torchvision_v2_available(): + from torchvision.transforms.v2 import functional as F + else: + from torchvision.transforms import functional as F + + +logger = logging.get_logger(__name__) + +SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC) + + +def get_image_size_for_max_height_width( + image_size: Tuple[int, int], + max_height: int, + max_width: int, +) -> 
Tuple[int, int]: + """ + Computes the output image size given the input image and the maximum allowed height and width. Keep aspect ratio. + Important, even if image_height < max_height and image_width < max_width, the image will be resized + to at least one of the edges be equal to max_height or max_width. + + For example: + - input_size: (100, 200), max_height: 50, max_width: 50 -> output_size: (25, 50) + - input_size: (100, 200), max_height: 200, max_width: 500 -> output_size: (200, 400) + + Args: + image_size (`Tuple[int, int]`): + The image to resize. + max_height (`int`): + The maximum allowed height. + max_width (`int`): + The maximum allowed width. + """ + height, width = image_size + height_scale = max_height / height + width_scale = max_width / width + min_scale = min(height_scale, width_scale) + new_height = int(height * min_scale) + new_width = int(width * min_scale) + return new_height, new_width + + +def safe_squeeze(tensor: torch.Tensor, axis: Optional[int] = None) -> torch.Tensor: + """ + Squeezes a tensor, but only if the axis specified has dim 1. + """ + if axis is None: + return tensor.squeeze() + + try: + return tensor.squeeze(axis=axis) + except ValueError: + return tensor + + +def get_max_height_width(images: List[torch.Tensor]) -> Tuple[int]: + """ + Get the maximum height and width across all images in a batch. + """ + + _, max_height, max_width = max_across_indices([img.shape for img in images]) + + return (max_height, max_width) + + +# inspired by https://github.com/facebookresearch/detr/blob/master/datasets/coco.py#L33 +def convert_coco_poly_to_mask(segmentations, height: int, width: int, device: torch.device) -> torch.Tensor: + """ + Convert a COCO polygon annotation to a mask. + + Args: + segmentations (`List[List[float]]`): + List of polygons, each polygon represented by a list of x-y coordinates. + height (`int`): + Height of the mask. + width (`int`): + Width of the mask. + """ + try: + from pycocotools import mask as coco_mask + except ImportError: + raise ImportError("Pycocotools is not installed in your environment.") + + masks = [] + for polygons in segmentations: + rles = coco_mask.frPyObjects(polygons, height, width) + mask = coco_mask.decode(rles) + if len(mask.shape) < 3: + mask = mask[..., None] + mask = torch.as_tensor(mask, dtype=torch.uint8, device=device) + mask = torch.any(mask, axis=2) + masks.append(mask) + if masks: + masks = torch.stack(masks, axis=0) + else: + masks = torch.zeros((0, height, width), dtype=torch.uint8, device=device) + + return masks + + +# inspired by https://github.com/facebookresearch/detr/blob/master/datasets/coco.py#L50 +def prepare_coco_detection_annotation( + image, + target, + return_segmentation_masks: bool = False, + input_data_format: Optional[Union[ChannelDimension, str]] = None, +): + """ + Convert the target in COCO format into the format expected by DETR. + """ + image_height, image_width = image.size()[-2:] + + image_id = target["image_id"] + image_id = torch.as_tensor([image_id], dtype=torch.int64, device=image.device) + + # Get all COCO annotations for the given image. 
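    # Illustrative only (hypothetical values): `target` is expected to follow the COCO detection layout, e.g.
    #   {"image_id": 42,
    #    "annotations": [{"bbox": [10.0, 20.0, 30.0, 40.0], "category_id": 3, "area": 1200.0, "iscrowd": 0}]}
    # where "bbox" is [top_left_x, top_left_y, width, height]; the code below converts it to corner format
    # and clips it to the image bounds.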
+ annotations = target["annotations"] + annotations = [obj for obj in annotations if "iscrowd" not in obj or obj["iscrowd"] == 0] + + classes = [obj["category_id"] for obj in annotations] + classes = torch.as_tensor(classes, dtype=torch.int64, device=image.device) + + # for conversion to coco api + area = torch.as_tensor([obj["area"] for obj in annotations], dtype=torch.float32, device=image.device) + iscrowd = torch.as_tensor( + [obj["iscrowd"] if "iscrowd" in obj else 0 for obj in annotations], dtype=torch.int64, device=image.device + ) + + boxes = [obj["bbox"] for obj in annotations] + # guard against no boxes via resizing + boxes = torch.as_tensor(boxes, dtype=torch.float32, device=image.device).reshape(-1, 4) + boxes[:, 2:] += boxes[:, :2] + boxes[:, 0::2] = boxes[:, 0::2].clip(min=0, max=image_width) + boxes[:, 1::2] = boxes[:, 1::2].clip(min=0, max=image_height) + + keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0]) + + new_target = {} + new_target["image_id"] = image_id + new_target["class_labels"] = classes[keep] + new_target["boxes"] = boxes[keep] + new_target["area"] = area[keep] + new_target["iscrowd"] = iscrowd[keep] + new_target["orig_size"] = torch.as_tensor( + [int(image_height), int(image_width)], dtype=torch.int64, device=image.device + ) + + if annotations and "keypoints" in annotations[0]: + keypoints = [obj["keypoints"] for obj in annotations] + # Converting the filtered keypoints list to a numpy array + keypoints = torch.as_tensor(keypoints, dtype=torch.float32, device=image.device) + # Apply the keep mask here to filter the relevant annotations + keypoints = keypoints[keep] + num_keypoints = keypoints.shape[0] + keypoints = keypoints.reshape((-1, 3)) if num_keypoints else keypoints + new_target["keypoints"] = keypoints + + if return_segmentation_masks: + segmentation_masks = [obj["segmentation"] for obj in annotations] + masks = convert_coco_poly_to_mask(segmentation_masks, image_height, image_width, device=image.device) + new_target["masks"] = masks[keep] + + return new_target + + +def masks_to_boxes(masks: torch.Tensor) -> torch.Tensor: + """ + Compute the bounding boxes around the provided panoptic segmentation masks. + + Args: + masks: masks in format `[number_masks, height, width]` where N is the number of masks + + Returns: + boxes: bounding boxes in format `[number_masks, 4]` in xyxy format + """ + if masks.numel() == 0: + return torch.zeros((0, 4), device=masks.device) + + h, w = masks.shape[-2:] + y = torch.arange(0, h, dtype=torch.float32, device=masks.device) + x = torch.arange(0, w, dtype=torch.float32, device=masks.device) + # see https://github.com/pytorch/pytorch/issues/50276 + y, x = torch.meshgrid(y, x, indexing="ij") + + x_mask = masks * torch.unsqueeze(x, 0) + x_max = x_mask.view(x_mask.shape[0], -1).max(-1)[0] + x_min = ( + torch.where(masks, x.unsqueeze(0), torch.tensor(1e8, device=masks.device)).view(masks.shape[0], -1).min(-1)[0] + ) + + y_mask = masks * torch.unsqueeze(y, 0) + y_max = y_mask.view(y_mask.shape[0], -1).max(-1)[0] + y_min = ( + torch.where(masks, y.unsqueeze(0), torch.tensor(1e8, device=masks.device)).view(masks.shape[0], -1).min(-1)[0] + ) + + return torch.stack([x_min, y_min, x_max, y_max], 1) + + +# 2 functions below adapted from https://github.com/cocodataset/panopticapi/blob/master/panopticapi/utils.py +# Copyright (c) 2018, Alexander Kirillov +# All rights reserved. +def rgb_to_id(color): + """ + Converts RGB color to unique ID. 
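    For example, since the ID is computed as R + 256 * G + 256**2 * B, the RGB triple (10, 2, 0) maps to
    10 + 256 * 2 = 522.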
+ """ + if isinstance(color, torch.Tensor) and len(color.shape) == 3: + if color.dtype == torch.uint8: + color = color.to(torch.int32) + return color[:, :, 0] + 256 * color[:, :, 1] + 256 * 256 * color[:, :, 2] + return int(color[0] + 256 * color[1] + 256 * 256 * color[2]) + + +def prepare_coco_panoptic_annotation( + image: torch.Tensor, + target: Dict, + masks_path: Union[str, pathlib.Path], + return_masks: bool = True, + input_data_format: Union[ChannelDimension, str] = None, +) -> Dict: + """ + Prepare a coco panoptic annotation for DETR. + """ + image_height, image_width = get_image_size(image, channel_dim=input_data_format) + annotation_path = pathlib.Path(masks_path) / target["file_name"] + + new_target = {} + new_target["image_id"] = torch.as_tensor( + [target["image_id"] if "image_id" in target else target["id"]], dtype=torch.int64, device=image.device + ) + new_target["size"] = torch.as_tensor([image_height, image_width], dtype=torch.int64, device=image.device) + new_target["orig_size"] = torch.as_tensor([image_height, image_width], dtype=torch.int64, device=image.device) + + if "segments_info" in target: + masks = read_image(annotation_path).permute(1, 2, 0).to(torch.int32).to(image.device) + masks = rgb_to_id(masks) + + ids = torch.as_tensor([segment_info["id"] for segment_info in target["segments_info"]], device=image.device) + masks = masks == ids[:, None, None] + masks = masks.to(torch.bool) + if return_masks: + new_target["masks"] = masks + new_target["boxes"] = masks_to_boxes(masks) + new_target["class_labels"] = torch.as_tensor( + [segment_info["category_id"] for segment_info in target["segments_info"]], + dtype=torch.int64, + device=image.device, + ) + new_target["iscrowd"] = torch.as_tensor( + [segment_info["iscrowd"] for segment_info in target["segments_info"]], + dtype=torch.int64, + device=image.device, + ) + new_target["area"] = torch.as_tensor( + [segment_info["area"] for segment_info in target["segments_info"]], + dtype=torch.float32, + device=image.device, + ) + + return new_target + + +class DetrImageProcessorFast(BaseImageProcessorFast): + r""" + Constructs a fast Detr image processor. + + Args: + format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`): + Data format of the annotations. One of "coco_detection" or "coco_panoptic". + do_resize (`bool`, *optional*, defaults to `True`): + Controls whether to resize the image's `(height, width)` dimensions to the specified `size`. Can be + overridden by the `do_resize` parameter in the `preprocess` method. + size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 800, "longest_edge": 1333}`): + Size of the image's `(height, width)` dimensions after resizing. Can be overridden by the `size` parameter + in the `preprocess` method. Available options are: + - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`. + Do NOT keep the aspect ratio. + - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting + the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge + less or equal to `longest_edge`. + - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the + aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to + `max_width`. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): + Resampling filter to use if resizing the image. 
+ do_rescale (`bool`, *optional*, defaults to `True`): + Controls whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the + `do_rescale` parameter in the `preprocess` method. + rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): + Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the + `preprocess` method. + do_normalize (`bool`, *optional*, defaults to `True`): + Controls whether to normalize the image. Can be overridden by the `do_normalize` parameter in the + `preprocess` method. + image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_MEAN`): + Mean values to use when normalizing the image. Can be a single value or a list of values, one for each + channel. Can be overridden by the `image_mean` parameter in the `preprocess` method. + image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_STD`): + Standard deviation values to use when normalizing the image. Can be a single value or a list of values, one + for each channel. Can be overridden by the `image_std` parameter in the `preprocess` method. + do_convert_annotations (`bool`, *optional*, defaults to `True`): + Controls whether to convert the annotations to the format expected by the DETR model. Converts the + bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`. + Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method. + do_pad (`bool`, *optional*, defaults to `True`): + Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess` + method. If `True`, padding will be applied to the bottom and right of the image with zeros. + If `pad_size` is provided, the image will be padded to the specified dimensions. + Otherwise, the image will be padded to the maximum height and width of the batch. + pad_size (`Dict[str, int]`, *optional*): + The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size + provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest + height and width in the batch. + """ + + model_input_names = ["pixel_values", "pixel_mask"] + + def __init__( + self, + format: Union[str, AnnotationFormat] = AnnotationFormat.COCO_DETECTION, + do_resize: bool = True, + size: Dict[str, int] = None, + resample: [Union[PILImageResampling, F.InterpolationMode]] = PILImageResampling.BILINEAR, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, + do_normalize: bool = True, + image_mean: Union[float, List[float]] = None, + image_std: Union[float, List[float]] = None, + do_convert_annotations: Optional[bool] = None, + do_pad: bool = True, + pad_size: Optional[Dict[str, int]] = None, + **kwargs, + ) -> None: + if "pad_and_return_pixel_mask" in kwargs: + do_pad = kwargs.pop("pad_and_return_pixel_mask") + + if "max_size" in kwargs: + logger.warning_once( + "The `max_size` parameter is deprecated and will be removed in v4.26. 
" + "Please specify in `size['longest_edge'] instead`.", + ) + max_size = kwargs.pop("max_size") + else: + max_size = None if size is None else 1333 + + size = size if size is not None else {"shortest_edge": 800, "longest_edge": 1333} + size = get_size_dict(size, max_size=max_size, default_to_square=False) + + # Backwards compatibility + if do_convert_annotations is None: + do_convert_annotations = do_normalize + + super().__init__(**kwargs) + self.format = format + self.do_resize = do_resize + self.size = size + self.resample = resample + self.do_rescale = do_rescale + self.rescale_factor = rescale_factor + self.do_normalize = do_normalize + self.do_convert_annotations = do_convert_annotations + self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN + self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD + self.do_pad = do_pad + self.pad_size = pad_size + self._valid_processor_keys = [ + "images", + "annotations", + "return_segmentation_masks", + "masks_path", + "do_resize", + "size", + "resample", + "do_rescale", + "rescale_factor", + "do_normalize", + "do_convert_annotations", + "image_mean", + "image_std", + "do_pad", + "pad_size", + "format", + "return_tensors", + "data_format", + "input_data_format", + ] + + @classmethod + def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs): + """ + Overrides the `from_dict` method from the base class to make sure parameters are updated if image processor is + created using from_dict and kwargs e.g. `DetrImageProcessor.from_pretrained(checkpoint, size=600, + max_size=800)` + """ + image_processor_dict = image_processor_dict.copy() + if "max_size" in kwargs: + image_processor_dict["max_size"] = kwargs.pop("max_size") + if "pad_and_return_pixel_mask" in kwargs: + image_processor_dict["pad_and_return_pixel_mask"] = kwargs.pop("pad_and_return_pixel_mask") + return super().from_dict(image_processor_dict, **kwargs) + + def prepare_annotation( + self, + image: torch.Tensor, + target: Dict, + format: Optional[AnnotationFormat] = None, + return_segmentation_masks: bool = None, + masks_path: Optional[Union[str, pathlib.Path]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> Dict: + """ + Prepare an annotation for feeding into DETR model. + """ + format = format if format is not None else self.format + + if format == AnnotationFormat.COCO_DETECTION: + return_segmentation_masks = False if return_segmentation_masks is None else return_segmentation_masks + target = prepare_coco_detection_annotation( + image, target, return_segmentation_masks, input_data_format=input_data_format + ) + elif format == AnnotationFormat.COCO_PANOPTIC: + return_segmentation_masks = True if return_segmentation_masks is None else return_segmentation_masks + target = prepare_coco_panoptic_annotation( + image, + target, + masks_path=masks_path, + return_masks=return_segmentation_masks, + input_data_format=input_data_format, + ) + else: + raise ValueError(f"Format {format} is not supported.") + return target + + def resize( + self, + image: torch.Tensor, + size: SizeDict, + interpolation: F.InterpolationMode = F.InterpolationMode.BILINEAR, + **kwargs, + ) -> torch.Tensor: + """ + Resize the image to the given size. Size can be `min_size` (scalar) or `(height, width)` tuple. If size is an + int, smaller edge of the image will be matched to this number. + + Args: + image (`torch.Tensor`): + Image to resize. + size (`SizeDict`): + Size of the image's `(height, width)` dimensions after resizing. 
Available options are: + - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`. + Do NOT keep the aspect ratio. + - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting + the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge + less or equal to `longest_edge`. + - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the + aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to + `max_width`. + interpolation (`InterpolationMode`, *optional*, defaults to `InterpolationMode.BILINEAR`): + Resampling filter to use if resizing the image. + """ + if size.shortest_edge and size.longest_edge: + # Resize the image so that the shortest edge or the longest edge is of the given size + # while maintaining the aspect ratio of the original image. + new_size = get_size_with_aspect_ratio( + image.size()[-2:], + size["shortest_edge"], + size["longest_edge"], + ) + elif size.max_height and size.max_width: + new_size = get_image_size_for_max_height_width(image.size()[-2:], size["max_height"], size["max_width"]) + elif size.height and size.width: + new_size = (size["height"], size["width"]) + else: + raise ValueError( + "Size must contain 'height' and 'width' keys or 'shortest_edge' and 'longest_edge' keys. Got" + f" {size.keys()}." + ) + + image = F.resize( + image, + size=new_size, + interpolation=interpolation, + **kwargs, + ) + return image + + def resize_annotation( + self, + annotation: Dict[str, Any], + orig_size: Tuple[int, int], + target_size: Tuple[int, int], + threshold: float = 0.5, + interpolation: F.InterpolationMode = F.InterpolationMode.NEAREST, + ): + """ + Resizes an annotation to a target size. + + Args: + annotation (`Dict[str, Any]`): + The annotation dictionary. + orig_size (`Tuple[int, int]`): + The original size of the input image. + target_size (`Tuple[int, int]`): + The target size of the image, as returned by the preprocessing `resize` step. + threshold (`float`, *optional*, defaults to 0.5): + The threshold used to binarize the segmentation masks. + resample (`InterpolationMode`, defaults to `InterpolationMode.NEAREST`): + The resampling filter to use when resizing the masks. 
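        As a worked example, going from an `orig_size` of (400, 600) to a `target_size` of (200, 300) scales
        box coordinates by 0.5 and areas by 0.25.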
+ """ + ratio_height, ratio_width = [target / orig for target, orig in zip(target_size, orig_size)] + + new_annotation = {} + new_annotation["size"] = target_size + + for key, value in annotation.items(): + if key == "boxes": + boxes = value + scaled_boxes = boxes * torch.as_tensor( + [ratio_width, ratio_height, ratio_width, ratio_height], dtype=torch.float32, device=boxes.device + ) + new_annotation["boxes"] = scaled_boxes + elif key == "area": + area = value + scaled_area = area * (ratio_width * ratio_height) + new_annotation["area"] = scaled_area + elif key == "masks": + masks = value[:, None] + masks = [F.resize(mask, target_size, interpolation=interpolation) for mask in masks] + masks = torch.stack(masks).to(torch.float32) + masks = masks[:, 0] > threshold + new_annotation["masks"] = masks + elif key == "size": + new_annotation["size"] = target_size + else: + new_annotation[key] = value + + return new_annotation + + def normalize_annotation(self, annotation: Dict, image_size: Tuple[int, int]) -> Dict: + image_height, image_width = image_size + norm_annotation = {} + for key, value in annotation.items(): + if key == "boxes": + boxes = value + boxes = corners_to_center_format(boxes) + boxes /= torch.as_tensor( + [image_width, image_height, image_width, image_height], dtype=torch.float32, device=boxes.device + ) + norm_annotation[key] = boxes + else: + norm_annotation[key] = value + return norm_annotation + + def _update_annotation_for_padded_image( + self, + annotation: Dict, + input_image_size: Tuple[int, int], + output_image_size: Tuple[int, int], + padding, + update_bboxes, + ) -> Dict: + """ + Update the annotation for a padded image. + """ + new_annotation = {} + new_annotation["size"] = output_image_size + ratio_height, ratio_width = (input / output for output, input in zip(output_image_size, input_image_size)) + + for key, value in annotation.items(): + if key == "masks": + masks = value + masks = F.pad( + masks, + padding, + fill=0, + ) + masks = safe_squeeze(masks, 1) + new_annotation["masks"] = masks + elif key == "boxes" and update_bboxes: + boxes = value + boxes *= torch.as_tensor([ratio_width, ratio_height, ratio_width, ratio_height], device=boxes.device) + new_annotation["boxes"] = boxes + elif key == "size": + new_annotation["size"] = output_image_size + else: + new_annotation[key] = value + return new_annotation + + def pad( + self, + image: torch.Tensor, + padded_size: Tuple[int, int], + annotation: Optional[Dict[str, Any]] = None, + update_bboxes: bool = True, + fill: int = 0, + ): + original_size = image.size()[-2:] + padding_bottom = padded_size[0] - original_size[0] + padding_right = padded_size[1] - original_size[1] + if padding_bottom < 0 or padding_right < 0: + raise ValueError( + f"Padding dimensions are negative. Please make sure that the padded size is larger than the " + f"original size. Got padded size: {padded_size}, original size: {original_size}." + ) + if original_size != padded_size: + padding = [0, 0, padding_right, padding_bottom] + image = F.pad(image, padding, fill=fill) + if annotation is not None: + annotation = self._update_annotation_for_padded_image( + annotation, original_size, padded_size, padding, update_bboxes + ) + + # Make a pixel mask for the image, where 1 indicates a valid pixel and 0 indicates padding. 
+ pixel_mask = torch.zeros(padded_size, dtype=torch.int64, device=image.device) + pixel_mask[: original_size[0], : original_size[1]] = 1 + + return image, pixel_mask, annotation + + @functools.lru_cache(maxsize=1) + def _validate_input_arguments( + self, + do_rescale: bool, + rescale_factor: float, + do_normalize: bool, + image_mean: Union[float, List[float]], + image_std: Union[float, List[float]], + do_resize: bool, + size: Dict[str, int], + resample: "PILImageResampling", + data_format: Union[str, ChannelDimension], + return_tensors: Union[TensorType, str], + ): + if return_tensors != "pt": + raise ValueError("Only returning PyTorch tensors is currently supported.") + + if data_format != ChannelDimension.FIRST: + raise ValueError("Only channel first data format is currently supported.") + + if do_resize and None in (size, resample): + raise ValueError("Size and resample must be specified if do_resize is True.") + + if do_rescale and rescale_factor is None: + raise ValueError("Rescale factor must be specified if do_rescale is True.") + + if do_normalize and None in (image_mean, image_std): + raise ValueError("Image mean and standard deviation must be specified if do_normalize is True.") + + def preprocess( + self, + images: ImageInput, + annotations: Optional[Union[AnnotationType, List[AnnotationType]]] = None, + return_segmentation_masks: bool = None, + masks_path: Optional[Union[str, pathlib.Path]] = None, + do_resize: Optional[bool] = None, + size: Optional[Dict[str, int]] = None, + resample: Optional[Union[PILImageResampling, F.InterpolationMode]] = None, + do_rescale: Optional[bool] = None, + rescale_factor: Optional[Union[int, float]] = None, + do_normalize: Optional[bool] = None, + do_convert_annotations: Optional[bool] = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_pad: Optional[bool] = None, + format: Optional[Union[str, AnnotationFormat]] = None, + return_tensors: Optional[Union[TensorType, str]] = None, + data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + pad_size: Optional[Dict[str, int]] = None, + **kwargs, + ) -> BatchFeature: + """ + Preprocess an image or a batch of images so that it can be used by the model. + + Args: + images (`ImageInput`): + Image or batch of images to preprocess. Expects a single or batch of images with pixel values ranging + from 0 to 255. If passing in images with pixel values between 0 and 1, set `do_rescale=False`. + annotations (`AnnotationType` or `List[AnnotationType]`, *optional*): + List of annotations associated with the image or batch of images. If annotation is for object + detection, the annotations should be a dictionary with the following keys: + - "image_id" (`int`): The image id. + - "annotations" (`List[Dict]`): List of annotations for an image. Each annotation should be a + dictionary. An image can have no annotations, in which case the list should be empty. + If annotation is for segmentation, the annotations should be a dictionary with the following keys: + - "image_id" (`int`): The image id. + - "segments_info" (`List[Dict]`): List of segments for an image. Each segment should be a dictionary. + An image can have no segments, in which case the list should be empty. + - "file_name" (`str`): The file name of the image. + return_segmentation_masks (`bool`, *optional*, defaults to self.return_segmentation_masks): + Whether to return segmentation masks. 
+ masks_path (`str` or `pathlib.Path`, *optional*): + Path to the directory containing the segmentation masks. + do_resize (`bool`, *optional*, defaults to self.do_resize): + Whether to resize the image. + size (`Dict[str, int]`, *optional*, defaults to self.size): + Size of the image's `(height, width)` dimensions after resizing. Available options are: + - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`. + Do NOT keep the aspect ratio. + - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting + the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge + less or equal to `longest_edge`. + - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the + aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to + `max_width`. + resample (`PILImageResampling` or `InterpolationMode`, *optional*, defaults to self.resample): + Resampling filter to use when resizing the image. + do_rescale (`bool`, *optional*, defaults to self.do_rescale): + Whether to rescale the image. + rescale_factor (`float`, *optional*, defaults to self.rescale_factor): + Rescale factor to use when rescaling the image. + do_normalize (`bool`, *optional*, defaults to self.do_normalize): + Whether to normalize the image. + do_convert_annotations (`bool`, *optional*, defaults to self.do_convert_annotations): + Whether to convert the annotations to the format expected by the model. Converts the bounding + boxes from the format `(top_left_x, top_left_y, width, height)` to `(center_x, center_y, width, height)` + and in relative coordinates. + image_mean (`float` or `List[float]`, *optional*, defaults to self.image_mean): + Mean to use when normalizing the image. + image_std (`float` or `List[float]`, *optional*, defaults to self.image_std): + Standard deviation to use when normalizing the image. + do_pad (`bool`, *optional*, defaults to self.do_pad): + Whether to pad the image. If `True`, padding will be applied to the bottom and right of + the image with zeros. If `pad_size` is provided, the image will be padded to the specified + dimensions. Otherwise, the image will be padded to the maximum height and width of the batch. + format (`str` or `AnnotationFormat`, *optional*, defaults to self.format): + Format of the annotations. + return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors): + Type of tensors to return. If `None`, will return the list of images. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - Unset: Use the channel dimension format of the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. 
+ pad_size (`Dict[str, int]`, *optional*): + The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size + provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest + height and width in the batch. + """ + if "pad_and_return_pixel_mask" in kwargs: + logger.warning_once( + "The `pad_and_return_pixel_mask` argument is deprecated and will be removed in a future version, " + "use `do_pad` instead." + ) + do_pad = kwargs.pop("pad_and_return_pixel_mask") + + if "max_size" in kwargs: + logger.warning_once( + "The `max_size` argument is deprecated and will be removed in a future version, use" + " `size['longest_edge']` instead." + ) + size = kwargs.pop("max_size") + do_resize = self.do_resize if do_resize is None else do_resize + size = self.size if size is None else size + size = get_size_dict(size=size, default_to_square=False) + resample = self.resample if resample is None else resample + do_rescale = self.do_rescale if do_rescale is None else do_rescale + rescale_factor = self.rescale_factor if rescale_factor is None else rescale_factor + do_normalize = self.do_normalize if do_normalize is None else do_normalize + image_mean = self.image_mean if image_mean is None else image_mean + image_std = self.image_std if image_std is None else image_std + do_convert_annotations = ( + self.do_convert_annotations if do_convert_annotations is None else do_convert_annotations + ) + do_pad = self.do_pad if do_pad is None else do_pad + pad_size = self.pad_size if pad_size is None else pad_size + format = self.format if format is None else format + device = kwargs.pop("device", None) + + # Make hashable for cache + size = SizeDict(**size) + image_mean = tuple(image_mean) if isinstance(image_mean, list) else image_mean + image_std = tuple(image_std) if isinstance(image_std, list) else image_std + + images = make_list_of_images(images) + image_type = get_image_type(images[0]) + + if image_type not in [ImageType.PIL, ImageType.TORCH, ImageType.NUMPY]: + raise ValueError(f"Unsupported input image type {image_type}") + validate_kwargs(captured_kwargs=kwargs.keys(), valid_processor_keys=self._valid_processor_keys) + + self._validate_input_arguments( + do_rescale=do_rescale, + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + do_resize=do_resize, + size=size, + resample=resample, + return_tensors=return_tensors, + data_format=data_format, + ) + + if annotations is not None and isinstance(annotations, dict): + annotations = [annotations] + + if annotations is not None and len(images) != len(annotations): + raise ValueError( + f"The number of images ({len(images)}) and annotations ({len(annotations)}) do not match." + ) + + format = AnnotationFormat(format) + if annotations is not None: + validate_annotations(format, SUPPORTED_ANNOTATION_FORMATS, annotations) + + if ( + masks_path is not None + and format == AnnotationFormat.COCO_PANOPTIC + and not isinstance(masks_path, (pathlib.Path, str)) + ): + raise ValueError( + "The path to the directory containing the mask PNG files should be provided as a" + f" `pathlib.Path` or string object, but is {type(masks_path)} instead." 
+ ) + + data = {} + if image_type == ImageType.PIL: + images = [F.pil_to_tensor(image) for image in images] + elif image_type == ImageType.NUMPY: + # not using F.to_tensor as it doesn't handle (C, H, W) numpy arrays + images = [torch.from_numpy(image).contiguous() for image in images] + + if device is not None: + images = [image.to(device) for image in images] + + # We assume that all images have the same channel dimension format. + if input_data_format is None: + input_data_format = infer_channel_dimension_format(images[0]) + if input_data_format == ChannelDimension.LAST: + images = [image.permute(2, 0, 1).contiguous() for image in images] + + # prepare (COCO annotations as a list of Dict -> DETR target as a single Dict per image) + if annotations is not None: + prepared_images = [] + prepared_annotations = [] + for image, target in zip(images, annotations): + target = self.prepare_annotation( + image, + target, + format, + return_segmentation_masks=return_segmentation_masks, + masks_path=masks_path, + input_data_format=input_data_format, + ) + prepared_images.append(image) + prepared_annotations.append(target) + images = prepared_images + annotations = prepared_annotations + del prepared_images, prepared_annotations + + if do_resize: + if isinstance(resample, (PILImageResampling, int)): + interpolation = pil_torch_interpolation_mapping[resample] + else: + interpolation = resample + resized_images = [self.resize(image, size=size, interpolation=interpolation) for image in images] + if annotations is not None: + for i, (image, target) in enumerate(zip(resized_images, annotations)): + annotations[i] = self.resize_annotation( + target, + orig_size=images[i].size()[-2:], + target_size=image.size()[-2:], + ) + images = resized_images + del resized_images + + if do_rescale and do_normalize: + # fused rescale and normalize + new_mean = torch.tensor(image_mean, device=images[0].device) * (1.0 / rescale_factor) + new_std = torch.tensor(image_std, device=images[0].device) * (1.0 / rescale_factor) + images = [F.normalize(image.to(dtype=torch.float32), new_mean, new_std) for image in images] + elif do_rescale: + images = [image * rescale_factor for image in images] + elif do_normalize: + images = [F.normalize(image, image_mean, image_std) for image in images] + + if do_convert_annotations and annotations is not None: + annotations = [ + self.normalize_annotation(annotation, get_image_size(image, input_data_format)) + for annotation, image in zip(annotations, images) + ] + + if do_pad: + # Pads images and returns their mask: {'pixel_values': ..., 'pixel_mask': ...} + if pad_size is not None: + padded_size = (pad_size["height"], pad_size["width"]) + else: + padded_size = get_max_height_width(images) + + annotation_list = annotations if annotations is not None else [None] * len(images) + padded_images = [] + pixel_masks = [] + padded_annotations = [] + for image, annotation in zip(images, annotation_list): + if padded_size == image.size()[-2:]: + padded_images.append(image) + pixel_masks.append(torch.ones(padded_size, dtype=torch.int64, device=image.device)) + padded_annotations.append(annotation) + continue + padded_image, pixel_mask, padded_annotation = self.pad( + image, padded_size, annotation=annotation, update_bboxes=do_convert_annotations + ) + padded_images.append(padded_image) + pixel_masks.append(pixel_mask) + padded_annotations.append(padded_annotation) + images = padded_images + if annotations is not None: + annotations = padded_annotations + del padded_images, padded_annotations + 
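+            # Stack the per-image masks collected above into a single (batch_size, height, width) tensor.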
data.update({"pixel_mask": torch.stack(pixel_masks, dim=0)}) + + data.update({"pixel_values": torch.stack(images, dim=0)}) + encoded_inputs = BatchFeature(data, tensor_type=return_tensors) + if annotations is not None: + encoded_inputs["labels"] = [ + BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations + ] + return encoded_inputs + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.post_process + def post_process(self, outputs, target_sizes): + """ + Converts the raw output of [`DetrForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y, + bottom_right_x, bottom_right_y) format. Only supports PyTorch. + + Args: + outputs ([`DetrObjectDetectionOutput`]): + Raw outputs of the model. + target_sizes (`torch.Tensor` of shape `(batch_size, 2)`): + Tensor containing the size (height, width) of each image of the batch. For evaluation, this must be the + original image size (before any data augmentation). For visualization, this should be the image size + after data augment, but before padding. + Returns: + `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image + in the batch as predicted by the model. + """ + logger.warning_once( + "`post_process` is deprecated and will be removed in v5 of Transformers, please use" + " `post_process_object_detection` instead, with `threshold=0.` for equivalent results.", + ) + + out_logits, out_bbox = outputs.logits, outputs.pred_boxes + + if len(out_logits) != len(target_sizes): + raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the logits") + if target_sizes.shape[1] != 2: + raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch") + + prob = nn.functional.softmax(out_logits, -1) + scores, labels = prob[..., :-1].max(-1) + + # convert to [x0, y0, x1, y1] format + boxes = center_to_corners_format(out_bbox) + # and from relative [0, 1] to absolute [0, height] coordinates + img_h, img_w = target_sizes.unbind(1) + scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device) + boxes = boxes * scale_fct[:, None, :] + + results = [{"scores": s, "labels": l, "boxes": b} for s, l, b in zip(scores, labels, boxes)] + return results + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.post_process_segmentation + def post_process_segmentation(self, outputs, target_sizes, threshold=0.9, mask_threshold=0.5): + """ + Converts the output of [`DetrForSegmentation`] into image segmentation predictions. Only supports PyTorch. + + Args: + outputs ([`DetrSegmentationOutput`]): + Raw outputs of the model. + target_sizes (`torch.Tensor` of shape `(batch_size, 2)` or `List[Tuple]` of length `batch_size`): + Torch Tensor (or list) corresponding to the requested final size (h, w) of each prediction. + threshold (`float`, *optional*, defaults to 0.9): + Threshold to use to filter out queries. + mask_threshold (`float`, *optional*, defaults to 0.5): + Threshold to use when turning the predicted masks into binary values. + Returns: + `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels, and masks for an image + in the batch as predicted by the model. 
+ """ + logger.warning_once( + "`post_process_segmentation` is deprecated and will be removed in v5 of Transformers, please use" + " `post_process_semantic_segmentation`.", + ) + out_logits, raw_masks = outputs.logits, outputs.pred_masks + empty_label = out_logits.shape[-1] - 1 + preds = [] + + def to_tuple(tup): + if isinstance(tup, tuple): + return tup + return tuple(tup.cpu().tolist()) + + for cur_logits, cur_masks, size in zip(out_logits, raw_masks, target_sizes): + # we filter empty queries and detection below threshold + cur_scores, cur_labels = cur_logits.softmax(-1).max(-1) + keep = cur_labels.ne(empty_label) & (cur_scores > threshold) + cur_scores = cur_scores[keep] + cur_labels = cur_labels[keep] + cur_masks = cur_masks[keep] + cur_masks = nn.functional.interpolate(cur_masks[:, None], to_tuple(size), mode="bilinear").squeeze(1) + cur_masks = (cur_masks.sigmoid() > mask_threshold) * 1 + + predictions = {"scores": cur_scores, "labels": cur_labels, "masks": cur_masks} + preds.append(predictions) + return preds + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.post_process_instance + def post_process_instance(self, results, outputs, orig_target_sizes, max_target_sizes, threshold=0.5): + """ + Converts the output of [`DetrForSegmentation`] into actual instance segmentation predictions. Only supports + PyTorch. + + Args: + results (`List[Dict]`): + Results list obtained by [`~DetrImageProcessor.post_process`], to which "masks" results will be added. + outputs ([`DetrSegmentationOutput`]): + Raw outputs of the model. + orig_target_sizes (`torch.Tensor` of shape `(batch_size, 2)`): + Tensor containing the size (h, w) of each image of the batch. For evaluation, this must be the original + image size (before any data augmentation). + max_target_sizes (`torch.Tensor` of shape `(batch_size, 2)`): + Tensor containing the maximum size (h, w) of each image of the batch. For evaluation, this must be the + original image size (before any data augmentation). + threshold (`float`, *optional*, defaults to 0.5): + Threshold to use when turning the predicted masks into binary values. + Returns: + `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels, boxes and masks for an + image in the batch as predicted by the model. 
+ """ + logger.warning_once( + "`post_process_instance` is deprecated and will be removed in v5 of Transformers, please use" + " `post_process_instance_segmentation`.", + ) + + if len(orig_target_sizes) != len(max_target_sizes): + raise ValueError("Make sure to pass in as many orig_target_sizes as max_target_sizes") + max_h, max_w = max_target_sizes.max(0)[0].tolist() + outputs_masks = outputs.pred_masks.squeeze(2) + outputs_masks = nn.functional.interpolate( + outputs_masks, size=(max_h, max_w), mode="bilinear", align_corners=False + ) + outputs_masks = (outputs_masks.sigmoid() > threshold).cpu() + + for i, (cur_mask, t, tt) in enumerate(zip(outputs_masks, max_target_sizes, orig_target_sizes)): + img_h, img_w = t[0], t[1] + results[i]["masks"] = cur_mask[:, :img_h, :img_w].unsqueeze(1) + results[i]["masks"] = nn.functional.interpolate( + results[i]["masks"].float(), size=tuple(tt.tolist()), mode="nearest" + ).byte() + + return results + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.post_process_panoptic + def post_process_panoptic(self, outputs, processed_sizes, target_sizes=None, is_thing_map=None, threshold=0.85): + """ + Converts the output of [`DetrForSegmentation`] into actual panoptic predictions. Only supports PyTorch. + + Args: + outputs ([`DetrSegmentationOutput`]): + Raw outputs of the model. + processed_sizes (`torch.Tensor` of shape `(batch_size, 2)` or `List[Tuple]` of length `batch_size`): + Torch Tensor (or list) containing the size (h, w) of each image of the batch, i.e. the size after data + augmentation but before batching. + target_sizes (`torch.Tensor` of shape `(batch_size, 2)` or `List[Tuple]` of length `batch_size`, *optional*): + Torch Tensor (or list) corresponding to the requested final size `(height, width)` of each prediction. + If left to None, it will default to the `processed_sizes`. + is_thing_map (`torch.Tensor` of shape `(batch_size, 2)`, *optional*): + Dictionary mapping class indices to either True or False, depending on whether or not they are a thing. + If not set, defaults to the `is_thing_map` of COCO panoptic. + threshold (`float`, *optional*, defaults to 0.85): + Threshold to use to filter out queries. + Returns: + `List[Dict]`: A list of dictionaries, each dictionary containing a PNG string and segments_info values for + an image in the batch as predicted by the model. 
+ """ + logger.warning_once( + "`post_process_panoptic is deprecated and will be removed in v5 of Transformers, please use" + " `post_process_panoptic_segmentation`.", + ) + if target_sizes is None: + target_sizes = processed_sizes + if len(processed_sizes) != len(target_sizes): + raise ValueError("Make sure to pass in as many processed_sizes as target_sizes") + + if is_thing_map is None: + # default to is_thing_map of COCO panoptic + is_thing_map = {i: i <= 90 for i in range(201)} + + out_logits, raw_masks, raw_boxes = outputs.logits, outputs.pred_masks, outputs.pred_boxes + if not len(out_logits) == len(raw_masks) == len(target_sizes): + raise ValueError( + "Make sure that you pass in as many target sizes as the batch dimension of the logits and masks" + ) + empty_label = out_logits.shape[-1] - 1 + preds = [] + + def to_tuple(tup): + if isinstance(tup, tuple): + return tup + return tuple(tup.cpu().tolist()) + + for cur_logits, cur_masks, cur_boxes, size, target_size in zip( + out_logits, raw_masks, raw_boxes, processed_sizes, target_sizes + ): + # we filter empty queries and detection below threshold + cur_scores, cur_labels = cur_logits.softmax(-1).max(-1) + keep = cur_labels.ne(empty_label) & (cur_scores > threshold) + cur_scores = cur_scores[keep] + cur_labels = cur_labels[keep] + cur_masks = cur_masks[keep] + cur_masks = nn.functional.interpolate(cur_masks[:, None], to_tuple(size), mode="bilinear").squeeze(1) + cur_boxes = center_to_corners_format(cur_boxes[keep]) + + h, w = cur_masks.shape[-2:] + if len(cur_boxes) != len(cur_labels): + raise ValueError("Not as many boxes as there are classes") + + # It may be that we have several predicted masks for the same stuff class. + # In the following, we track the list of masks ids for each stuff class (they are merged later on) + cur_masks = cur_masks.flatten(1) + stuff_equiv_classes = defaultdict(lambda: []) + for k, label in enumerate(cur_labels): + if not is_thing_map[label.item()]: + stuff_equiv_classes[label.item()].append(k) + + def get_ids_area(masks, scores, dedup=False): + # This helper function creates the final panoptic segmentation image + # It also returns the area of the masks that appears on the image + + m_id = masks.transpose(0, 1).softmax(-1) + + if m_id.shape[-1] == 0: + # We didn't detect any mask :( + m_id = torch.zeros((h, w), dtype=torch.long, device=m_id.device) + else: + m_id = m_id.argmax(-1).view(h, w) + + if dedup: + # Merge the masks corresponding to the same stuff class + for equiv in stuff_equiv_classes.values(): + if len(equiv) > 1: + for eq_id in equiv: + m_id.masked_fill_(m_id.eq(eq_id), equiv[0]) + + final_h, final_w = to_tuple(target_size) + + seg_img = PIL.Image.fromarray(id_to_rgb(m_id.view(h, w).cpu().numpy())) + seg_img = seg_img.resize(size=(final_w, final_h), resample=PILImageResampling.NEAREST) + + np_seg_img = torch.ByteTensor(torch.ByteStorage.from_buffer(seg_img.tobytes())) + np_seg_img = np_seg_img.view(final_h, final_w, 3) + np_seg_img = np_seg_img.numpy() + + m_id = torch.from_numpy(rgb_to_id(np_seg_img)) + + area = [] + for i in range(len(scores)): + area.append(m_id.eq(i).sum().item()) + return area, seg_img + + area, seg_img = get_ids_area(cur_masks, cur_scores, dedup=True) + if cur_labels.numel() > 0: + # We know filter empty masks as long as we find some + while True: + filtered_small = torch.as_tensor( + [area[i] <= 4 for i, c in enumerate(cur_labels)], dtype=torch.bool, device=keep.device + ) + if filtered_small.any().item(): + cur_scores = cur_scores[~filtered_small] + cur_labels = 
cur_labels[~filtered_small] + cur_masks = cur_masks[~filtered_small] + area, seg_img = get_ids_area(cur_masks, cur_scores) + else: + break + + else: + cur_labels = torch.ones(1, dtype=torch.long, device=cur_labels.device) + + segments_info = [] + for i, a in enumerate(area): + cat = cur_labels[i].item() + segments_info.append({"id": i, "isthing": is_thing_map[cat], "category_id": cat, "area": a}) + del cur_labels + + with io.BytesIO() as out: + seg_img.save(out, format="PNG") + predictions = {"png_string": out.getvalue(), "segments_info": segments_info} + preds.append(predictions) + return preds + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.post_process_object_detection + def post_process_object_detection( + self, outputs, threshold: float = 0.5, target_sizes: Union[TensorType, List[Tuple]] = None + ): + """ + Converts the raw output of [`DetrForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y, + bottom_right_x, bottom_right_y) format. Only supports PyTorch. + + Args: + outputs ([`DetrObjectDetectionOutput`]): + Raw outputs of the model. + threshold (`float`, *optional*): + Score threshold to keep object detection predictions. + target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`, *optional*): + Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size + `(height, width)` of each image in the batch. If unset, predictions will not be resized. + Returns: + `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image + in the batch as predicted by the model. + """ + out_logits, out_bbox = outputs.logits, outputs.pred_boxes + + if target_sizes is not None: + if len(out_logits) != len(target_sizes): + raise ValueError( + "Make sure that you pass in as many target sizes as the batch dimension of the logits" + ) + + prob = nn.functional.softmax(out_logits, -1) + scores, labels = prob[..., :-1].max(-1) + + # Convert to [x0, y0, x1, y1] format + boxes = center_to_corners_format(out_bbox) + + # Convert from relative [0, 1] to absolute [0, height] coordinates + if target_sizes is not None: + if isinstance(target_sizes, List): + img_h = torch.Tensor([i[0] for i in target_sizes]) + img_w = torch.Tensor([i[1] for i in target_sizes]) + else: + img_h, img_w = target_sizes.unbind(1) + + scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device) + boxes = boxes * scale_fct[:, None, :] + + results = [] + for s, l, b in zip(scores, labels, boxes): + score = s[s > threshold] + label = l[s > threshold] + box = b[s > threshold] + results.append({"scores": score, "labels": label, "boxes": box}) + + return results + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.post_process_semantic_segmentation + def post_process_semantic_segmentation(self, outputs, target_sizes: List[Tuple[int, int]] = None): + """ + Converts the output of [`DetrForSegmentation`] into semantic segmentation maps. Only supports PyTorch. + + Args: + outputs ([`DetrForSegmentation`]): + Raw outputs of the model. + target_sizes (`List[Tuple[int, int]]`, *optional*): + A list of tuples (`Tuple[int, int]`) containing the target size (height, width) of each image in the + batch. If unset, predictions will not be resized. + Returns: + `List[torch.Tensor]`: + A list of length `batch_size`, where each item is a semantic segmentation map of shape (height, width) + corresponding to the target_sizes entry (if `target_sizes` is specified). 
Each entry of each
+                `torch.Tensor` corresponds to a semantic class id.
+        """
+        class_queries_logits = outputs.logits  # [batch_size, num_queries, num_classes+1]
+        masks_queries_logits = outputs.pred_masks  # [batch_size, num_queries, height, width]
+
+        # Remove the null class `[..., :-1]`
+        masks_classes = class_queries_logits.softmax(dim=-1)[..., :-1]
+        masks_probs = masks_queries_logits.sigmoid()  # [batch_size, num_queries, height, width]
+
+        # Semantic segmentation logits of shape (batch_size, num_classes, height, width)
+        segmentation = torch.einsum("bqc, bqhw -> bchw", masks_classes, masks_probs)
+        batch_size = class_queries_logits.shape[0]
+
+        # Resize logits and compute semantic segmentation maps
+        if target_sizes is not None:
+            if batch_size != len(target_sizes):
+                raise ValueError(
+                    "Make sure that you pass in as many target sizes as the batch dimension of the logits"
+                )
+
+            semantic_segmentation = []
+            for idx in range(batch_size):
+                resized_logits = nn.functional.interpolate(
+                    segmentation[idx].unsqueeze(dim=0), size=target_sizes[idx], mode="bilinear", align_corners=False
+                )
+                semantic_map = resized_logits[0].argmax(dim=0)
+                semantic_segmentation.append(semantic_map)
+        else:
+            semantic_segmentation = segmentation.argmax(dim=1)
+            semantic_segmentation = [semantic_segmentation[i] for i in range(semantic_segmentation.shape[0])]
+
+        return semantic_segmentation
+
+    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.post_process_instance_segmentation
+    def post_process_instance_segmentation(
+        self,
+        outputs,
+        threshold: float = 0.5,
+        mask_threshold: float = 0.5,
+        overlap_mask_area_threshold: float = 0.8,
+        target_sizes: Optional[List[Tuple[int, int]]] = None,
+        return_coco_annotation: Optional[bool] = False,
+    ) -> List[Dict]:
+        """
+        Converts the output of [`DetrForSegmentation`] into instance segmentation predictions. Only supports PyTorch.
+
+        Args:
+            outputs ([`DetrForSegmentation`]):
+                Raw outputs of the model.
+            threshold (`float`, *optional*, defaults to 0.5):
+                The probability score threshold to keep predicted instance masks.
+            mask_threshold (`float`, *optional*, defaults to 0.5):
+                Threshold to use when turning the predicted masks into binary values.
+            overlap_mask_area_threshold (`float`, *optional*, defaults to 0.8):
+                The overlap mask area threshold to merge or discard small disconnected parts within each binary
+                instance mask.
+            target_sizes (`List[Tuple]`, *optional*):
+                List of length (batch_size), where each list item (`Tuple[int, int]`) corresponds to the requested
+                final size (height, width) of each prediction. If unset, predictions will not be resized.
+            return_coco_annotation (`bool`, *optional*):
+                Defaults to `False`. If set to `True`, segmentation maps are returned in COCO run-length encoding (RLE)
+                format.
+        Returns:
+            `List[Dict]`: A list of dictionaries, one per image, each dictionary containing two keys:
+            - **segmentation** -- A tensor of shape `(height, width)` where each pixel represents a `segment_id` or
+              `List[List]` run-length encoding (RLE) of the segmentation map if return_coco_annotation is set to
+              `True`. Set to `None` if no mask is found above `threshold`.
+            - **segments_info** -- A dictionary that contains additional information on each segment.
+                - **id** -- An integer representing the `segment_id`.
+                - **label_id** -- An integer representing the label / semantic class id corresponding to `segment_id`.
+                - **score** -- Prediction score of segment with `segment_id`.
+ """ + class_queries_logits = outputs.logits # [batch_size, num_queries, num_classes+1] + masks_queries_logits = outputs.pred_masks # [batch_size, num_queries, height, width] + + batch_size = class_queries_logits.shape[0] + num_labels = class_queries_logits.shape[-1] - 1 + + mask_probs = masks_queries_logits.sigmoid() # [batch_size, num_queries, height, width] + + # Predicted label and score of each query (batch_size, num_queries) + pred_scores, pred_labels = nn.functional.softmax(class_queries_logits, dim=-1).max(-1) + + # Loop over items in batch size + results: List[Dict[str, TensorType]] = [] + + for i in range(batch_size): + mask_probs_item, pred_scores_item, pred_labels_item = remove_low_and_no_objects( + mask_probs[i], pred_scores[i], pred_labels[i], threshold, num_labels + ) + + # No mask found + if mask_probs_item.shape[0] <= 0: + height, width = target_sizes[i] if target_sizes is not None else mask_probs_item.shape[1:] + segmentation = torch.zeros((height, width)) - 1 + results.append({"segmentation": segmentation, "segments_info": []}) + continue + + # Get segmentation map and segment information of batch item + target_size = target_sizes[i] if target_sizes is not None else None + segmentation, segments = compute_segments( + mask_probs=mask_probs_item, + pred_scores=pred_scores_item, + pred_labels=pred_labels_item, + mask_threshold=mask_threshold, + overlap_mask_area_threshold=overlap_mask_area_threshold, + label_ids_to_fuse=[], + target_size=target_size, + ) + + # Return segmentation map in run-length encoding (RLE) format + if return_coco_annotation: + segmentation = convert_segmentation_to_rle(segmentation) + + results.append({"segmentation": segmentation, "segments_info": segments}) + return results + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.post_process_panoptic_segmentation + def post_process_panoptic_segmentation( + self, + outputs, + threshold: float = 0.5, + mask_threshold: float = 0.5, + overlap_mask_area_threshold: float = 0.8, + label_ids_to_fuse: Optional[Set[int]] = None, + target_sizes: Optional[List[Tuple[int, int]]] = None, + ) -> List[Dict]: + """ + Converts the output of [`DetrForSegmentation`] into image panoptic segmentation predictions. Only supports + PyTorch. + + Args: + outputs ([`DetrForSegmentation`]): + The outputs from [`DetrForSegmentation`]. + threshold (`float`, *optional*, defaults to 0.5): + The probability score threshold to keep predicted instance masks. + mask_threshold (`float`, *optional*, defaults to 0.5): + Threshold to use when turning the predicted masks into binary values. + overlap_mask_area_threshold (`float`, *optional*, defaults to 0.8): + The overlap mask area threshold to merge or discard small disconnected parts within each binary + instance mask. + label_ids_to_fuse (`Set[int]`, *optional*): + The labels in this state will have all their instances be fused together. For instance we could say + there can only be one sky in an image, but several persons, so the label ID for sky would be in that + set, but not the one for person. + target_sizes (`List[Tuple]`, *optional*): + List of length (batch_size), where each list item (`Tuple[int, int]]`) corresponds to the requested + final size (height, width) of each prediction in batch. If unset, predictions will not be resized. 
+ Returns: + `List[Dict]`: A list of dictionaries, one per image, each dictionary containing two keys: + - **segmentation** -- a tensor of shape `(height, width)` where each pixel represents a `segment_id` or + `None` if no mask if found above `threshold`. If `target_sizes` is specified, segmentation is resized to + the corresponding `target_sizes` entry. + - **segments_info** -- A dictionary that contains additional information on each segment. + - **id** -- an integer representing the `segment_id`. + - **label_id** -- An integer representing the label / semantic class id corresponding to `segment_id`. + - **was_fused** -- a boolean, `True` if `label_id` was in `label_ids_to_fuse`, `False` otherwise. + Multiple instances of the same class / label were fused and assigned a single `segment_id`. + - **score** -- Prediction score of segment with `segment_id`. + """ + + if label_ids_to_fuse is None: + logger.warning_once("`label_ids_to_fuse` unset. No instance will be fused.") + label_ids_to_fuse = set() + + class_queries_logits = outputs.logits # [batch_size, num_queries, num_classes+1] + masks_queries_logits = outputs.pred_masks # [batch_size, num_queries, height, width] + + batch_size = class_queries_logits.shape[0] + num_labels = class_queries_logits.shape[-1] - 1 + + mask_probs = masks_queries_logits.sigmoid() # [batch_size, num_queries, height, width] + + # Predicted label and score of each query (batch_size, num_queries) + pred_scores, pred_labels = nn.functional.softmax(class_queries_logits, dim=-1).max(-1) + + # Loop over items in batch size + results: List[Dict[str, TensorType]] = [] + + for i in range(batch_size): + mask_probs_item, pred_scores_item, pred_labels_item = remove_low_and_no_objects( + mask_probs[i], pred_scores[i], pred_labels[i], threshold, num_labels + ) + + # No mask found + if mask_probs_item.shape[0] <= 0: + height, width = target_sizes[i] if target_sizes is not None else mask_probs_item.shape[1:] + segmentation = torch.zeros((height, width)) - 1 + results.append({"segmentation": segmentation, "segments_info": []}) + continue + + # Get segmentation map and segment information of batch item + target_size = target_sizes[i] if target_sizes is not None else None + segmentation, segments = compute_segments( + mask_probs=mask_probs_item, + pred_scores=pred_scores_item, + pred_labels=pred_labels_item, + mask_threshold=mask_threshold, + overlap_mask_area_threshold=overlap_mask_area_threshold, + label_ids_to_fuse=label_ids_to_fuse, + target_size=target_size, + ) + + results.append({"segmentation": segmentation, "segments_info": segments}) + return results diff --git a/src/transformers/utils/__init__.py b/src/transformers/utils/__init__.py index 2876eef9ea02df..a781389c2fbdc8 100755 --- a/src/transformers/utils/__init__.py +++ b/src/transformers/utils/__init__.py @@ -225,6 +225,7 @@ is_torchdynamo_available, is_torchdynamo_compiling, is_torchvision_available, + is_torchvision_v2_available, is_training_run_on_sagemaker, is_uroman_available, is_vision_available, diff --git a/src/transformers/utils/dummy_vision_objects.py b/src/transformers/utils/dummy_vision_objects.py index d2ccaeaaed23a8..d7f87717ca834a 100644 --- a/src/transformers/utils/dummy_vision_objects.py +++ b/src/transformers/utils/dummy_vision_objects.py @@ -191,6 +191,13 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["vision"]) +class DetrImageProcessorFast(metaclass=DummyObject): + _backends = ["vision"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["vision"]) + + class 
DonutFeatureExtractor(metaclass=DummyObject): _backends = ["vision"] diff --git a/src/transformers/utils/import_utils.py b/src/transformers/utils/import_utils.py index ed95e4b5570e68..173aee9b1ac739 100755 --- a/src/transformers/utils/import_utils.py +++ b/src/transformers/utils/import_utils.py @@ -186,7 +186,7 @@ def _is_package_available(pkg_name: str, return_version: bool = False) -> Union[ _torchaudio_available = _is_package_available("torchaudio") _torchao_available = _is_package_available("torchao") _torchdistx_available = _is_package_available("torchdistx") -_torchvision_available = _is_package_available("torchvision") +_torchvision_available, _torchvision_version = _is_package_available("torchvision", return_version=True) _mlx_available = _is_package_available("mlx") _hqq_available, _hqq_version = _is_package_available("hqq", return_version=True) _tiktoken_available = _is_package_available("tiktoken") @@ -362,6 +362,14 @@ def is_torchvision_available(): return _torchvision_available +def is_torchvision_v2_available(): + if not is_torchvision_available(): + return False + + # NOTE: We require torchvision>=0.15 as v2 transforms are available from this version: https://pytorch.org/vision/stable/transforms.html#v1-or-v2-which-one-should-i-use + return version.parse(_torchvision_version) >= version.parse("0.15") + + def is_galore_torch_available(): return _galore_torch_available diff --git a/tests/models/conditional_detr/test_image_processing_conditional_detr.py b/tests/models/conditional_detr/test_image_processing_conditional_detr.py index 99a06613e141bb..32b135bcd220bd 100644 --- a/tests/models/conditional_detr/test_image_processing_conditional_detr.py +++ b/tests/models/conditional_detr/test_image_processing_conditional_detr.py @@ -282,96 +282,97 @@ def test_batched_coco_detection_annotations(self): images = [image_0, image_1] annotations = [annotations_0, annotations_1] - image_processing = ConditionalDetrImageProcessor() - encoding = image_processing( - images=images, - annotations=annotations, - return_segmentation_masks=True, - return_tensors="pt", # do_convert_annotations=True - ) - - # Check the pixel values have been padded - postprocessed_height, postprocessed_width = 800, 1066 - expected_shape = torch.Size([2, 3, postprocessed_height, postprocessed_width]) - self.assertEqual(encoding["pixel_values"].shape, expected_shape) - - # Check the bounding boxes have been adjusted for padded images - self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4])) - self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4])) - expected_boxes_0 = torch.tensor( - [ - [0.6879, 0.4609, 0.0755, 0.3691], - [0.2118, 0.3359, 0.2601, 0.1566], - [0.5011, 0.5000, 0.9979, 1.0000], - [0.5010, 0.5020, 0.9979, 0.9959], - [0.3284, 0.5944, 0.5884, 0.8112], - [0.8394, 0.5445, 0.3213, 0.9110], - ] - ) - expected_boxes_1 = torch.tensor( - [ - [0.4130, 0.2765, 0.0453, 0.2215], - [0.1272, 0.2016, 0.1561, 0.0940], - [0.3757, 0.4933, 0.7488, 0.9865], - [0.3759, 0.5002, 0.7492, 0.9955], - [0.1971, 0.5456, 0.3532, 0.8646], - [0.5790, 0.4115, 0.3430, 0.7161], - ] - ) - self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1e-3)) - self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1e-3)) - - # Check the masks have also been padded - self.assertEqual(encoding["labels"][0]["masks"].shape, torch.Size([6, 800, 1066])) - self.assertEqual(encoding["labels"][1]["masks"].shape, torch.Size([6, 800, 1066])) - - # Check if 
do_convert_annotations=False, then the annotations are not converted to centre_x, centre_y, width, height - # format and not in the range [0, 1] - encoding = image_processing( - images=images, - annotations=annotations, - return_segmentation_masks=True, - do_convert_annotations=False, - return_tensors="pt", - ) - self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4])) - self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4])) - # Convert to absolute coordinates - unnormalized_boxes_0 = torch.vstack( - [ - expected_boxes_0[:, 0] * postprocessed_width, - expected_boxes_0[:, 1] * postprocessed_height, - expected_boxes_0[:, 2] * postprocessed_width, - expected_boxes_0[:, 3] * postprocessed_height, - ] - ).T - unnormalized_boxes_1 = torch.vstack( - [ - expected_boxes_1[:, 0] * postprocessed_width, - expected_boxes_1[:, 1] * postprocessed_height, - expected_boxes_1[:, 2] * postprocessed_width, - expected_boxes_1[:, 3] * postprocessed_height, - ] - ).T - # Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max - expected_boxes_0 = torch.vstack( - [ - unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2, - unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2, - unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2, - unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2, - ] - ).T - expected_boxes_1 = torch.vstack( - [ - unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2, - unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2, - unnormalized_boxes_1[:, 0] + unnormalized_boxes_1[:, 2] / 2, - unnormalized_boxes_1[:, 1] + unnormalized_boxes_1[:, 3] / 2, - ] - ).T - self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1)) - self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1)) + for image_processing_class in self.image_processor_list: + image_processing = image_processing_class() + encoding = image_processing( + images=images, + annotations=annotations, + return_segmentation_masks=True, + return_tensors="pt", # do_convert_annotations=True + ) + + # Check the pixel values have been padded + postprocessed_height, postprocessed_width = 800, 1066 + expected_shape = torch.Size([2, 3, postprocessed_height, postprocessed_width]) + self.assertEqual(encoding["pixel_values"].shape, expected_shape) + + # Check the bounding boxes have been adjusted for padded images + self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4])) + self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4])) + expected_boxes_0 = torch.tensor( + [ + [0.6879, 0.4609, 0.0755, 0.3691], + [0.2118, 0.3359, 0.2601, 0.1566], + [0.5011, 0.5000, 0.9979, 1.0000], + [0.5010, 0.5020, 0.9979, 0.9959], + [0.3284, 0.5944, 0.5884, 0.8112], + [0.8394, 0.5445, 0.3213, 0.9110], + ] + ) + expected_boxes_1 = torch.tensor( + [ + [0.4130, 0.2765, 0.0453, 0.2215], + [0.1272, 0.2016, 0.1561, 0.0940], + [0.3757, 0.4933, 0.7488, 0.9865], + [0.3759, 0.5002, 0.7492, 0.9955], + [0.1971, 0.5456, 0.3532, 0.8646], + [0.5790, 0.4115, 0.3430, 0.7161], + ] + ) + self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1e-3)) + self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1e-3)) + + # Check the masks have also been padded + self.assertEqual(encoding["labels"][0]["masks"].shape, torch.Size([6, 800, 1066])) + self.assertEqual(encoding["labels"][1]["masks"].shape, torch.Size([6, 800, 1066])) + + # Check if 
do_convert_annotations=False, then the annotations are not converted to centre_x, centre_y, width, height + # format and not in the range [0, 1] + encoding = image_processing( + images=images, + annotations=annotations, + return_segmentation_masks=True, + do_convert_annotations=False, + return_tensors="pt", + ) + self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4])) + self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4])) + # Convert to absolute coordinates + unnormalized_boxes_0 = torch.vstack( + [ + expected_boxes_0[:, 0] * postprocessed_width, + expected_boxes_0[:, 1] * postprocessed_height, + expected_boxes_0[:, 2] * postprocessed_width, + expected_boxes_0[:, 3] * postprocessed_height, + ] + ).T + unnormalized_boxes_1 = torch.vstack( + [ + expected_boxes_1[:, 0] * postprocessed_width, + expected_boxes_1[:, 1] * postprocessed_height, + expected_boxes_1[:, 2] * postprocessed_width, + expected_boxes_1[:, 3] * postprocessed_height, + ] + ).T + # Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max + expected_boxes_0 = torch.vstack( + [ + unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2, + unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2, + unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2, + unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2, + ] + ).T + expected_boxes_1 = torch.vstack( + [ + unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2, + unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2, + unnormalized_boxes_1[:, 0] + unnormalized_boxes_1[:, 2] / 2, + unnormalized_boxes_1[:, 1] + unnormalized_boxes_1[:, 3] / 2, + ] + ).T + self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1)) + self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1)) # Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_batched_coco_panoptic_annotations with Detr->ConditionalDetr def test_batched_coco_panoptic_annotations(self): @@ -402,146 +403,148 @@ def test_batched_coco_panoptic_annotations(self): images = [image_0, image_1] annotations = [annotation_0, annotation_1] - # encode them - image_processing = ConditionalDetrImageProcessor(format="coco_panoptic") - encoding = image_processing( - images=images, - annotations=annotations, - masks_path=masks_path, - return_tensors="pt", - return_segmentation_masks=True, - ) - - # Check the pixel values have been padded - postprocessed_height, postprocessed_width = 800, 1066 - expected_shape = torch.Size([2, 3, postprocessed_height, postprocessed_width]) - self.assertEqual(encoding["pixel_values"].shape, expected_shape) - - # Check the bounding boxes have been adjusted for padded images - self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4])) - self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4])) - expected_boxes_0 = torch.tensor( - [ - [0.2625, 0.5437, 0.4688, 0.8625], - [0.7719, 0.4104, 0.4531, 0.7125], - [0.5000, 0.4927, 0.9969, 0.9854], - [0.1688, 0.2000, 0.2063, 0.0917], - [0.5492, 0.2760, 0.0578, 0.2187], - [0.4992, 0.4990, 0.9984, 0.9979], - ] - ) - expected_boxes_1 = torch.tensor( - [ - [0.1576, 0.3262, 0.2814, 0.5175], - [0.4634, 0.2463, 0.2720, 0.4275], - [0.3002, 0.2956, 0.5985, 0.5913], - [0.1013, 0.1200, 0.1238, 0.0550], - [0.3297, 0.1656, 0.0347, 0.1312], - [0.2997, 0.2994, 0.5994, 0.5987], - ] - ) - self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1e-3)) - 
self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1e-3)) - - # Check the masks have also been padded - self.assertEqual(encoding["labels"][0]["masks"].shape, torch.Size([6, 800, 1066])) - self.assertEqual(encoding["labels"][1]["masks"].shape, torch.Size([6, 800, 1066])) - - # Check if do_convert_annotations=False, then the annotations are not converted to centre_x, centre_y, width, height - # format and not in the range [0, 1] - encoding = image_processing( - images=images, - annotations=annotations, - masks_path=masks_path, - return_segmentation_masks=True, - do_convert_annotations=False, - return_tensors="pt", - ) - self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4])) - self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4])) - # Convert to absolute coordinates - unnormalized_boxes_0 = torch.vstack( - [ - expected_boxes_0[:, 0] * postprocessed_width, - expected_boxes_0[:, 1] * postprocessed_height, - expected_boxes_0[:, 2] * postprocessed_width, - expected_boxes_0[:, 3] * postprocessed_height, - ] - ).T - unnormalized_boxes_1 = torch.vstack( - [ - expected_boxes_1[:, 0] * postprocessed_width, - expected_boxes_1[:, 1] * postprocessed_height, - expected_boxes_1[:, 2] * postprocessed_width, - expected_boxes_1[:, 3] * postprocessed_height, - ] - ).T - # Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max - expected_boxes_0 = torch.vstack( - [ - unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2, - unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2, - unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2, - unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2, - ] - ).T - expected_boxes_1 = torch.vstack( - [ - unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2, - unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2, - unnormalized_boxes_1[:, 0] + unnormalized_boxes_1[:, 2] / 2, - unnormalized_boxes_1[:, 1] + unnormalized_boxes_1[:, 3] / 2, - ] - ).T - self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1)) - self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1)) + for image_processing_class in self.image_processor_list: + # encode them + image_processing = image_processing_class(format="coco_panoptic") + encoding = image_processing( + images=images, + annotations=annotations, + masks_path=masks_path, + return_tensors="pt", + return_segmentation_masks=True, + ) + + # Check the pixel values have been padded + postprocessed_height, postprocessed_width = 800, 1066 + expected_shape = torch.Size([2, 3, postprocessed_height, postprocessed_width]) + self.assertEqual(encoding["pixel_values"].shape, expected_shape) + + # Check the bounding boxes have been adjusted for padded images + self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4])) + self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4])) + expected_boxes_0 = torch.tensor( + [ + [0.2625, 0.5437, 0.4688, 0.8625], + [0.7719, 0.4104, 0.4531, 0.7125], + [0.5000, 0.4927, 0.9969, 0.9854], + [0.1688, 0.2000, 0.2063, 0.0917], + [0.5492, 0.2760, 0.0578, 0.2187], + [0.4992, 0.4990, 0.9984, 0.9979], + ] + ) + expected_boxes_1 = torch.tensor( + [ + [0.1576, 0.3262, 0.2814, 0.5175], + [0.4634, 0.2463, 0.2720, 0.4275], + [0.3002, 0.2956, 0.5985, 0.5913], + [0.1013, 0.1200, 0.1238, 0.0550], + [0.3297, 0.1656, 0.0347, 0.1312], + [0.2997, 0.2994, 0.5994, 0.5987], + ] + ) + 
self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1e-3)) + self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1e-3)) + + # Check the masks have also been padded + self.assertEqual(encoding["labels"][0]["masks"].shape, torch.Size([6, 800, 1066])) + self.assertEqual(encoding["labels"][1]["masks"].shape, torch.Size([6, 800, 1066])) + + # Check if do_convert_annotations=False, then the annotations are not converted to centre_x, centre_y, width, height + # format and not in the range [0, 1] + encoding = image_processing( + images=images, + annotations=annotations, + masks_path=masks_path, + return_segmentation_masks=True, + do_convert_annotations=False, + return_tensors="pt", + ) + self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4])) + self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4])) + # Convert to absolute coordinates + unnormalized_boxes_0 = torch.vstack( + [ + expected_boxes_0[:, 0] * postprocessed_width, + expected_boxes_0[:, 1] * postprocessed_height, + expected_boxes_0[:, 2] * postprocessed_width, + expected_boxes_0[:, 3] * postprocessed_height, + ] + ).T + unnormalized_boxes_1 = torch.vstack( + [ + expected_boxes_1[:, 0] * postprocessed_width, + expected_boxes_1[:, 1] * postprocessed_height, + expected_boxes_1[:, 2] * postprocessed_width, + expected_boxes_1[:, 3] * postprocessed_height, + ] + ).T + # Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max + expected_boxes_0 = torch.vstack( + [ + unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2, + unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2, + unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2, + unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2, + ] + ).T + expected_boxes_1 = torch.vstack( + [ + unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2, + unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2, + unnormalized_boxes_1[:, 0] + unnormalized_boxes_1[:, 2] / 2, + unnormalized_boxes_1[:, 1] + unnormalized_boxes_1[:, 3] / 2, + ] + ).T + self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1)) + self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1)) # Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_max_width_max_height_resizing_and_pad_strategy with Detr->ConditionalDetr def test_max_width_max_height_resizing_and_pad_strategy(self): - image_1 = torch.ones([200, 100, 3], dtype=torch.uint8) - - # do_pad=False, max_height=100, max_width=100, image=200x100 -> 100x50 - image_processor = ConditionalDetrImageProcessor( - size={"max_height": 100, "max_width": 100}, - do_pad=False, - ) - inputs = image_processor(images=[image_1], return_tensors="pt") - self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 50])) - - # do_pad=False, max_height=300, max_width=100, image=200x100 -> 200x100 - image_processor = ConditionalDetrImageProcessor( - size={"max_height": 300, "max_width": 100}, - do_pad=False, - ) - inputs = image_processor(images=[image_1], return_tensors="pt") - - # do_pad=True, max_height=100, max_width=100, image=200x100 -> 100x100 - image_processor = ConditionalDetrImageProcessor( - size={"max_height": 100, "max_width": 100}, do_pad=True, pad_size={"height": 100, "width": 100} - ) - inputs = image_processor(images=[image_1], return_tensors="pt") - self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 
100])) - - # do_pad=True, max_height=300, max_width=100, image=200x100 -> 300x100 - image_processor = ConditionalDetrImageProcessor( - size={"max_height": 300, "max_width": 100}, - do_pad=True, - pad_size={"height": 301, "width": 101}, - ) - inputs = image_processor(images=[image_1], return_tensors="pt") - self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 301, 101])) - - ### Check for batch - image_2 = torch.ones([100, 150, 3], dtype=torch.uint8) - - # do_pad=True, max_height=150, max_width=100, images=[200x100, 100x150] -> 150x100 - image_processor = ConditionalDetrImageProcessor( - size={"max_height": 150, "max_width": 100}, - do_pad=True, - pad_size={"height": 150, "width": 100}, - ) - inputs = image_processor(images=[image_1, image_2], return_tensors="pt") - self.assertEqual(inputs["pixel_values"].shape, torch.Size([2, 3, 150, 100])) + for image_processing_class in self.image_processor_list: + image_1 = torch.ones([200, 100, 3], dtype=torch.uint8) + + # do_pad=False, max_height=100, max_width=100, image=200x100 -> 100x50 + image_processor = image_processing_class( + size={"max_height": 100, "max_width": 100}, + do_pad=False, + ) + inputs = image_processor(images=[image_1], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 50])) + + # do_pad=False, max_height=300, max_width=100, image=200x100 -> 200x100 + image_processor = image_processing_class( + size={"max_height": 300, "max_width": 100}, + do_pad=False, + ) + inputs = image_processor(images=[image_1], return_tensors="pt") + + # do_pad=True, max_height=100, max_width=100, image=200x100 -> 100x100 + image_processor = image_processing_class( + size={"max_height": 100, "max_width": 100}, do_pad=True, pad_size={"height": 100, "width": 100} + ) + inputs = image_processor(images=[image_1], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 100])) + + # do_pad=True, max_height=300, max_width=100, image=200x100 -> 300x100 + image_processor = image_processing_class( + size={"max_height": 300, "max_width": 100}, + do_pad=True, + pad_size={"height": 301, "width": 101}, + ) + inputs = image_processor(images=[image_1], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 301, 101])) + + ### Check for batch + image_2 = torch.ones([100, 150, 3], dtype=torch.uint8) + + # do_pad=True, max_height=150, max_width=100, images=[200x100, 100x150] -> 150x100 + image_processor = image_processing_class( + size={"max_height": 150, "max_width": 100}, + do_pad=True, + pad_size={"height": 150, "width": 100}, + ) + inputs = image_processor(images=[image_1, image_2], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([2, 3, 150, 100])) def test_longest_edge_shortest_edge_resizing_strategy(self): image_1 = torch.ones([958, 653, 3], dtype=torch.uint8) diff --git a/tests/models/deformable_detr/test_image_processing_deformable_detr.py b/tests/models/deformable_detr/test_image_processing_deformable_detr.py index 41e5a81e2f93c0..29dd0556afcde1 100644 --- a/tests/models/deformable_detr/test_image_processing_deformable_detr.py +++ b/tests/models/deformable_detr/test_image_processing_deformable_detr.py @@ -284,96 +284,97 @@ def test_batched_coco_detection_annotations(self): images = [image_0, image_1] annotations = [annotations_0, annotations_1] - image_processing = DeformableDetrImageProcessor() - encoding = image_processing( - images=images, - annotations=annotations, - return_segmentation_masks=True, - 
return_tensors="pt", # do_convert_annotations=True - ) - - # Check the pixel values have been padded - postprocessed_height, postprocessed_width = 800, 1066 - expected_shape = torch.Size([2, 3, postprocessed_height, postprocessed_width]) - self.assertEqual(encoding["pixel_values"].shape, expected_shape) - - # Check the bounding boxes have been adjusted for padded images - self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4])) - self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4])) - expected_boxes_0 = torch.tensor( - [ - [0.6879, 0.4609, 0.0755, 0.3691], - [0.2118, 0.3359, 0.2601, 0.1566], - [0.5011, 0.5000, 0.9979, 1.0000], - [0.5010, 0.5020, 0.9979, 0.9959], - [0.3284, 0.5944, 0.5884, 0.8112], - [0.8394, 0.5445, 0.3213, 0.9110], - ] - ) - expected_boxes_1 = torch.tensor( - [ - [0.4130, 0.2765, 0.0453, 0.2215], - [0.1272, 0.2016, 0.1561, 0.0940], - [0.3757, 0.4933, 0.7488, 0.9865], - [0.3759, 0.5002, 0.7492, 0.9955], - [0.1971, 0.5456, 0.3532, 0.8646], - [0.5790, 0.4115, 0.3430, 0.7161], - ] - ) - self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1e-3)) - self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1e-3)) - - # Check the masks have also been padded - self.assertEqual(encoding["labels"][0]["masks"].shape, torch.Size([6, 800, 1066])) - self.assertEqual(encoding["labels"][1]["masks"].shape, torch.Size([6, 800, 1066])) - - # Check if do_convert_annotations=False, then the annotations are not converted to centre_x, centre_y, width, height - # format and not in the range [0, 1] - encoding = image_processing( - images=images, - annotations=annotations, - return_segmentation_masks=True, - do_convert_annotations=False, - return_tensors="pt", - ) - self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4])) - self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4])) - # Convert to absolute coordinates - unnormalized_boxes_0 = torch.vstack( - [ - expected_boxes_0[:, 0] * postprocessed_width, - expected_boxes_0[:, 1] * postprocessed_height, - expected_boxes_0[:, 2] * postprocessed_width, - expected_boxes_0[:, 3] * postprocessed_height, - ] - ).T - unnormalized_boxes_1 = torch.vstack( - [ - expected_boxes_1[:, 0] * postprocessed_width, - expected_boxes_1[:, 1] * postprocessed_height, - expected_boxes_1[:, 2] * postprocessed_width, - expected_boxes_1[:, 3] * postprocessed_height, - ] - ).T - # Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max - expected_boxes_0 = torch.vstack( - [ - unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2, - unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2, - unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2, - unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2, - ] - ).T - expected_boxes_1 = torch.vstack( - [ - unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2, - unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2, - unnormalized_boxes_1[:, 0] + unnormalized_boxes_1[:, 2] / 2, - unnormalized_boxes_1[:, 1] + unnormalized_boxes_1[:, 3] / 2, - ] - ).T - self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1)) - self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1)) + for image_processing_class in self.image_processor_list: + image_processing = image_processing_class() + encoding = image_processing( + images=images, + annotations=annotations, + return_segmentation_masks=True, + 
return_tensors="pt", # do_convert_annotations=True + ) + + # Check the pixel values have been padded + postprocessed_height, postprocessed_width = 800, 1066 + expected_shape = torch.Size([2, 3, postprocessed_height, postprocessed_width]) + self.assertEqual(encoding["pixel_values"].shape, expected_shape) + + # Check the bounding boxes have been adjusted for padded images + self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4])) + self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4])) + expected_boxes_0 = torch.tensor( + [ + [0.6879, 0.4609, 0.0755, 0.3691], + [0.2118, 0.3359, 0.2601, 0.1566], + [0.5011, 0.5000, 0.9979, 1.0000], + [0.5010, 0.5020, 0.9979, 0.9959], + [0.3284, 0.5944, 0.5884, 0.8112], + [0.8394, 0.5445, 0.3213, 0.9110], + ] + ) + expected_boxes_1 = torch.tensor( + [ + [0.4130, 0.2765, 0.0453, 0.2215], + [0.1272, 0.2016, 0.1561, 0.0940], + [0.3757, 0.4933, 0.7488, 0.9865], + [0.3759, 0.5002, 0.7492, 0.9955], + [0.1971, 0.5456, 0.3532, 0.8646], + [0.5790, 0.4115, 0.3430, 0.7161], + ] + ) + self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1e-3)) + self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1e-3)) + + # Check the masks have also been padded + self.assertEqual(encoding["labels"][0]["masks"].shape, torch.Size([6, 800, 1066])) + self.assertEqual(encoding["labels"][1]["masks"].shape, torch.Size([6, 800, 1066])) + + # Check if do_convert_annotations=False, then the annotations are not converted to centre_x, centre_y, width, height + # format and not in the range [0, 1] + encoding = image_processing( + images=images, + annotations=annotations, + return_segmentation_masks=True, + do_convert_annotations=False, + return_tensors="pt", + ) + self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4])) + self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4])) + # Convert to absolute coordinates + unnormalized_boxes_0 = torch.vstack( + [ + expected_boxes_0[:, 0] * postprocessed_width, + expected_boxes_0[:, 1] * postprocessed_height, + expected_boxes_0[:, 2] * postprocessed_width, + expected_boxes_0[:, 3] * postprocessed_height, + ] + ).T + unnormalized_boxes_1 = torch.vstack( + [ + expected_boxes_1[:, 0] * postprocessed_width, + expected_boxes_1[:, 1] * postprocessed_height, + expected_boxes_1[:, 2] * postprocessed_width, + expected_boxes_1[:, 3] * postprocessed_height, + ] + ).T + # Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max + expected_boxes_0 = torch.vstack( + [ + unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2, + unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2, + unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2, + unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2, + ] + ).T + expected_boxes_1 = torch.vstack( + [ + unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2, + unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2, + unnormalized_boxes_1[:, 0] + unnormalized_boxes_1[:, 2] / 2, + unnormalized_boxes_1[:, 1] + unnormalized_boxes_1[:, 3] / 2, + ] + ).T + self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1)) + self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1)) # Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_batched_coco_panoptic_annotations with Detr->DeformableDetr def test_batched_coco_panoptic_annotations(self): @@ -404,146 +405,148 
@@ def test_batched_coco_panoptic_annotations(self): images = [image_0, image_1] annotations = [annotation_0, annotation_1] - # encode them - image_processing = DeformableDetrImageProcessor(format="coco_panoptic") - encoding = image_processing( - images=images, - annotations=annotations, - masks_path=masks_path, - return_tensors="pt", - return_segmentation_masks=True, - ) - - # Check the pixel values have been padded - postprocessed_height, postprocessed_width = 800, 1066 - expected_shape = torch.Size([2, 3, postprocessed_height, postprocessed_width]) - self.assertEqual(encoding["pixel_values"].shape, expected_shape) - - # Check the bounding boxes have been adjusted for padded images - self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4])) - self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4])) - expected_boxes_0 = torch.tensor( - [ - [0.2625, 0.5437, 0.4688, 0.8625], - [0.7719, 0.4104, 0.4531, 0.7125], - [0.5000, 0.4927, 0.9969, 0.9854], - [0.1688, 0.2000, 0.2063, 0.0917], - [0.5492, 0.2760, 0.0578, 0.2187], - [0.4992, 0.4990, 0.9984, 0.9979], - ] - ) - expected_boxes_1 = torch.tensor( - [ - [0.1576, 0.3262, 0.2814, 0.5175], - [0.4634, 0.2463, 0.2720, 0.4275], - [0.3002, 0.2956, 0.5985, 0.5913], - [0.1013, 0.1200, 0.1238, 0.0550], - [0.3297, 0.1656, 0.0347, 0.1312], - [0.2997, 0.2994, 0.5994, 0.5987], - ] - ) - self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1e-3)) - self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1e-3)) - - # Check the masks have also been padded - self.assertEqual(encoding["labels"][0]["masks"].shape, torch.Size([6, 800, 1066])) - self.assertEqual(encoding["labels"][1]["masks"].shape, torch.Size([6, 800, 1066])) - - # Check if do_convert_annotations=False, then the annotations are not converted to centre_x, centre_y, width, height - # format and not in the range [0, 1] - encoding = image_processing( - images=images, - annotations=annotations, - masks_path=masks_path, - return_segmentation_masks=True, - do_convert_annotations=False, - return_tensors="pt", - ) - self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4])) - self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4])) - # Convert to absolute coordinates - unnormalized_boxes_0 = torch.vstack( - [ - expected_boxes_0[:, 0] * postprocessed_width, - expected_boxes_0[:, 1] * postprocessed_height, - expected_boxes_0[:, 2] * postprocessed_width, - expected_boxes_0[:, 3] * postprocessed_height, - ] - ).T - unnormalized_boxes_1 = torch.vstack( - [ - expected_boxes_1[:, 0] * postprocessed_width, - expected_boxes_1[:, 1] * postprocessed_height, - expected_boxes_1[:, 2] * postprocessed_width, - expected_boxes_1[:, 3] * postprocessed_height, - ] - ).T - # Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max - expected_boxes_0 = torch.vstack( - [ - unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2, - unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2, - unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2, - unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2, - ] - ).T - expected_boxes_1 = torch.vstack( - [ - unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2, - unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2, - unnormalized_boxes_1[:, 0] + unnormalized_boxes_1[:, 2] / 2, - unnormalized_boxes_1[:, 1] + unnormalized_boxes_1[:, 3] / 2, - ] - ).T - 
self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1)) - self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1)) + for image_processing_class in self.image_processor_list: + # encode them + image_processing = image_processing_class(format="coco_panoptic") + encoding = image_processing( + images=images, + annotations=annotations, + masks_path=masks_path, + return_tensors="pt", + return_segmentation_masks=True, + ) + + # Check the pixel values have been padded + postprocessed_height, postprocessed_width = 800, 1066 + expected_shape = torch.Size([2, 3, postprocessed_height, postprocessed_width]) + self.assertEqual(encoding["pixel_values"].shape, expected_shape) + + # Check the bounding boxes have been adjusted for padded images + self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4])) + self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4])) + expected_boxes_0 = torch.tensor( + [ + [0.2625, 0.5437, 0.4688, 0.8625], + [0.7719, 0.4104, 0.4531, 0.7125], + [0.5000, 0.4927, 0.9969, 0.9854], + [0.1688, 0.2000, 0.2063, 0.0917], + [0.5492, 0.2760, 0.0578, 0.2187], + [0.4992, 0.4990, 0.9984, 0.9979], + ] + ) + expected_boxes_1 = torch.tensor( + [ + [0.1576, 0.3262, 0.2814, 0.5175], + [0.4634, 0.2463, 0.2720, 0.4275], + [0.3002, 0.2956, 0.5985, 0.5913], + [0.1013, 0.1200, 0.1238, 0.0550], + [0.3297, 0.1656, 0.0347, 0.1312], + [0.2997, 0.2994, 0.5994, 0.5987], + ] + ) + self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1e-3)) + self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1e-3)) + + # Check the masks have also been padded + self.assertEqual(encoding["labels"][0]["masks"].shape, torch.Size([6, 800, 1066])) + self.assertEqual(encoding["labels"][1]["masks"].shape, torch.Size([6, 800, 1066])) + + # Check if do_convert_annotations=False, then the annotations are not converted to centre_x, centre_y, width, height + # format and not in the range [0, 1] + encoding = image_processing( + images=images, + annotations=annotations, + masks_path=masks_path, + return_segmentation_masks=True, + do_convert_annotations=False, + return_tensors="pt", + ) + self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4])) + self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4])) + # Convert to absolute coordinates + unnormalized_boxes_0 = torch.vstack( + [ + expected_boxes_0[:, 0] * postprocessed_width, + expected_boxes_0[:, 1] * postprocessed_height, + expected_boxes_0[:, 2] * postprocessed_width, + expected_boxes_0[:, 3] * postprocessed_height, + ] + ).T + unnormalized_boxes_1 = torch.vstack( + [ + expected_boxes_1[:, 0] * postprocessed_width, + expected_boxes_1[:, 1] * postprocessed_height, + expected_boxes_1[:, 2] * postprocessed_width, + expected_boxes_1[:, 3] * postprocessed_height, + ] + ).T + # Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max + expected_boxes_0 = torch.vstack( + [ + unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2, + unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2, + unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2, + unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2, + ] + ).T + expected_boxes_1 = torch.vstack( + [ + unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2, + unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2, + unnormalized_boxes_1[:, 0] + unnormalized_boxes_1[:, 2] / 2, + 
unnormalized_boxes_1[:, 1] + unnormalized_boxes_1[:, 3] / 2, + ] + ).T + self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1)) + self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1)) # Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_max_width_max_height_resizing_and_pad_strategy with Detr->DeformableDetr def test_max_width_max_height_resizing_and_pad_strategy(self): - image_1 = torch.ones([200, 100, 3], dtype=torch.uint8) - - # do_pad=False, max_height=100, max_width=100, image=200x100 -> 100x50 - image_processor = DeformableDetrImageProcessor( - size={"max_height": 100, "max_width": 100}, - do_pad=False, - ) - inputs = image_processor(images=[image_1], return_tensors="pt") - self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 50])) - - # do_pad=False, max_height=300, max_width=100, image=200x100 -> 200x100 - image_processor = DeformableDetrImageProcessor( - size={"max_height": 300, "max_width": 100}, - do_pad=False, - ) - inputs = image_processor(images=[image_1], return_tensors="pt") - - # do_pad=True, max_height=100, max_width=100, image=200x100 -> 100x100 - image_processor = DeformableDetrImageProcessor( - size={"max_height": 100, "max_width": 100}, do_pad=True, pad_size={"height": 100, "width": 100} - ) - inputs = image_processor(images=[image_1], return_tensors="pt") - self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 100])) - - # do_pad=True, max_height=300, max_width=100, image=200x100 -> 300x100 - image_processor = DeformableDetrImageProcessor( - size={"max_height": 300, "max_width": 100}, - do_pad=True, - pad_size={"height": 301, "width": 101}, - ) - inputs = image_processor(images=[image_1], return_tensors="pt") - self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 301, 101])) - - ### Check for batch - image_2 = torch.ones([100, 150, 3], dtype=torch.uint8) - - # do_pad=True, max_height=150, max_width=100, images=[200x100, 100x150] -> 150x100 - image_processor = DeformableDetrImageProcessor( - size={"max_height": 150, "max_width": 100}, - do_pad=True, - pad_size={"height": 150, "width": 100}, - ) - inputs = image_processor(images=[image_1, image_2], return_tensors="pt") - self.assertEqual(inputs["pixel_values"].shape, torch.Size([2, 3, 150, 100])) + for image_processing_class in self.image_processor_list: + image_1 = torch.ones([200, 100, 3], dtype=torch.uint8) + + # do_pad=False, max_height=100, max_width=100, image=200x100 -> 100x50 + image_processor = image_processing_class( + size={"max_height": 100, "max_width": 100}, + do_pad=False, + ) + inputs = image_processor(images=[image_1], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 50])) + + # do_pad=False, max_height=300, max_width=100, image=200x100 -> 200x100 + image_processor = image_processing_class( + size={"max_height": 300, "max_width": 100}, + do_pad=False, + ) + inputs = image_processor(images=[image_1], return_tensors="pt") + + # do_pad=True, max_height=100, max_width=100, image=200x100 -> 100x100 + image_processor = image_processing_class( + size={"max_height": 100, "max_width": 100}, do_pad=True, pad_size={"height": 100, "width": 100} + ) + inputs = image_processor(images=[image_1], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 100])) + + # do_pad=True, max_height=300, max_width=100, image=200x100 -> 300x100 + image_processor = image_processing_class( + 
size={"max_height": 300, "max_width": 100}, + do_pad=True, + pad_size={"height": 301, "width": 101}, + ) + inputs = image_processor(images=[image_1], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 301, 101])) + + ### Check for batch + image_2 = torch.ones([100, 150, 3], dtype=torch.uint8) + + # do_pad=True, max_height=150, max_width=100, images=[200x100, 100x150] -> 150x100 + image_processor = image_processing_class( + size={"max_height": 150, "max_width": 100}, + do_pad=True, + pad_size={"height": 150, "width": 100}, + ) + inputs = image_processor(images=[image_1, image_2], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([2, 3, 150, 100])) def test_longest_edge_shortest_edge_resizing_strategy(self): image_1 = torch.ones([958, 653, 3], dtype=torch.uint8) diff --git a/tests/models/detr/test_image_processing_detr.py b/tests/models/detr/test_image_processing_detr.py index 4174df0f8cc792..976b306115b68a 100644 --- a/tests/models/detr/test_image_processing_detr.py +++ b/tests/models/detr/test_image_processing_detr.py @@ -19,8 +19,8 @@ import numpy as np -from transformers.testing_utils import require_torch, require_vision, slow -from transformers.utils import is_torch_available, is_vision_available +from transformers.testing_utils import require_torch, require_torch_gpu, require_vision, slow +from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available from ...test_image_processing_common import AnnotationFormatTestMixin, ImageProcessingTestMixin, prepare_image_inputs @@ -33,6 +33,9 @@ from transformers import DetrImageProcessor + if is_torchvision_available(): + from transformers import DetrImageProcessorFast + class DetrImageProcessingTester(unittest.TestCase): def __init__( @@ -51,6 +54,7 @@ def __init__( image_std=[0.5, 0.5, 0.5], do_pad=True, ): + super().__init__() # by setting size["longest_edge"] > max_resolution we're effectively not testing this :p size = size if size is not None else {"shortest_edge": 18, "longest_edge": 1333} self.parent = parent @@ -132,6 +136,7 @@ def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=F @require_vision class DetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMixin, unittest.TestCase): image_processing_class = DetrImageProcessor if is_vision_available() else None + fast_image_processing_class = DetrImageProcessorFast if is_torchvision_available() else None def setUp(self): super().setUp() @@ -142,26 +147,28 @@ def image_processor_dict(self): return self.image_processor_tester.prepare_image_processor_dict() def test_image_processor_properties(self): - image_processing = self.image_processing_class(**self.image_processor_dict) - self.assertTrue(hasattr(image_processing, "image_mean")) - self.assertTrue(hasattr(image_processing, "image_std")) - self.assertTrue(hasattr(image_processing, "do_normalize")) - self.assertTrue(hasattr(image_processing, "do_rescale")) - self.assertTrue(hasattr(image_processing, "rescale_factor")) - self.assertTrue(hasattr(image_processing, "do_resize")) - self.assertTrue(hasattr(image_processing, "size")) - self.assertTrue(hasattr(image_processing, "do_pad")) + for image_processing_class in self.image_processor_list: + image_processing = image_processing_class(**self.image_processor_dict) + self.assertTrue(hasattr(image_processing, "image_mean")) + self.assertTrue(hasattr(image_processing, "image_std")) + self.assertTrue(hasattr(image_processing, "do_normalize")) + 
self.assertTrue(hasattr(image_processing, "do_rescale")) + self.assertTrue(hasattr(image_processing, "rescale_factor")) + self.assertTrue(hasattr(image_processing, "do_resize")) + self.assertTrue(hasattr(image_processing, "size")) + self.assertTrue(hasattr(image_processing, "do_pad")) def test_image_processor_from_dict_with_kwargs(self): - image_processor = self.image_processing_class.from_dict(self.image_processor_dict) - self.assertEqual(image_processor.size, {"shortest_edge": 18, "longest_edge": 1333}) - self.assertEqual(image_processor.do_pad, True) + for image_processing_class in self.image_processor_list: + image_processor = image_processing_class.from_dict(self.image_processor_dict) + self.assertEqual(image_processor.size, {"shortest_edge": 18, "longest_edge": 1333}) + self.assertEqual(image_processor.do_pad, True) - image_processor = self.image_processing_class.from_dict( - self.image_processor_dict, size=42, max_size=84, pad_and_return_pixel_mask=False - ) - self.assertEqual(image_processor.size, {"shortest_edge": 42, "longest_edge": 84}) - self.assertEqual(image_processor.do_pad, False) + image_processor = image_processing_class.from_dict( + self.image_processor_dict, size=42, max_size=84, pad_and_return_pixel_mask=False + ) + self.assertEqual(image_processor.size, {"shortest_edge": 42, "longest_edge": 84}) + self.assertEqual(image_processor.do_pad, False) def test_should_raise_if_annotation_format_invalid(self): image_processor_dict = self.image_processor_tester.prepare_image_processor_dict() @@ -178,12 +185,13 @@ def test_should_raise_if_annotation_format_invalid(self): } image_processor_params = {**image_processor_dict, **{"format": "_INVALID_FORMAT_"}} - image_processor = self.image_processing_class(**image_processor_params) + for image_processing_class in self.image_processor_list: + image_processor = image_processing_class(**image_processor_params) - with self.assertRaises(ValueError) as e: - image_processor(**params) + with self.assertRaises(ValueError) as e: + image_processor(**params) - self.assertTrue(str(e.exception).startswith("_INVALID_FORMAT_ is not a valid AnnotationFormat")) + self.assertTrue(str(e.exception).startswith("_INVALID_FORMAT_ is not a valid AnnotationFormat")) def test_valid_coco_detection_annotations(self): # prepare image and target @@ -193,32 +201,33 @@ def test_valid_coco_detection_annotations(self): params = {"image_id": 39769, "annotations": target} - # encode them - image_processing = DetrImageProcessor.from_pretrained("facebook/detr-resnet-50") + for image_processing_class in self.image_processor_list: + # encode them + image_processing = image_processing_class.from_pretrained("facebook/detr-resnet-50") - # legal encodings (single image) - _ = image_processing(images=image, annotations=params, return_tensors="pt") - _ = image_processing(images=image, annotations=[params], return_tensors="pt") + # legal encodings (single image) + _ = image_processing(images=image, annotations=params, return_tensors="pt") + _ = image_processing(images=image, annotations=[params], return_tensors="pt") - # legal encodings (batch of one image) - _ = image_processing(images=[image], annotations=params, return_tensors="pt") - _ = image_processing(images=[image], annotations=[params], return_tensors="pt") + # legal encodings (batch of one image) + _ = image_processing(images=[image], annotations=params, return_tensors="pt") + _ = image_processing(images=[image], annotations=[params], return_tensors="pt") - # legal encoding (batch of more than one image) - n = 5 - _ = 
image_processing(images=[image] * n, annotations=[params] * n, return_tensors="pt") + # legal encoding (batch of more than one image) + n = 5 + _ = image_processing(images=[image] * n, annotations=[params] * n, return_tensors="pt") - # example of an illegal encoding (missing the 'image_id' key) - with self.assertRaises(ValueError) as e: - image_processing(images=image, annotations={"annotations": target}, return_tensors="pt") + # example of an illegal encoding (missing the 'image_id' key) + with self.assertRaises(ValueError) as e: + image_processing(images=image, annotations={"annotations": target}, return_tensors="pt") - self.assertTrue(str(e.exception).startswith("Invalid COCO detection annotations")) + self.assertTrue(str(e.exception).startswith("Invalid COCO detection annotations")) - # example of an illegal encoding (unequal lengths of images and annotations) - with self.assertRaises(ValueError) as e: - image_processing(images=[image] * n, annotations=[params] * (n - 1), return_tensors="pt") + # example of an illegal encoding (unequal lengths of images and annotations) + with self.assertRaises(ValueError) as e: + image_processing(images=[image] * n, annotations=[params] * (n - 1), return_tensors="pt") - self.assertTrue(str(e.exception) == "The number of images (5) and annotations (4) do not match.") + self.assertTrue(str(e.exception) == "The number of images (5) and annotations (4) do not match.") @slow def test_call_pytorch_with_coco_detection_annotations(self): @@ -229,40 +238,41 @@ def test_call_pytorch_with_coco_detection_annotations(self): target = {"image_id": 39769, "annotations": target} - # encode them - image_processing = DetrImageProcessor.from_pretrained("facebook/detr-resnet-50") - encoding = image_processing(images=image, annotations=target, return_tensors="pt") - - # verify pixel values - expected_shape = torch.Size([1, 3, 800, 1066]) - self.assertEqual(encoding["pixel_values"].shape, expected_shape) - - expected_slice = torch.tensor([0.2796, 0.3138, 0.3481]) - self.assertTrue(torch.allclose(encoding["pixel_values"][0, 0, 0, :3], expected_slice, atol=1e-4)) - - # verify area - expected_area = torch.tensor([5887.9600, 11250.2061, 489353.8438, 837122.7500, 147967.5156, 165732.3438]) - self.assertTrue(torch.allclose(encoding["labels"][0]["area"], expected_area)) - # verify boxes - expected_boxes_shape = torch.Size([6, 4]) - self.assertEqual(encoding["labels"][0]["boxes"].shape, expected_boxes_shape) - expected_boxes_slice = torch.tensor([0.5503, 0.2765, 0.0604, 0.2215]) - self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"][0], expected_boxes_slice, atol=1e-3)) - # verify image_id - expected_image_id = torch.tensor([39769]) - self.assertTrue(torch.allclose(encoding["labels"][0]["image_id"], expected_image_id)) - # verify is_crowd - expected_is_crowd = torch.tensor([0, 0, 0, 0, 0, 0]) - self.assertTrue(torch.allclose(encoding["labels"][0]["iscrowd"], expected_is_crowd)) - # verify class_labels - expected_class_labels = torch.tensor([75, 75, 63, 65, 17, 17]) - self.assertTrue(torch.allclose(encoding["labels"][0]["class_labels"], expected_class_labels)) - # verify orig_size - expected_orig_size = torch.tensor([480, 640]) - self.assertTrue(torch.allclose(encoding["labels"][0]["orig_size"], expected_orig_size)) - # verify size - expected_size = torch.tensor([800, 1066]) - self.assertTrue(torch.allclose(encoding["labels"][0]["size"], expected_size)) + for image_processing_class in self.image_processor_list: + # encode them + image_processing = 
image_processing_class.from_pretrained("facebook/detr-resnet-50") + encoding = image_processing(images=image, annotations=target, return_tensors="pt") + + # verify pixel values + expected_shape = torch.Size([1, 3, 800, 1066]) + self.assertEqual(encoding["pixel_values"].shape, expected_shape) + + expected_slice = torch.tensor([0.2796, 0.3138, 0.3481]) + self.assertTrue(torch.allclose(encoding["pixel_values"][0, 0, 0, :3], expected_slice, atol=1e-4)) + + # verify area + expected_area = torch.tensor([5887.9600, 11250.2061, 489353.8438, 837122.7500, 147967.5156, 165732.3438]) + self.assertTrue(torch.allclose(encoding["labels"][0]["area"], expected_area)) + # verify boxes + expected_boxes_shape = torch.Size([6, 4]) + self.assertEqual(encoding["labels"][0]["boxes"].shape, expected_boxes_shape) + expected_boxes_slice = torch.tensor([0.5503, 0.2765, 0.0604, 0.2215]) + self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"][0], expected_boxes_slice, atol=1e-3)) + # verify image_id + expected_image_id = torch.tensor([39769]) + self.assertTrue(torch.allclose(encoding["labels"][0]["image_id"], expected_image_id)) + # verify is_crowd + expected_is_crowd = torch.tensor([0, 0, 0, 0, 0, 0]) + self.assertTrue(torch.allclose(encoding["labels"][0]["iscrowd"], expected_is_crowd)) + # verify class_labels + expected_class_labels = torch.tensor([75, 75, 63, 65, 17, 17]) + self.assertTrue(torch.allclose(encoding["labels"][0]["class_labels"], expected_class_labels)) + # verify orig_size + expected_orig_size = torch.tensor([480, 640]) + self.assertTrue(torch.allclose(encoding["labels"][0]["orig_size"], expected_orig_size)) + # verify size + expected_size = torch.tensor([800, 1066]) + self.assertTrue(torch.allclose(encoding["labels"][0]["size"], expected_size)) @slow def test_call_pytorch_with_coco_panoptic_annotations(self): @@ -275,43 +285,45 @@ def test_call_pytorch_with_coco_panoptic_annotations(self): masks_path = pathlib.Path("./tests/fixtures/tests_samples/COCO/coco_panoptic") - # encode them - image_processing = DetrImageProcessor.from_pretrained("facebook/detr-resnet-50-panoptic") - encoding = image_processing(images=image, annotations=target, masks_path=masks_path, return_tensors="pt") - - # verify pixel values - expected_shape = torch.Size([1, 3, 800, 1066]) - self.assertEqual(encoding["pixel_values"].shape, expected_shape) - - expected_slice = torch.tensor([0.2796, 0.3138, 0.3481]) - self.assertTrue(torch.allclose(encoding["pixel_values"][0, 0, 0, :3], expected_slice, atol=1e-4)) - - # verify area - expected_area = torch.tensor([147979.6875, 165527.0469, 484638.5938, 11292.9375, 5879.6562, 7634.1147]) - self.assertTrue(torch.allclose(encoding["labels"][0]["area"], expected_area)) - # verify boxes - expected_boxes_shape = torch.Size([6, 4]) - self.assertEqual(encoding["labels"][0]["boxes"].shape, expected_boxes_shape) - expected_boxes_slice = torch.tensor([0.2625, 0.5437, 0.4688, 0.8625]) - self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"][0], expected_boxes_slice, atol=1e-3)) - # verify image_id - expected_image_id = torch.tensor([39769]) - self.assertTrue(torch.allclose(encoding["labels"][0]["image_id"], expected_image_id)) - # verify is_crowd - expected_is_crowd = torch.tensor([0, 0, 0, 0, 0, 0]) - self.assertTrue(torch.allclose(encoding["labels"][0]["iscrowd"], expected_is_crowd)) - # verify class_labels - expected_class_labels = torch.tensor([17, 17, 63, 75, 75, 93]) - self.assertTrue(torch.allclose(encoding["labels"][0]["class_labels"], expected_class_labels)) - # verify masks - 
expected_masks_sum = 822873 - self.assertEqual(encoding["labels"][0]["masks"].sum().item(), expected_masks_sum) - # verify orig_size - expected_orig_size = torch.tensor([480, 640]) - self.assertTrue(torch.allclose(encoding["labels"][0]["orig_size"], expected_orig_size)) - # verify size - expected_size = torch.tensor([800, 1066]) - self.assertTrue(torch.allclose(encoding["labels"][0]["size"], expected_size)) + for image_processing_class in self.image_processor_list: + # encode them + image_processing = image_processing_class.from_pretrained("facebook/detr-resnet-50-panoptic") + encoding = image_processing(images=image, annotations=target, masks_path=masks_path, return_tensors="pt") + + # verify pixel values + expected_shape = torch.Size([1, 3, 800, 1066]) + self.assertEqual(encoding["pixel_values"].shape, expected_shape) + + expected_slice = torch.tensor([0.2796, 0.3138, 0.3481]) + self.assertTrue(torch.allclose(encoding["pixel_values"][0, 0, 0, :3], expected_slice, atol=1e-4)) + + # verify area + expected_area = torch.tensor([147979.6875, 165527.0469, 484638.5938, 11292.9375, 5879.6562, 7634.1147]) + self.assertTrue(torch.allclose(encoding["labels"][0]["area"], expected_area)) + # verify boxes + expected_boxes_shape = torch.Size([6, 4]) + self.assertEqual(encoding["labels"][0]["boxes"].shape, expected_boxes_shape) + expected_boxes_slice = torch.tensor([0.2625, 0.5437, 0.4688, 0.8625]) + self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"][0], expected_boxes_slice, atol=1e-3)) + # verify image_id + expected_image_id = torch.tensor([39769]) + self.assertTrue(torch.allclose(encoding["labels"][0]["image_id"], expected_image_id)) + # verify is_crowd + expected_is_crowd = torch.tensor([0, 0, 0, 0, 0, 0]) + self.assertTrue(torch.allclose(encoding["labels"][0]["iscrowd"], expected_is_crowd)) + # verify class_labels + expected_class_labels = torch.tensor([17, 17, 63, 75, 75, 93]) + self.assertTrue(torch.allclose(encoding["labels"][0]["class_labels"], expected_class_labels)) + # verify masks + expected_masks_sum = 822873 + relative_error = torch.abs(encoding["labels"][0]["masks"].sum() - expected_masks_sum) / expected_masks_sum + self.assertTrue(relative_error < 1e-3) + # verify orig_size + expected_orig_size = torch.tensor([480, 640]) + self.assertTrue(torch.allclose(encoding["labels"][0]["orig_size"], expected_orig_size)) + # verify size + expected_size = torch.tensor([800, 1066]) + self.assertTrue(torch.allclose(encoding["labels"][0]["size"], expected_size)) @slow def test_batched_coco_detection_annotations(self): @@ -340,96 +352,97 @@ def test_batched_coco_detection_annotations(self): images = [image_0, image_1] annotations = [annotations_0, annotations_1] - image_processing = DetrImageProcessor() - encoding = image_processing( - images=images, - annotations=annotations, - return_segmentation_masks=True, - return_tensors="pt", # do_convert_annotations=True - ) - - # Check the pixel values have been padded - postprocessed_height, postprocessed_width = 800, 1066 - expected_shape = torch.Size([2, 3, postprocessed_height, postprocessed_width]) - self.assertEqual(encoding["pixel_values"].shape, expected_shape) - - # Check the bounding boxes have been adjusted for padded images - self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4])) - self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4])) - expected_boxes_0 = torch.tensor( - [ - [0.6879, 0.4609, 0.0755, 0.3691], - [0.2118, 0.3359, 0.2601, 0.1566], - [0.5011, 0.5000, 0.9979, 1.0000], - [0.5010, 0.5020, 
0.9979, 0.9959], - [0.3284, 0.5944, 0.5884, 0.8112], - [0.8394, 0.5445, 0.3213, 0.9110], - ] - ) - expected_boxes_1 = torch.tensor( - [ - [0.4130, 0.2765, 0.0453, 0.2215], - [0.1272, 0.2016, 0.1561, 0.0940], - [0.3757, 0.4933, 0.7488, 0.9865], - [0.3759, 0.5002, 0.7492, 0.9955], - [0.1971, 0.5456, 0.3532, 0.8646], - [0.5790, 0.4115, 0.3430, 0.7161], - ] - ) - self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1e-3)) - self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1e-3)) - - # Check the masks have also been padded - self.assertEqual(encoding["labels"][0]["masks"].shape, torch.Size([6, 800, 1066])) - self.assertEqual(encoding["labels"][1]["masks"].shape, torch.Size([6, 800, 1066])) - - # Check if do_convert_annotations=False, then the annotations are not converted to centre_x, centre_y, width, height - # format and not in the range [0, 1] - encoding = image_processing( - images=images, - annotations=annotations, - return_segmentation_masks=True, - do_convert_annotations=False, - return_tensors="pt", - ) - self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4])) - self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4])) - # Convert to absolute coordinates - unnormalized_boxes_0 = torch.vstack( - [ - expected_boxes_0[:, 0] * postprocessed_width, - expected_boxes_0[:, 1] * postprocessed_height, - expected_boxes_0[:, 2] * postprocessed_width, - expected_boxes_0[:, 3] * postprocessed_height, - ] - ).T - unnormalized_boxes_1 = torch.vstack( - [ - expected_boxes_1[:, 0] * postprocessed_width, - expected_boxes_1[:, 1] * postprocessed_height, - expected_boxes_1[:, 2] * postprocessed_width, - expected_boxes_1[:, 3] * postprocessed_height, - ] - ).T - # Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max - expected_boxes_0 = torch.vstack( - [ - unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2, - unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2, - unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2, - unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2, - ] - ).T - expected_boxes_1 = torch.vstack( - [ - unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2, - unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2, - unnormalized_boxes_1[:, 0] + unnormalized_boxes_1[:, 2] / 2, - unnormalized_boxes_1[:, 1] + unnormalized_boxes_1[:, 3] / 2, - ] - ).T - self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1)) - self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1)) + for image_processing_class in self.image_processor_list: + image_processing = image_processing_class() + encoding = image_processing( + images=images, + annotations=annotations, + return_segmentation_masks=True, + return_tensors="pt", # do_convert_annotations=True + ) + + # Check the pixel values have been padded + postprocessed_height, postprocessed_width = 800, 1066 + expected_shape = torch.Size([2, 3, postprocessed_height, postprocessed_width]) + self.assertEqual(encoding["pixel_values"].shape, expected_shape) + + # Check the bounding boxes have been adjusted for padded images + self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4])) + self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4])) + expected_boxes_0 = torch.tensor( + [ + [0.6879, 0.4609, 0.0755, 0.3691], + [0.2118, 0.3359, 0.2601, 0.1566], + [0.5011, 0.5000, 0.9979, 1.0000], + [0.5010, 0.5020, 
0.9979, 0.9959], + [0.3284, 0.5944, 0.5884, 0.8112], + [0.8394, 0.5445, 0.3213, 0.9110], + ] + ) + expected_boxes_1 = torch.tensor( + [ + [0.4130, 0.2765, 0.0453, 0.2215], + [0.1272, 0.2016, 0.1561, 0.0940], + [0.3757, 0.4933, 0.7488, 0.9865], + [0.3759, 0.5002, 0.7492, 0.9955], + [0.1971, 0.5456, 0.3532, 0.8646], + [0.5790, 0.4115, 0.3430, 0.7161], + ] + ) + self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1e-3)) + self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1e-3)) + + # Check the masks have also been padded + self.assertEqual(encoding["labels"][0]["masks"].shape, torch.Size([6, 800, 1066])) + self.assertEqual(encoding["labels"][1]["masks"].shape, torch.Size([6, 800, 1066])) + + # Check if do_convert_annotations=False, then the annotations are not converted to centre_x, centre_y, width, height + # format and not in the range [0, 1] + encoding = image_processing( + images=images, + annotations=annotations, + return_segmentation_masks=True, + do_convert_annotations=False, + return_tensors="pt", + ) + self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4])) + self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4])) + # Convert to absolute coordinates + unnormalized_boxes_0 = torch.vstack( + [ + expected_boxes_0[:, 0] * postprocessed_width, + expected_boxes_0[:, 1] * postprocessed_height, + expected_boxes_0[:, 2] * postprocessed_width, + expected_boxes_0[:, 3] * postprocessed_height, + ] + ).T + unnormalized_boxes_1 = torch.vstack( + [ + expected_boxes_1[:, 0] * postprocessed_width, + expected_boxes_1[:, 1] * postprocessed_height, + expected_boxes_1[:, 2] * postprocessed_width, + expected_boxes_1[:, 3] * postprocessed_height, + ] + ).T + # Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max + expected_boxes_0 = torch.vstack( + [ + unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2, + unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2, + unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2, + unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2, + ] + ).T + expected_boxes_1 = torch.vstack( + [ + unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2, + unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2, + unnormalized_boxes_1[:, 0] + unnormalized_boxes_1[:, 2] / 2, + unnormalized_boxes_1[:, 1] + unnormalized_boxes_1[:, 3] / 2, + ] + ).T + self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1)) + self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1)) def test_batched_coco_panoptic_annotations(self): # prepare image, target and masks_path @@ -459,194 +472,318 @@ def test_batched_coco_panoptic_annotations(self): images = [image_0, image_1] annotations = [annotation_0, annotation_1] - # encode them - image_processing = DetrImageProcessor(format="coco_panoptic") - encoding = image_processing( - images=images, - annotations=annotations, - masks_path=masks_path, - return_tensors="pt", - return_segmentation_masks=True, - ) + for image_processing_class in self.image_processor_list: + # encode them + image_processing = image_processing_class(format="coco_panoptic") + encoding = image_processing( + images=images, + annotations=annotations, + masks_path=masks_path, + return_tensors="pt", + return_segmentation_masks=True, + ) + + # Check the pixel values have been padded + postprocessed_height, postprocessed_width = 800, 1066 + expected_shape = 
torch.Size([2, 3, postprocessed_height, postprocessed_width]) + self.assertEqual(encoding["pixel_values"].shape, expected_shape) + + # Check the bounding boxes have been adjusted for padded images + self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4])) + self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4])) + expected_boxes_0 = torch.tensor( + [ + [0.2625, 0.5437, 0.4688, 0.8625], + [0.7719, 0.4104, 0.4531, 0.7125], + [0.5000, 0.4927, 0.9969, 0.9854], + [0.1688, 0.2000, 0.2063, 0.0917], + [0.5492, 0.2760, 0.0578, 0.2187], + [0.4992, 0.4990, 0.9984, 0.9979], + ] + ) + expected_boxes_1 = torch.tensor( + [ + [0.1576, 0.3262, 0.2814, 0.5175], + [0.4634, 0.2463, 0.2720, 0.4275], + [0.3002, 0.2956, 0.5985, 0.5913], + [0.1013, 0.1200, 0.1238, 0.0550], + [0.3297, 0.1656, 0.0347, 0.1312], + [0.2997, 0.2994, 0.5994, 0.5987], + ] + ) + self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1e-3)) + self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1e-3)) + + # Check the masks have also been padded + self.assertEqual(encoding["labels"][0]["masks"].shape, torch.Size([6, 800, 1066])) + self.assertEqual(encoding["labels"][1]["masks"].shape, torch.Size([6, 800, 1066])) + + # Check if do_convert_annotations=False, then the annotations are not converted to centre_x, centre_y, width, height + # format and not in the range [0, 1] + encoding = image_processing( + images=images, + annotations=annotations, + masks_path=masks_path, + return_segmentation_masks=True, + do_convert_annotations=False, + return_tensors="pt", + ) + self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4])) + self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4])) + # Convert to absolute coordinates + unnormalized_boxes_0 = torch.vstack( + [ + expected_boxes_0[:, 0] * postprocessed_width, + expected_boxes_0[:, 1] * postprocessed_height, + expected_boxes_0[:, 2] * postprocessed_width, + expected_boxes_0[:, 3] * postprocessed_height, + ] + ).T + unnormalized_boxes_1 = torch.vstack( + [ + expected_boxes_1[:, 0] * postprocessed_width, + expected_boxes_1[:, 1] * postprocessed_height, + expected_boxes_1[:, 2] * postprocessed_width, + expected_boxes_1[:, 3] * postprocessed_height, + ] + ).T + # Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max + expected_boxes_0 = torch.vstack( + [ + unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2, + unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2, + unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2, + unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2, + ] + ).T + expected_boxes_1 = torch.vstack( + [ + unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2, + unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2, + unnormalized_boxes_1[:, 0] + unnormalized_boxes_1[:, 2] / 2, + unnormalized_boxes_1[:, 1] + unnormalized_boxes_1[:, 3] / 2, + ] + ).T + self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1)) + self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1)) - # Check the pixel values have been padded - postprocessed_height, postprocessed_width = 800, 1066 - expected_shape = torch.Size([2, 3, postprocessed_height, postprocessed_width]) - self.assertEqual(encoding["pixel_values"].shape, expected_shape) - - # Check the bounding boxes have been adjusted for padded images - 
self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4])) - self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4])) - expected_boxes_0 = torch.tensor( - [ - [0.2625, 0.5437, 0.4688, 0.8625], - [0.7719, 0.4104, 0.4531, 0.7125], - [0.5000, 0.4927, 0.9969, 0.9854], - [0.1688, 0.2000, 0.2063, 0.0917], - [0.5492, 0.2760, 0.0578, 0.2187], - [0.4992, 0.4990, 0.9984, 0.9979], - ] + def test_max_width_max_height_resizing_and_pad_strategy(self): + for image_processing_class in self.image_processor_list: + image_1 = torch.ones([200, 100, 3], dtype=torch.uint8) + + # do_pad=False, max_height=100, max_width=100, image=200x100 -> 100x50 + image_processor = image_processing_class( + size={"max_height": 100, "max_width": 100}, + do_pad=False, + ) + inputs = image_processor(images=[image_1], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 50])) + + # do_pad=False, max_height=300, max_width=100, image=200x100 -> 200x100 + image_processor = image_processing_class( + size={"max_height": 300, "max_width": 100}, + do_pad=False, + ) + inputs = image_processor(images=[image_1], return_tensors="pt") + + # do_pad=True, max_height=100, max_width=100, image=200x100 -> 100x100 + image_processor = image_processing_class( + size={"max_height": 100, "max_width": 100}, do_pad=True, pad_size={"height": 100, "width": 100} + ) + inputs = image_processor(images=[image_1], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 100])) + + # do_pad=True, max_height=300, max_width=100, image=200x100 -> 300x100 + image_processor = image_processing_class( + size={"max_height": 300, "max_width": 100}, + do_pad=True, + pad_size={"height": 301, "width": 101}, + ) + inputs = image_processor(images=[image_1], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 301, 101])) + + ### Check for batch + image_2 = torch.ones([100, 150, 3], dtype=torch.uint8) + + # do_pad=True, max_height=150, max_width=100, images=[200x100, 100x150] -> 150x100 + image_processor = image_processing_class( + size={"max_height": 150, "max_width": 100}, + do_pad=True, + pad_size={"height": 150, "width": 100}, + ) + inputs = image_processor(images=[image_1, image_2], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([2, 3, 150, 100])) + + def test_longest_edge_shortest_edge_resizing_strategy(self): + for image_processing_class in self.image_processor_list: + image_1 = torch.ones([958, 653, 3], dtype=torch.uint8) + + # max size is set; width < height; + # do_pad=False, longest_edge=640, shortest_edge=640, image=958x653 -> 640x436 + image_processor = image_processing_class( + size={"longest_edge": 640, "shortest_edge": 640}, + do_pad=False, + ) + inputs = image_processor(images=[image_1], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 640, 436])) + + image_2 = torch.ones([653, 958, 3], dtype=torch.uint8) + # max size is set; height < width; + # do_pad=False, longest_edge=640, shortest_edge=640, image=653x958 -> 436x640 + image_processor = image_processing_class( + size={"longest_edge": 640, "shortest_edge": 640}, + do_pad=False, + ) + inputs = image_processor(images=[image_2], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 436, 640])) + + image_3 = torch.ones([100, 120, 3], dtype=torch.uint8) + # max size is set; width == size; height > max_size; + # do_pad=False, longest_edge=118, shortest_edge=100, 
image=120x100 -> 118x98
+            image_processor = image_processing_class(
+                size={"longest_edge": 118, "shortest_edge": 100},
+                do_pad=False,
+            )
+            inputs = image_processor(images=[image_3], return_tensors="pt")
+            self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 98, 118]))
+
+            image_4 = torch.ones([128, 50, 3], dtype=torch.uint8)
+            # max size is set; height == size; width < max_size;
+            # do_pad=False, longest_edge=256, shortest_edge=50, image=50x128 -> 50x128
+            image_processor = image_processing_class(
+                size={"longest_edge": 256, "shortest_edge": 50},
+                do_pad=False,
+            )
+            inputs = image_processor(images=[image_4], return_tensors="pt")
+            self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 128, 50]))
+
+            image_5 = torch.ones([50, 50, 3], dtype=torch.uint8)
+            # max size is set; height == width; width < max_size;
+            # do_pad=False, longest_edge=117, shortest_edge=50, image=50x50 -> 50x50
+            image_processor = image_processing_class(
+                size={"longest_edge": 117, "shortest_edge": 50},
+                do_pad=False,
+            )
+            inputs = image_processor(images=[image_5], return_tensors="pt")
+            self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 50, 50]))
+
+    @slow
+    @require_torch_gpu
+    def test_fast_processor_equivalence_cpu_gpu_coco_detection_annotations(self):
+        # prepare image and target
+        image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
+        with open("./tests/fixtures/tests_samples/COCO/coco_annotations.txt", "r") as f:
+            target = json.loads(f.read())
+
+        target = {"image_id": 39769, "annotations": target}
+
+        processor = self.image_processor_list[1].from_pretrained("facebook/detr-resnet-50")
+        # 1. run processor on CPU
+        encoding_cpu = processor(images=image, annotations=target, return_tensors="pt", device="cpu")
+        # 2.
run processor on GPU + encoding_gpu = processor(images=image, annotations=target, return_tensors="pt", device="cuda") + + # verify pixel values + self.assertEqual(encoding_cpu["pixel_values"].shape, encoding_gpu["pixel_values"].shape) + self.assertTrue( + torch.allclose( + encoding_cpu["pixel_values"][0, 0, 0, :3], + encoding_gpu["pixel_values"][0, 0, 0, :3].to("cpu"), + atol=1e-4, + ) ) - expected_boxes_1 = torch.tensor( - [ - [0.1576, 0.3262, 0.2814, 0.5175], - [0.4634, 0.2463, 0.2720, 0.4275], - [0.3002, 0.2956, 0.5985, 0.5913], - [0.1013, 0.1200, 0.1238, 0.0550], - [0.3297, 0.1656, 0.0347, 0.1312], - [0.2997, 0.2994, 0.5994, 0.5987], - ] + # verify area + self.assertTrue(torch.allclose(encoding_cpu["labels"][0]["area"], encoding_gpu["labels"][0]["area"].to("cpu"))) + # verify boxes + self.assertEqual(encoding_cpu["labels"][0]["boxes"].shape, encoding_gpu["labels"][0]["boxes"].shape) + self.assertTrue( + torch.allclose( + encoding_cpu["labels"][0]["boxes"][0], encoding_gpu["labels"][0]["boxes"][0].to("cpu"), atol=1e-3 + ) ) - self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1e-3)) - self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1e-3)) - - # Check the masks have also been padded - self.assertEqual(encoding["labels"][0]["masks"].shape, torch.Size([6, 800, 1066])) - self.assertEqual(encoding["labels"][1]["masks"].shape, torch.Size([6, 800, 1066])) - - # Check if do_convert_annotations=False, then the annotations are not converted to centre_x, centre_y, width, height - # format and not in the range [0, 1] - encoding = image_processing( - images=images, - annotations=annotations, - masks_path=masks_path, - return_segmentation_masks=True, - do_convert_annotations=False, - return_tensors="pt", + # verify image_id + self.assertTrue( + torch.allclose(encoding_cpu["labels"][0]["image_id"], encoding_gpu["labels"][0]["image_id"].to("cpu")) ) - self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4])) - self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4])) - # Convert to absolute coordinates - unnormalized_boxes_0 = torch.vstack( - [ - expected_boxes_0[:, 0] * postprocessed_width, - expected_boxes_0[:, 1] * postprocessed_height, - expected_boxes_0[:, 2] * postprocessed_width, - expected_boxes_0[:, 3] * postprocessed_height, - ] - ).T - unnormalized_boxes_1 = torch.vstack( - [ - expected_boxes_1[:, 0] * postprocessed_width, - expected_boxes_1[:, 1] * postprocessed_height, - expected_boxes_1[:, 2] * postprocessed_width, - expected_boxes_1[:, 3] * postprocessed_height, - ] - ).T - # Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max - expected_boxes_0 = torch.vstack( - [ - unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2, - unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2, - unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2, - unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2, - ] - ).T - expected_boxes_1 = torch.vstack( - [ - unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2, - unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2, - unnormalized_boxes_1[:, 0] + unnormalized_boxes_1[:, 2] / 2, - unnormalized_boxes_1[:, 1] + unnormalized_boxes_1[:, 3] / 2, - ] - ).T - self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1)) - self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1)) + # verify is_crowd + self.assertTrue( + 
torch.allclose(encoding_cpu["labels"][0]["iscrowd"], encoding_gpu["labels"][0]["iscrowd"].to("cpu")) + ) + # verify class_labels + self.assertTrue( + torch.allclose( + encoding_cpu["labels"][0]["class_labels"], encoding_gpu["labels"][0]["class_labels"].to("cpu") + ) + ) + # verify orig_size + self.assertTrue( + torch.allclose(encoding_cpu["labels"][0]["orig_size"], encoding_gpu["labels"][0]["orig_size"].to("cpu")) + ) + # verify size + self.assertTrue(torch.allclose(encoding_cpu["labels"][0]["size"], encoding_gpu["labels"][0]["size"].to("cpu"))) - def test_max_width_max_height_resizing_and_pad_strategy(self): - image_1 = torch.ones([200, 100, 3], dtype=torch.uint8) + @slow + @require_torch_gpu + def test_fast_processor_equivalence_cpu_gpu_coco_panoptic_annotations(self): + # prepare image, target and masks_path + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + with open("./tests/fixtures/tests_samples/COCO/coco_panoptic_annotations.txt", "r") as f: + target = json.loads(f.read()) - # do_pad=False, max_height=100, max_width=100, image=200x100 -> 100x50 - image_processor = DetrImageProcessor( - size={"max_height": 100, "max_width": 100}, - do_pad=False, - ) - inputs = image_processor(images=[image_1], return_tensors="pt") - self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 50])) + target = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target} - # do_pad=False, max_height=300, max_width=100, image=200x100 -> 200x100 - image_processor = DetrImageProcessor( - size={"max_height": 300, "max_width": 100}, - do_pad=False, - ) - inputs = image_processor(images=[image_1], return_tensors="pt") + masks_path = pathlib.Path("./tests/fixtures/tests_samples/COCO/coco_panoptic") - # do_pad=True, max_height=100, max_width=100, image=200x100 -> 100x100 - image_processor = DetrImageProcessor( - size={"max_height": 100, "max_width": 100}, do_pad=True, pad_size={"height": 100, "width": 100} + processor = self.image_processor_list[1].from_pretrained("facebook/detr-resnet-50-panoptic") + # 1. run processor on CPU + encoding_cpu = processor( + images=image, annotations=target, masks_path=masks_path, return_tensors="pt", device="cpu" ) - inputs = image_processor(images=[image_1], return_tensors="pt") - self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 100])) - - # do_pad=True, max_height=300, max_width=100, image=200x100 -> 300x100 - image_processor = DetrImageProcessor( - size={"max_height": 300, "max_width": 100}, - do_pad=True, - pad_size={"height": 301, "width": 101}, + # 2. 
run processor on GPU + encoding_gpu = processor( + images=image, annotations=target, masks_path=masks_path, return_tensors="pt", device="cuda" ) - inputs = image_processor(images=[image_1], return_tensors="pt") - self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 301, 101])) - ### Check for batch - image_2 = torch.ones([100, 150, 3], dtype=torch.uint8) - - # do_pad=True, max_height=150, max_width=100, images=[200x100, 100x150] -> 150x100 - image_processor = DetrImageProcessor( - size={"max_height": 150, "max_width": 100}, - do_pad=True, - pad_size={"height": 150, "width": 100}, + # verify pixel values + self.assertEqual(encoding_cpu["pixel_values"].shape, encoding_gpu["pixel_values"].shape) + self.assertTrue( + torch.allclose( + encoding_cpu["pixel_values"][0, 0, 0, :3], + encoding_gpu["pixel_values"][0, 0, 0, :3].to("cpu"), + atol=1e-4, + ) ) - inputs = image_processor(images=[image_1, image_2], return_tensors="pt") - self.assertEqual(inputs["pixel_values"].shape, torch.Size([2, 3, 150, 100])) - - def test_longest_edge_shortest_edge_resizing_strategy(self): - image_1 = torch.ones([958, 653, 3], dtype=torch.uint8) - - # max size is set; width < height; - # do_pad=False, longest_edge=640, shortest_edge=640, image=958x653 -> 640x436 - image_processor = DetrImageProcessor( - size={"longest_edge": 640, "shortest_edge": 640}, - do_pad=False, + # verify area + self.assertTrue(torch.allclose(encoding_cpu["labels"][0]["area"], encoding_gpu["labels"][0]["area"].to("cpu"))) + # verify boxes + self.assertEqual(encoding_cpu["labels"][0]["boxes"].shape, encoding_gpu["labels"][0]["boxes"].shape) + self.assertTrue( + torch.allclose( + encoding_cpu["labels"][0]["boxes"][0], encoding_gpu["labels"][0]["boxes"][0].to("cpu"), atol=1e-3 + ) ) - inputs = image_processor(images=[image_1], return_tensors="pt") - self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 640, 436])) - - image_2 = torch.ones([653, 958, 3], dtype=torch.uint8) - # max size is set; height < width; - # do_pad=False, longest_edge=640, shortest_edge=640, image=653x958 -> 436x640 - image_processor = DetrImageProcessor( - size={"longest_edge": 640, "shortest_edge": 640}, - do_pad=False, + # verify image_id + self.assertTrue( + torch.allclose(encoding_cpu["labels"][0]["image_id"], encoding_gpu["labels"][0]["image_id"].to("cpu")) ) - inputs = image_processor(images=[image_2], return_tensors="pt") - self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 436, 640])) - - image_3 = torch.ones([100, 120, 3], dtype=torch.uint8) - # max size is set; width == size; height > max_size; - # do_pad=False, longest_edge=118, shortest_edge=100, image=120x100 -> 118x98 - image_processor = DetrImageProcessor( - size={"longest_edge": 118, "shortest_edge": 100}, - do_pad=False, + # verify is_crowd + self.assertTrue( + torch.allclose(encoding_cpu["labels"][0]["iscrowd"], encoding_gpu["labels"][0]["iscrowd"].to("cpu")) ) - inputs = image_processor(images=[image_3], return_tensors="pt") - self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 98, 118])) - - image_4 = torch.ones([128, 50, 3], dtype=torch.uint8) - # max size is set; height == size; width < max_size; - # do_pad=False, longest_edge=256, shortest_edge=50, image=50x128 -> 50x128 - image_processor = DetrImageProcessor( - size={"longest_edge": 256, "shortest_edge": 50}, - do_pad=False, + # verify class_labels + self.assertTrue( + torch.allclose( + encoding_cpu["labels"][0]["class_labels"], encoding_gpu["labels"][0]["class_labels"].to("cpu") + ) ) - inputs = 
image_processor(images=[image_4], return_tensors="pt") - self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 128, 50])) - - image_5 = torch.ones([50, 50, 3], dtype=torch.uint8) - # max size is set; height == width; width < max_size; - # do_pad=False, longest_edge=117, shortest_edge=50, image=50x50 -> 50x50 - image_processor = DetrImageProcessor( - size={"longest_edge": 117, "shortest_edge": 50}, - do_pad=False, + # verify masks + masks_sum_cpu = encoding_cpu["labels"][0]["masks"].sum() + masks_sum_gpu = encoding_gpu["labels"][0]["masks"].sum() + relative_error = torch.abs(masks_sum_cpu - masks_sum_gpu) / masks_sum_cpu + self.assertTrue(relative_error < 1e-3) + # verify orig_size + self.assertTrue( + torch.allclose(encoding_cpu["labels"][0]["orig_size"], encoding_gpu["labels"][0]["orig_size"].to("cpu")) ) - inputs = image_processor(images=[image_5], return_tensors="pt") - self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 50, 50])) + # verify size + self.assertTrue(torch.allclose(encoding_cpu["labels"][0]["size"], encoding_gpu["labels"][0]["size"].to("cpu"))) diff --git a/tests/models/grounding_dino/test_image_processing_grounding_dino.py b/tests/models/grounding_dino/test_image_processing_grounding_dino.py index 5a28397847079f..fc622ead7a711b 100644 --- a/tests/models/grounding_dino/test_image_processing_grounding_dino.py +++ b/tests/models/grounding_dino/test_image_processing_grounding_dino.py @@ -269,96 +269,97 @@ def test_batched_coco_detection_annotations(self): images = [image_0, image_1] annotations = [annotations_0, annotations_1] - image_processing = GroundingDinoImageProcessor() - encoding = image_processing( - images=images, - annotations=annotations, - return_segmentation_masks=True, - return_tensors="pt", # do_convert_annotations=True - ) - - # Check the pixel values have been padded - postprocessed_height, postprocessed_width = 800, 1066 - expected_shape = torch.Size([2, 3, postprocessed_height, postprocessed_width]) - self.assertEqual(encoding["pixel_values"].shape, expected_shape) - - # Check the bounding boxes have been adjusted for padded images - self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4])) - self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4])) - expected_boxes_0 = torch.tensor( - [ - [0.6879, 0.4609, 0.0755, 0.3691], - [0.2118, 0.3359, 0.2601, 0.1566], - [0.5011, 0.5000, 0.9979, 1.0000], - [0.5010, 0.5020, 0.9979, 0.9959], - [0.3284, 0.5944, 0.5884, 0.8112], - [0.8394, 0.5445, 0.3213, 0.9110], - ] - ) - expected_boxes_1 = torch.tensor( - [ - [0.4130, 0.2765, 0.0453, 0.2215], - [0.1272, 0.2016, 0.1561, 0.0940], - [0.3757, 0.4933, 0.7488, 0.9865], - [0.3759, 0.5002, 0.7492, 0.9955], - [0.1971, 0.5456, 0.3532, 0.8646], - [0.5790, 0.4115, 0.3430, 0.7161], - ] - ) - self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1e-3)) - self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1e-3)) - - # Check the masks have also been padded - self.assertEqual(encoding["labels"][0]["masks"].shape, torch.Size([6, 800, 1066])) - self.assertEqual(encoding["labels"][1]["masks"].shape, torch.Size([6, 800, 1066])) - - # Check if do_convert_annotations=False, then the annotations are not converted to centre_x, centre_y, width, height - # format and not in the range [0, 1] - encoding = image_processing( - images=images, - annotations=annotations, - return_segmentation_masks=True, - do_convert_annotations=False, - return_tensors="pt", - ) - 
self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4])) - self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4])) - # Convert to absolute coordinates - unnormalized_boxes_0 = torch.vstack( - [ - expected_boxes_0[:, 0] * postprocessed_width, - expected_boxes_0[:, 1] * postprocessed_height, - expected_boxes_0[:, 2] * postprocessed_width, - expected_boxes_0[:, 3] * postprocessed_height, - ] - ).T - unnormalized_boxes_1 = torch.vstack( - [ - expected_boxes_1[:, 0] * postprocessed_width, - expected_boxes_1[:, 1] * postprocessed_height, - expected_boxes_1[:, 2] * postprocessed_width, - expected_boxes_1[:, 3] * postprocessed_height, - ] - ).T - # Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max - expected_boxes_0 = torch.vstack( - [ - unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2, - unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2, - unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2, - unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2, - ] - ).T - expected_boxes_1 = torch.vstack( - [ - unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2, - unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2, - unnormalized_boxes_1[:, 0] + unnormalized_boxes_1[:, 2] / 2, - unnormalized_boxes_1[:, 1] + unnormalized_boxes_1[:, 3] / 2, - ] - ).T - self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1)) - self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1)) + for image_processing_class in self.image_processor_list: + image_processing = image_processing_class() + encoding = image_processing( + images=images, + annotations=annotations, + return_segmentation_masks=True, + return_tensors="pt", # do_convert_annotations=True + ) + + # Check the pixel values have been padded + postprocessed_height, postprocessed_width = 800, 1066 + expected_shape = torch.Size([2, 3, postprocessed_height, postprocessed_width]) + self.assertEqual(encoding["pixel_values"].shape, expected_shape) + + # Check the bounding boxes have been adjusted for padded images + self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4])) + self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4])) + expected_boxes_0 = torch.tensor( + [ + [0.6879, 0.4609, 0.0755, 0.3691], + [0.2118, 0.3359, 0.2601, 0.1566], + [0.5011, 0.5000, 0.9979, 1.0000], + [0.5010, 0.5020, 0.9979, 0.9959], + [0.3284, 0.5944, 0.5884, 0.8112], + [0.8394, 0.5445, 0.3213, 0.9110], + ] + ) + expected_boxes_1 = torch.tensor( + [ + [0.4130, 0.2765, 0.0453, 0.2215], + [0.1272, 0.2016, 0.1561, 0.0940], + [0.3757, 0.4933, 0.7488, 0.9865], + [0.3759, 0.5002, 0.7492, 0.9955], + [0.1971, 0.5456, 0.3532, 0.8646], + [0.5790, 0.4115, 0.3430, 0.7161], + ] + ) + self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1e-3)) + self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1e-3)) + + # Check the masks have also been padded + self.assertEqual(encoding["labels"][0]["masks"].shape, torch.Size([6, 800, 1066])) + self.assertEqual(encoding["labels"][1]["masks"].shape, torch.Size([6, 800, 1066])) + + # Check if do_convert_annotations=False, then the annotations are not converted to centre_x, centre_y, width, height + # format and not in the range [0, 1] + encoding = image_processing( + images=images, + annotations=annotations, + return_segmentation_masks=True, + do_convert_annotations=False, + return_tensors="pt", + ) + 
self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4])) + self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4])) + # Convert to absolute coordinates + unnormalized_boxes_0 = torch.vstack( + [ + expected_boxes_0[:, 0] * postprocessed_width, + expected_boxes_0[:, 1] * postprocessed_height, + expected_boxes_0[:, 2] * postprocessed_width, + expected_boxes_0[:, 3] * postprocessed_height, + ] + ).T + unnormalized_boxes_1 = torch.vstack( + [ + expected_boxes_1[:, 0] * postprocessed_width, + expected_boxes_1[:, 1] * postprocessed_height, + expected_boxes_1[:, 2] * postprocessed_width, + expected_boxes_1[:, 3] * postprocessed_height, + ] + ).T + # Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max + expected_boxes_0 = torch.vstack( + [ + unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2, + unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2, + unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2, + unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2, + ] + ).T + expected_boxes_1 = torch.vstack( + [ + unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2, + unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2, + unnormalized_boxes_1[:, 0] + unnormalized_boxes_1[:, 2] / 2, + unnormalized_boxes_1[:, 1] + unnormalized_boxes_1[:, 3] / 2, + ] + ).T + self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1)) + self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1)) @slow # Copied from tests.models.deformable_detr.test_image_processing_deformable_detr.DeformableDetrImageProcessingTest.test_call_pytorch_with_coco_panoptic_annotations with DeformableDetr->GroundingDino @@ -440,146 +441,148 @@ def test_batched_coco_panoptic_annotations(self): images = [image_0, image_1] annotations = [annotation_0, annotation_1] - # encode them - image_processing = GroundingDinoImageProcessor(format="coco_panoptic") - encoding = image_processing( - images=images, - annotations=annotations, - masks_path=masks_path, - return_tensors="pt", - return_segmentation_masks=True, - ) - - # Check the pixel values have been padded - postprocessed_height, postprocessed_width = 800, 1066 - expected_shape = torch.Size([2, 3, postprocessed_height, postprocessed_width]) - self.assertEqual(encoding["pixel_values"].shape, expected_shape) - - # Check the bounding boxes have been adjusted for padded images - self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4])) - self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4])) - expected_boxes_0 = torch.tensor( - [ - [0.2625, 0.5437, 0.4688, 0.8625], - [0.7719, 0.4104, 0.4531, 0.7125], - [0.5000, 0.4927, 0.9969, 0.9854], - [0.1688, 0.2000, 0.2063, 0.0917], - [0.5492, 0.2760, 0.0578, 0.2187], - [0.4992, 0.4990, 0.9984, 0.9979], - ] - ) - expected_boxes_1 = torch.tensor( - [ - [0.1576, 0.3262, 0.2814, 0.5175], - [0.4634, 0.2463, 0.2720, 0.4275], - [0.3002, 0.2956, 0.5985, 0.5913], - [0.1013, 0.1200, 0.1238, 0.0550], - [0.3297, 0.1656, 0.0347, 0.1312], - [0.2997, 0.2994, 0.5994, 0.5987], - ] - ) - self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1e-3)) - self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1e-3)) - - # Check the masks have also been padded - self.assertEqual(encoding["labels"][0]["masks"].shape, torch.Size([6, 800, 1066])) - self.assertEqual(encoding["labels"][1]["masks"].shape, torch.Size([6, 800, 1066])) - - 
# Check if do_convert_annotations=False, then the annotations are not converted to centre_x, centre_y, width, height - # format and not in the range [0, 1] - encoding = image_processing( - images=images, - annotations=annotations, - masks_path=masks_path, - return_segmentation_masks=True, - do_convert_annotations=False, - return_tensors="pt", - ) - self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4])) - self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4])) - # Convert to absolute coordinates - unnormalized_boxes_0 = torch.vstack( - [ - expected_boxes_0[:, 0] * postprocessed_width, - expected_boxes_0[:, 1] * postprocessed_height, - expected_boxes_0[:, 2] * postprocessed_width, - expected_boxes_0[:, 3] * postprocessed_height, - ] - ).T - unnormalized_boxes_1 = torch.vstack( - [ - expected_boxes_1[:, 0] * postprocessed_width, - expected_boxes_1[:, 1] * postprocessed_height, - expected_boxes_1[:, 2] * postprocessed_width, - expected_boxes_1[:, 3] * postprocessed_height, - ] - ).T - # Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max - expected_boxes_0 = torch.vstack( - [ - unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2, - unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2, - unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2, - unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2, - ] - ).T - expected_boxes_1 = torch.vstack( - [ - unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2, - unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2, - unnormalized_boxes_1[:, 0] + unnormalized_boxes_1[:, 2] / 2, - unnormalized_boxes_1[:, 1] + unnormalized_boxes_1[:, 3] / 2, - ] - ).T - self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1)) - self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1)) + for image_processing_class in self.image_processor_list: + # encode them + image_processing = image_processing_class(format="coco_panoptic") + encoding = image_processing( + images=images, + annotations=annotations, + masks_path=masks_path, + return_tensors="pt", + return_segmentation_masks=True, + ) + + # Check the pixel values have been padded + postprocessed_height, postprocessed_width = 800, 1066 + expected_shape = torch.Size([2, 3, postprocessed_height, postprocessed_width]) + self.assertEqual(encoding["pixel_values"].shape, expected_shape) + + # Check the bounding boxes have been adjusted for padded images + self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4])) + self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4])) + expected_boxes_0 = torch.tensor( + [ + [0.2625, 0.5437, 0.4688, 0.8625], + [0.7719, 0.4104, 0.4531, 0.7125], + [0.5000, 0.4927, 0.9969, 0.9854], + [0.1688, 0.2000, 0.2063, 0.0917], + [0.5492, 0.2760, 0.0578, 0.2187], + [0.4992, 0.4990, 0.9984, 0.9979], + ] + ) + expected_boxes_1 = torch.tensor( + [ + [0.1576, 0.3262, 0.2814, 0.5175], + [0.4634, 0.2463, 0.2720, 0.4275], + [0.3002, 0.2956, 0.5985, 0.5913], + [0.1013, 0.1200, 0.1238, 0.0550], + [0.3297, 0.1656, 0.0347, 0.1312], + [0.2997, 0.2994, 0.5994, 0.5987], + ] + ) + self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1e-3)) + self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1e-3)) + + # Check the masks have also been padded + self.assertEqual(encoding["labels"][0]["masks"].shape, torch.Size([6, 800, 1066])) + 
self.assertEqual(encoding["labels"][1]["masks"].shape, torch.Size([6, 800, 1066])) + + # Check if do_convert_annotations=False, then the annotations are not converted to centre_x, centre_y, width, height + # format and not in the range [0, 1] + encoding = image_processing( + images=images, + annotations=annotations, + masks_path=masks_path, + return_segmentation_masks=True, + do_convert_annotations=False, + return_tensors="pt", + ) + self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4])) + self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4])) + # Convert to absolute coordinates + unnormalized_boxes_0 = torch.vstack( + [ + expected_boxes_0[:, 0] * postprocessed_width, + expected_boxes_0[:, 1] * postprocessed_height, + expected_boxes_0[:, 2] * postprocessed_width, + expected_boxes_0[:, 3] * postprocessed_height, + ] + ).T + unnormalized_boxes_1 = torch.vstack( + [ + expected_boxes_1[:, 0] * postprocessed_width, + expected_boxes_1[:, 1] * postprocessed_height, + expected_boxes_1[:, 2] * postprocessed_width, + expected_boxes_1[:, 3] * postprocessed_height, + ] + ).T + # Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max + expected_boxes_0 = torch.vstack( + [ + unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2, + unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2, + unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2, + unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2, + ] + ).T + expected_boxes_1 = torch.vstack( + [ + unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2, + unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2, + unnormalized_boxes_1[:, 0] + unnormalized_boxes_1[:, 2] / 2, + unnormalized_boxes_1[:, 1] + unnormalized_boxes_1[:, 3] / 2, + ] + ).T + self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1)) + self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1)) # Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_max_width_max_height_resizing_and_pad_strategy with Detr->GroundingDino def test_max_width_max_height_resizing_and_pad_strategy(self): - image_1 = torch.ones([200, 100, 3], dtype=torch.uint8) - - # do_pad=False, max_height=100, max_width=100, image=200x100 -> 100x50 - image_processor = GroundingDinoImageProcessor( - size={"max_height": 100, "max_width": 100}, - do_pad=False, - ) - inputs = image_processor(images=[image_1], return_tensors="pt") - self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 50])) - - # do_pad=False, max_height=300, max_width=100, image=200x100 -> 200x100 - image_processor = GroundingDinoImageProcessor( - size={"max_height": 300, "max_width": 100}, - do_pad=False, - ) - inputs = image_processor(images=[image_1], return_tensors="pt") - - # do_pad=True, max_height=100, max_width=100, image=200x100 -> 100x100 - image_processor = GroundingDinoImageProcessor( - size={"max_height": 100, "max_width": 100}, do_pad=True, pad_size={"height": 100, "width": 100} - ) - inputs = image_processor(images=[image_1], return_tensors="pt") - self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 100])) - - # do_pad=True, max_height=300, max_width=100, image=200x100 -> 300x100 - image_processor = GroundingDinoImageProcessor( - size={"max_height": 300, "max_width": 100}, - do_pad=True, - pad_size={"height": 301, "width": 101}, - ) - inputs = image_processor(images=[image_1], return_tensors="pt") - 
self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 301, 101])) - - ### Check for batch - image_2 = torch.ones([100, 150, 3], dtype=torch.uint8) - - # do_pad=True, max_height=150, max_width=100, images=[200x100, 100x150] -> 150x100 - image_processor = GroundingDinoImageProcessor( - size={"max_height": 150, "max_width": 100}, - do_pad=True, - pad_size={"height": 150, "width": 100}, - ) - inputs = image_processor(images=[image_1, image_2], return_tensors="pt") - self.assertEqual(inputs["pixel_values"].shape, torch.Size([2, 3, 150, 100])) + for image_processing_class in self.image_processor_list: + image_1 = torch.ones([200, 100, 3], dtype=torch.uint8) + + # do_pad=False, max_height=100, max_width=100, image=200x100 -> 100x50 + image_processor = image_processing_class( + size={"max_height": 100, "max_width": 100}, + do_pad=False, + ) + inputs = image_processor(images=[image_1], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 50])) + + # do_pad=False, max_height=300, max_width=100, image=200x100 -> 200x100 + image_processor = image_processing_class( + size={"max_height": 300, "max_width": 100}, + do_pad=False, + ) + inputs = image_processor(images=[image_1], return_tensors="pt") + + # do_pad=True, max_height=100, max_width=100, image=200x100 -> 100x100 + image_processor = image_processing_class( + size={"max_height": 100, "max_width": 100}, do_pad=True, pad_size={"height": 100, "width": 100} + ) + inputs = image_processor(images=[image_1], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 100])) + + # do_pad=True, max_height=300, max_width=100, image=200x100 -> 300x100 + image_processor = image_processing_class( + size={"max_height": 300, "max_width": 100}, + do_pad=True, + pad_size={"height": 301, "width": 101}, + ) + inputs = image_processor(images=[image_1], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 301, 101])) + + ### Check for batch + image_2 = torch.ones([100, 150, 3], dtype=torch.uint8) + + # do_pad=True, max_height=150, max_width=100, images=[200x100, 100x150] -> 150x100 + image_processor = image_processing_class( + size={"max_height": 150, "max_width": 100}, + do_pad=True, + pad_size={"height": 150, "width": 100}, + ) + inputs = image_processor(images=[image_1, image_2], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([2, 3, 150, 100])) def test_longest_edge_shortest_edge_resizing_strategy(self): image_1 = torch.ones([958, 653, 3], dtype=torch.uint8) diff --git a/tests/models/yolos/test_image_processing_yolos.py b/tests/models/yolos/test_image_processing_yolos.py index 1080edd2654ed3..67508532e9c829 100644 --- a/tests/models/yolos/test_image_processing_yolos.py +++ b/tests/models/yolos/test_image_processing_yolos.py @@ -553,47 +553,48 @@ def test_batched_coco_panoptic_annotations(self): # Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_max_width_max_height_resizing_and_pad_strategy with Detr->Yolos def test_max_width_max_height_resizing_and_pad_strategy(self): - image_1 = torch.ones([200, 100, 3], dtype=torch.uint8) - - # do_pad=False, max_height=100, max_width=100, image=200x100 -> 100x50 - image_processor = YolosImageProcessor( - size={"max_height": 100, "max_width": 100}, - do_pad=False, - ) - inputs = image_processor(images=[image_1], return_tensors="pt") - self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 50])) - - # do_pad=False, max_height=300, 
max_width=100, image=200x100 -> 200x100 - image_processor = YolosImageProcessor( - size={"max_height": 300, "max_width": 100}, - do_pad=False, - ) - inputs = image_processor(images=[image_1], return_tensors="pt") - - # do_pad=True, max_height=100, max_width=100, image=200x100 -> 100x100 - image_processor = YolosImageProcessor( - size={"max_height": 100, "max_width": 100}, do_pad=True, pad_size={"height": 100, "width": 100} - ) - inputs = image_processor(images=[image_1], return_tensors="pt") - self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 100])) - - # do_pad=True, max_height=300, max_width=100, image=200x100 -> 300x100 - image_processor = YolosImageProcessor( - size={"max_height": 300, "max_width": 100}, - do_pad=True, - pad_size={"height": 301, "width": 101}, - ) - inputs = image_processor(images=[image_1], return_tensors="pt") - self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 301, 101])) - - ### Check for batch - image_2 = torch.ones([100, 150, 3], dtype=torch.uint8) - - # do_pad=True, max_height=150, max_width=100, images=[200x100, 100x150] -> 150x100 - image_processor = YolosImageProcessor( - size={"max_height": 150, "max_width": 100}, - do_pad=True, - pad_size={"height": 150, "width": 100}, - ) - inputs = image_processor(images=[image_1, image_2], return_tensors="pt") - self.assertEqual(inputs["pixel_values"].shape, torch.Size([2, 3, 150, 100])) + for image_processing_class in self.image_processor_list: + image_1 = torch.ones([200, 100, 3], dtype=torch.uint8) + + # do_pad=False, max_height=100, max_width=100, image=200x100 -> 100x50 + image_processor = image_processing_class( + size={"max_height": 100, "max_width": 100}, + do_pad=False, + ) + inputs = image_processor(images=[image_1], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 50])) + + # do_pad=False, max_height=300, max_width=100, image=200x100 -> 200x100 + image_processor = image_processing_class( + size={"max_height": 300, "max_width": 100}, + do_pad=False, + ) + inputs = image_processor(images=[image_1], return_tensors="pt") + + # do_pad=True, max_height=100, max_width=100, image=200x100 -> 100x100 + image_processor = image_processing_class( + size={"max_height": 100, "max_width": 100}, do_pad=True, pad_size={"height": 100, "width": 100} + ) + inputs = image_processor(images=[image_1], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 100])) + + # do_pad=True, max_height=300, max_width=100, image=200x100 -> 300x100 + image_processor = image_processing_class( + size={"max_height": 300, "max_width": 100}, + do_pad=True, + pad_size={"height": 301, "width": 101}, + ) + inputs = image_processor(images=[image_1], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 301, 101])) + + ### Check for batch + image_2 = torch.ones([100, 150, 3], dtype=torch.uint8) + + # do_pad=True, max_height=150, max_width=100, images=[200x100, 100x150] -> 150x100 + image_processor = image_processing_class( + size={"max_height": 150, "max_width": 100}, + do_pad=True, + pad_size={"height": 150, "width": 100}, + ) + inputs = image_processor(images=[image_1, image_2], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([2, 3, 150, 100])) diff --git a/tests/test_image_processing_common.py b/tests/test_image_processing_common.py index a3fbb6eeb78178..7d89b43ce35ba4 100644 --- a/tests/test_image_processing_common.py +++ b/tests/test_image_processing_common.py @@ -191,7 
+191,7 @@ def measure_time(image_processor, image): dummy_images = torch.randint(0, 255, (4, 3, 224, 224), dtype=torch.uint8) image_processor_slow = self.image_processing_class(**self.image_processor_dict) - image_processor_fast = self.fast_image_processing_class() + image_processor_fast = self.fast_image_processing_class(**self.image_processor_dict) fast_time = measure_time(image_processor_fast, dummy_images) slow_time = measure_time(image_processor_slow, dummy_images) From f701b98e4a10d3b882834308f2f0c9398b19d343 Mon Sep 17 00:00:00 2001 From: Matt Date: Mon, 21 Oct 2024 14:35:57 +0100 Subject: [PATCH 057/385] Add a doc section on writing generation prompts (#34248) Add a section on writing generation prompts --- docs/source/en/chat_templating.md | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/docs/source/en/chat_templating.md b/docs/source/en/chat_templating.md index de3d056c916f5f..1bdf05a26c8d08 100644 --- a/docs/source/en/chat_templating.md +++ b/docs/source/en/chat_templating.md @@ -943,6 +943,35 @@ all implementations of Jinja: - Directly rendering a dict or list may give different results in other implementations (for example, string entries might change from single-quoted to double-quoted). Adding the `tojson` filter can help to ensure consistency here. +### Writing generation prompts + +We mentioned above that `add_generation_prompt` is a special variable that will be accessible inside your template, +and is controlled by the user setting the `add_generation_prompt` flag. If your model expects a header for +assistant messages, then your template must support adding the header when `add_generation_prompt` is set. + +Here is an example of a template that formats messages ChatML-style, with generation prompt support: + +```text +{{- bos_token }} +{%- for message in messages %} + {{- '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n' }} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} +{%- endif %} +``` + +The exact content of the assistant header will depend on your specific model, but it should always be **the string +that represents the start of an assistant message**, so that if the user applies your template with +`add_generation_prompt=True` and then generates text, the model will write an assistant response. Also note that some +models do not need a generation prompt, because assistant messages always begin immediately after user messages. +This is particularly common for LLaMA and Mistral models, where assistant messages begin immediately after the `[/INST]` +token that ends user messages. In these cases, the template can ignore the `add_generation_prompt` flag. + +Generation prompts are important! If your model requires a generation prompt but it is not set in the template, then +model generations will likely be severely degraded, or the model may display unusual behaviour like continuing +the final user message! + ### Writing and debugging larger templates When this feature was introduced, most templates were quite small, the Jinja equivalent of a "one-liner" script. From 32590b5ecb50f1c56be32cb0e686196be9427f2f Mon Sep 17 00:00:00 2001 From: Andrés Marafioti Date: Mon, 21 Oct 2024 14:21:52 -0300 Subject: [PATCH 058/385] Fix method name which changes in tutorial (#34252) The method `model_download_tool` was called `model_download_counter` earlier in the tutorial, which raises an error when following the code.
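For reference, a minimal sketch of the renamed tool in use after this fix. The signature, docstring opening, and the agent's call are taken from the diff below; the `Args` section and the `list_models` body are not shown in these hunks and are assumptions reconstructed from the tutorial.

```python
from huggingface_hub import list_models  # assumed import; the tutorial builds on the Hub API
from transformers import tool


@tool
def model_download_tool(task: str) -> str:
    """
    This is a tool that returns the most downloaded model of a given task on the Hugging Face Hub.
    It returns the name of the checkpoint.

    Args:
        task: The task for which to get the download count.
    """
    # Assumed body: pick the most downloaded checkpoint for the given task.
    most_downloaded_model = next(iter(list_models(filter=task, sort="downloads", direction=-1)))
    return most_downloaded_model.id


# The agent-generated code now calls the tool under its corrected name:
most_downloaded_model = model_download_tool(task="text-to-video")
print(f"The most downloaded model for the 'text-to-video' task is {most_downloaded_model}.")
```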
--- docs/source/en/agents.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/en/agents.md b/docs/source/en/agents.md index ac06c04d9baaa5..721e348f89fe22 100644 --- a/docs/source/en/agents.md +++ b/docs/source/en/agents.md @@ -332,7 +332,7 @@ This code can quickly be converted into a tool, just by wrapping it in a functio from transformers import tool @tool -def model_download_counter(task: str) -> str: +def model_download_tool(task: str) -> str: """ This is a tool that returns the most downloaded model of a given task on the Hugging Face Hub. It returns the name of the checkpoint. @@ -345,7 +345,7 @@ def model_download_counter(task: str) -> str: ``` The function needs: -- A clear name. The name usually describes what the tool does. Since the code returns the model with the most downloads for a task, let's put `model_download_counter`. +- A clear name. The name usually describes what the tool does. Since the code returns the model with the most downloads for a task, let's put `model_download_tool`. - Type hints on both inputs and output - A description, that includes an 'Args:' part where each argument is described (without a type indication this time, it will be pulled from the type hint). All these will be automatically baked into the agent's system prompt upon initialization: so strive to make them as clear as possible! @@ -367,7 +367,7 @@ You get the following: ======== New task ======== Can you give me the name of the model that has the most downloads in the 'text-to-video' task on the Hugging Face Hub? ==== Agent is executing the code below: -most_downloaded_model = model_download_counter(task="text-to-video") +most_downloaded_model = model_download_tool(task="text-to-video") print(f"The most downloaded model for the 'text-to-video' task is {most_downloaded_model}.") ==== ``` From 21d5025826857e11a75ef7b23ac15a607be4fc54 Mon Sep 17 00:00:00 2001 From: Raushan Turganbay Date: Tue, 22 Oct 2024 06:54:44 +0200 Subject: [PATCH 059/385] Attn implementation for composite models (#32238) * first try * codestyle * idefics2 is happy * [run-slow] llava, llava_next, video_llava, vipllava, llava_next_video, idefics, idefics2, kosmos2, fuyu, blip, blip_2, instructblip, instructblipvideo, paligemma * fix-copies * [run-slow] llava, llava_next, video_llava, vipllava, llava_next_video, idefics, idefics2, kosmos2, fuyu, blip, blip_2, instructblip, instructblipvideo * blip-2 needs to init vision from config * when was this removed O_o * minor fix * tests * this way? * tests * model-agnostic code * codestyle * add tests for idefics * modify general test for VLMs * no generation test for vlm yet! * no generation test here also * warn in VIT-SDPA if output attn * add more tests * user can pass dict as attn impl * repo consistency * update * musicgen * no prints * forgot speech enc-dec and clip * how many composite models we have?
* musicgen melody is same as musicgen * +siglip * fix tests + add some more * remove idefics custom overridden code * make idefics2 automappable * nits * skip tests * doctests * Update src/transformers/models/idefics2/configuration_idefics2.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update tests/models/clip/test_modeling_clip.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update tests/models/idefics2/test_modeling_idefics2.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update tests/models/idefics2/test_modeling_idefics2.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update src/transformers/configuration_utils.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * major update, no need for automap * clean up * add FA2 test * more tests * style * skip tests * why did these start failing now? * no attributes for FA2 needed * one tiny test * address comment about FA2 false warning * style * add new models and resolve conflicts * fix copies * let it be this way for now, come back tomorrow to review * some more fixes * update * more updates * update * fix copies * style and tests * another big update * fix tests * fix tests * update * another update * fix tests * fix copies * fix tests --------- Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> --- docs/source/en/perf_infer_gpu_one.md | 16 + src/transformers/configuration_utils.py | 5 + src/transformers/modeling_utils.py | 58 ++- .../modeling_audio_spectrogram_transformer.py | 18 +- .../models/blip_2/modeling_blip_2.py | 17 +- src/transformers/models/clip/modeling_clip.py | 8 +- src/transformers/models/deit/modeling_deit.py | 18 +- .../modeling_encoder_decoder.py | 9 +- .../models/idefics/modeling_idefics.py | 12 - .../models/idefics2/configuration_idefics2.py | 17 +- .../models/idefics2/modeling_idefics2.py | 238 ++++++----- .../models/idefics3/modeling_idefics3.py | 12 +- .../instructblip/modeling_instructblip.py | 9 +- .../modeling_instructblipvideo.py | 9 +- .../models/llava/modeling_llava.py | 15 +- .../models/llava_next/modeling_llava_next.py | 15 +- .../modeling_llava_next_video.py | 15 +- .../modeling_llava_onevision.py | 8 +- .../models/mllama/modeling_mllama.py | 8 +- .../models/musicgen/configuration_musicgen.py | 17 - .../models/musicgen/modeling_musicgen.py | 5 +- .../configuration_musicgen_melody.py | 17 - .../modeling_musicgen_melody.py | 5 +- .../omdet_turbo/modeling_omdet_turbo.py | 2 +- .../models/paligemma/modeling_paligemma.py | 22 +- .../qwen2_audio/modeling_qwen2_audio.py | 15 +- .../models/qwen2_vl/modeling_qwen2_vl.py | 4 +- src/transformers/models/rag/modeling_rag.py | 10 +- .../models/siglip/modeling_siglip.py | 9 +- .../modeling_speech_encoder_decoder.py | 8 +- .../video_llava/modeling_video_llava.py | 15 +- .../models/vipllava/modeling_vipllava.py | 15 +- .../modeling_vision_encoder_decoder.py | 8 +- .../modeling_vision_text_dual_encoder.py | 10 +- src/transformers/models/vit/modeling_vit.py | 18 +- .../models/vit_mae/modeling_vit_mae.py | 18 +- .../models/vit_msn/modeling_vit_msn.py | 18 +- .../models/yolos/modeling_yolos.py | 18 +- tests/models/blip_2/test_modeling_blip_2.py | 123 ++++++ tests/models/clip/test_modeling_clip.py | 83 +++- .../test_modeling_encoder_decoder.py | 74 +++- tests/models/gemma2/test_modeling_gemma2.py | 4 + tests/models/idefics/test_modeling_idefics.py | 12 +-
.../models/idefics2/test_modeling_idefics2.py | 40 ++ .../test_modeling_instructblip.py | 62 +++ .../test_modeling_instructblipvideo.py | 62 +++ tests/models/kosmos2/test_modeling_kosmos2.py | 14 +- tests/models/llava/test_modeling_llava.py | 11 + .../llava_next/test_modeling_llava_next.py | 11 + .../test_modeling_llava_next_video.py | 11 + .../test_modeling_llava_onevision.py | 11 + tests/models/mllama/test_modeling_mllama.py | 1 + .../models/musicgen/test_modeling_musicgen.py | 185 ++++++-- .../test_modeling_musicgen_melody.py | 401 +++++++----------- .../paligemma/test_modeling_paligemma.py | 11 + .../qwen2_audio/test_modeling_qwen2_audio.py | 50 +++ tests/models/siglip/test_modeling_siglip.py | 80 +++- .../test_modeling_speech_encoder_decoder.py | 68 ++- .../video_llava/test_modeling_video_llava.py | 11 + .../models/vipllava/test_modeling_vipllava.py | 11 + .../test_modeling_vision_encoder_decoder.py | 314 +++++++++++++- tests/test_modeling_common.py | 242 +++++++++-- tests/utils/test_configuration_utils.py | 1 + utils/check_repo.py | 4 +- 64 files changed, 1925 insertions(+), 713 deletions(-) diff --git a/docs/source/en/perf_infer_gpu_one.md b/docs/source/en/perf_infer_gpu_one.md index 9c03d06d94ad48..67bd31fdaeede5 100644 --- a/docs/source/en/perf_infer_gpu_one.md +++ b/docs/source/en/perf_infer_gpu_one.md @@ -79,6 +79,7 @@ FlashAttention-2 is currently supported for the following architectures: * [OLMo](https://huggingface.co/docs/transformers/model_doc/olmo#transformers.OlmoModel) * [OLMoE](https://huggingface.co/docs/transformers/model_doc/olmoe#transformers.OlmoeModel) * [OPT](https://huggingface.co/docs/transformers/model_doc/opt#transformers.OPTModel) +* [PaliGemma](https://huggingface.co/docs/transformers/model_doc/paligemma#transformers.PaliGemmaForConditionalGeneration) * [Phi](https://huggingface.co/docs/transformers/model_doc/phi#transformers.PhiModel) * [Phi3](https://huggingface.co/docs/transformers/model_doc/phi3#transformers.Phi3Model) * [PhiMoE](https://huggingface.co/docs/transformers/model_doc/phimoe#transformers.PhimoeModel) @@ -88,6 +89,10 @@ FlashAttention-2 is currently supported for the following architectures: * [Qwen2Audio](https://huggingface.co/docs/transformers/model_doc/qwen2_audio#transformers.Qwen2AudioEncoder) * [Qwen2MoE](https://huggingface.co/docs/transformers/model_doc/qwen2_moe#transformers.Qwen2MoeModel) * [Qwen2VL](https://huggingface.co/docs/transformers/model_doc/qwen2_vl#transformers.Qwen2VLModel) +* [RAG](https://huggingface.co/docs/transformers/model_doc/rag#transformers.RagModel) +* [SpeechEncoderDecoder](https://huggingface.co/docs/transformers/model_doc/speech_encoder_decoder#transformers.SpeechEncoderDecoderModel) +* [VisionEncoderDecoder](https://huggingface.co/docs/transformers/model_doc/vision_encoder_decoder#transformers.VisionEncoderDecoderModel) +* [VisionTextDualEncoder](https://huggingface.co/docs/transformers/model_doc/vision_text_dual_encoder#transformers.VisionTextDualEncoderModel) * [Whisper](https://huggingface.co/docs/transformers/model_doc/whisper#transformers.WhisperModel) * [Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2#transformers.Wav2Vec2Model) * [Hubert](https://huggingface.co/docs/transformers/model_doc/hubert#transformers.HubertModel) @@ -225,6 +230,7 @@ For now, Transformers supports SDPA inference and training for the following arc * [Dinov2](https://huggingface.co/docs/transformers/en/model_doc/dinov2) * 
[DistilBert](https://huggingface.co/docs/transformers/model_doc/distilbert#transformers.DistilBertModel) * [Dpr](https://huggingface.co/docs/transformers/model_doc/dpr#transformers.DprReader) +* [EncoderDecoder](https://huggingface.co/docs/transformers/model_doc/encoder_decoder#transformers.EncoderDecoderModel) * [Falcon](https://huggingface.co/docs/transformers/model_doc/falcon#transformers.FalconModel) * [Gemma](https://huggingface.co/docs/transformers/model_doc/gemma#transformers.GemmaModel) * [Gemma2](https://huggingface.co/docs/transformers/model_doc/gemma2#transformers.Gemma2Model) @@ -233,11 +239,16 @@ For now, Transformers supports SDPA inference and training for the following arc * [GPTNeoX](https://huggingface.co/docs/transformers/model_doc/gpt_neox#transformers.GPTNeoXModel) * [Hubert](https://huggingface.co/docs/transformers/model_doc/hubert#transformers.HubertModel) * [Idefics](https://huggingface.co/docs/transformers/model_doc/idefics#transformers.IdeficsModel) +* [Idefics2](https://huggingface.co/docs/transformers/model_doc/idefics2#transformers.Idefics2Model) +* [Idefics3](https://huggingface.co/docs/transformers/model_doc/idefics3#transformers.Idefics3Model) * [Granite](https://huggingface.co/docs/transformers/model_doc/granite#transformers.GraniteModel) * [GraniteMoe](https://huggingface.co/docs/transformers/model_doc/granitemoe#transformers.GraniteMoeModel) * [JetMoe](https://huggingface.co/docs/transformers/model_doc/jetmoe#transformers.JetMoeModel) * [Jamba](https://huggingface.co/docs/transformers/model_doc/jamba#transformers.JambaModel) * [Llama](https://huggingface.co/docs/transformers/model_doc/llama#transformers.LlamaModel) +* [Llava](https://huggingface.co/docs/transformers/model_doc/llava) +* [Llava-NeXT](https://huggingface.co/docs/transformers/model_doc/llava_next) +* [Llava-NeXT-Video](https://huggingface.co/docs/transformers/model_doc/llava_next_video) * [LLaVA-Onevision](https://huggingface.co/docs/transformers/model_doc/llava_onevision) * [M2M100](https://huggingface.co/docs/transformers/model_doc/m2m_100#transformers.M2M100Model) * [Mimi](https://huggingface.co/docs/transformers/model_doc/mimi) @@ -277,10 +288,15 @@ For now, Transformers supports SDPA inference and training for the following arc * [Musicgen](https://huggingface.co/docs/transformers/model_doc/musicgen#transformers.MusicgenModel) * [MusicGen Melody](https://huggingface.co/docs/transformers/model_doc/musicgen_melody#transformers.MusicgenMelodyModel) * [Nemotron](https://huggingface.co/docs/transformers/model_doc/nemotron) +* [SpeechEncoderDecoder](https://huggingface.co/docs/transformers/model_doc/speech_encoder_decoder#transformers.SpeechEncoderDecoderModel) +* [VideoLlava](https://huggingface.co/docs/transformers/model_doc/video_llava) +* [VipLlava](https://huggingface.co/docs/transformers/model_doc/vipllava) +* [VisionEncoderDecoder](https://huggingface.co/docs/transformers/model_doc/vision_encoder_decoder#transformers.VisionEncoderDecoderModel) * [ViT](https://huggingface.co/docs/transformers/model_doc/vit#transformers.ViTModel) * [ViTHybrid](https://huggingface.co/docs/transformers/model_doc/vit_hybrid#transformers.ViTHybridModel) * [ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae#transformers.ViTMAEModel) * [ViTMSN](https://huggingface.co/docs/transformers/model_doc/vit_msn#transformers.ViTMSNModel) +* [VisionTextDualEncoder](https://huggingface.co/docs/transformers/model_doc/vision_text_dual_encoder#transformers.VisionTextDualEncoderModel) * 
[VideoMAE](https://huggingface.co/docs/transformers/model_doc/videomae#transformers.VideoMAEModell) * [ViViT](https://huggingface.co/docs/transformers/model_doc/vivit#transformers.VivitModel) * [wav2vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2#transformers.Wav2Vec2Model) diff --git a/src/transformers/configuration_utils.py b/src/transformers/configuration_utils.py index 8bc08ca625961e..1d892c49a231fc 100755 --- a/src/transformers/configuration_utils.py +++ b/src/transformers/configuration_utils.py @@ -296,6 +296,7 @@ def __init__(self, **kwargs): # Attention implementation to use, if relevant. self._attn_implementation_internal = kwargs.pop("attn_implementation", None) + self._attn_implementation_autoset = False # Drop the transformers version info self.transformers_version = kwargs.pop("transformers_version", None) @@ -776,6 +777,10 @@ def __eq__(self, other): def __repr__(self): return f"{self.__class__.__name__} {self.to_json_string()}" + def __iter__(self): + for attr in self.__dict__: + yield attr + def to_diff_dict(self) -> Dict[str, Any]: """ Removes all attributes from config which correspond to the default config attributes for better readability and diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index c84aec21a32663..a6fbd7b1a91453 100755 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -1420,9 +1420,10 @@ def __init__(self, config: PretrainedConfig, *inputs, **kwargs): f"`model = {self.__class__.__name__}.from_pretrained(PRETRAINED_MODEL_NAME)`" ) # Save config and origin of the pretrained weights if given in model - config = self._autoset_attn_implementation( - config, torch_dtype=torch.get_default_dtype(), check_device_map=False - ) + if not getattr(config, "_attn_implementation_autoset", False): + config = self._autoset_attn_implementation( + config, torch_dtype=torch.get_default_dtype(), check_device_map=False + ) self.config = config self.name_or_path = config.name_or_path @@ -1500,6 +1501,9 @@ def _from_config(cls, config, **kwargs): torch_dtype (`torch.dtype`, *optional*): Override the default `torch.dtype` and load the model under this dtype. """ + # when we init a model from within another model (e.g. VLMs) and dispatch on FA2 + # a warning is raised that dtype should be fp16. Since we never pass dtype from within + # modeling code, we can try to infer it here same way as done in `from_pretrained` torch_dtype = kwargs.pop("torch_dtype", torch.get_default_dtype()) use_flash_attention_2 = kwargs.pop("use_flash_attention_2", False) @@ -1518,12 +1522,13 @@ def _from_config(cls, config, **kwargs): attn_implementation = None config._attn_implementation = kwargs.pop("attn_implementation", attn_implementation) - config = cls._autoset_attn_implementation( - config, - use_flash_attention_2=use_flash_attention_2, - check_device_map=False, - torch_dtype=torch_dtype, - ) + if not getattr(config, "_attn_implementation_autoset", False): + config = cls._autoset_attn_implementation( + config, + use_flash_attention_2=use_flash_attention_2, + check_device_map=False, + torch_dtype=torch_dtype, + ) if is_deepspeed_zero3_enabled(): import deepspeed @@ -1570,7 +1575,11 @@ def _autoset_attn_implementation( ' We recommend to just use `attn_implementation="flash_attention_2"` when loading the model.' 
) - if config._attn_implementation not in ["eager", "sdpa", "flash_attention_2"]: + if not isinstance(config._attn_implementation, dict) and config._attn_implementation not in [ + "eager", + "sdpa", + "flash_attention_2", + ]: message = f'Specified `attn_implementation="{config._attn_implementation}"` is not supported. The only possible arguments are `attn_implementation="eager"` (manual attention implementation)' if cls._supports_flash_attn_2: message += ', `"attn_implementation=flash_attention_2"` (implementation using flash attention 2)' @@ -1581,6 +1590,22 @@ def _autoset_attn_implementation( # If a config is passed with a preset attn_implementation, we skip the automatic dispatch and use the user-provided config, with hard checks that the requested attention implementation is available. requested_attn_implementation = config._attn_implementation_internal + # Composite models consisting of several PretrainedModels have to specify attention impl as a dict + # where keys are sub-config names. But most people will specify one `str` which means that should dispatch it + # for all sub-models. + # Below we check if a config is composite and manually prepare a dict of attn impl if not already passed as a dict. + # Later each sub-module will dispatch with its own attn impl, by calling `XXXModel._from_config(config.text_config)` + # If any of sub-modules doesn't support requested attn, an error will be raised. See https://github.com/huggingface/transformers/pull/32238 + for key in config: + if isinstance(getattr(config, key), PretrainedConfig): + sub_config = getattr(config, key) + curr_attn_implementation = ( + requested_attn_implementation + if not isinstance(requested_attn_implementation, dict) + else requested_attn_implementation.get(key, None) + ) + sub_config._attn_implementation_internal = curr_attn_implementation + if use_flash_attention_2: logger.warning_once( 'The model was loaded with use_flash_attention_2=True, which is deprecated and may be removed in a future release. Please use `attn_implementation="flash_attention_2"` instead.' @@ -1611,9 +1636,12 @@ def _autoset_attn_implementation( "Using the `SDPA` attention implementation on multi-gpu setup with ROCM may lead to performance issues due to the FA backend. Disabling it to use alternative backends." ) torch.backends.cuda.enable_flash_sdp(False) + elif isinstance(requested_attn_implementation, dict): + config._attn_implementation = None else: config._attn_implementation = "eager" + config._attn_implementation_autoset = True return config @classmethod @@ -2771,6 +2799,9 @@ def save_pretrained( # Attach architecture to the config model_to_save.config.architectures = [model_to_save.__class__.__name__] + # Unset attn implementation so it can be set to another one when loading back + model_to_save.config._attn_implementation_autoset = False + # If we have a custom model, we copy the file defining it in the folder and set the attributes so it can be # loaded from the Hub. if self._auto_class is not None: @@ -4055,9 +4086,10 @@ def from_pretrained( init_contexts.append(init_empty_weights()) config = copy.deepcopy(config) # We do not want to modify the config inplace in from_pretrained. 
- config = cls._autoset_attn_implementation( - config, use_flash_attention_2=use_flash_attention_2, torch_dtype=torch_dtype, device_map=device_map - ) + if not getattr(config, "_attn_implementation_autoset", False): + config = cls._autoset_attn_implementation( + config, use_flash_attention_2=use_flash_attention_2, torch_dtype=torch_dtype, device_map=device_map + ) with ContextManagers(init_contexts): # Let's make sure we don't run the init function of buffer modules diff --git a/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py b/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py index beb249202b96c7..491c6ce164611a 100644 --- a/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py +++ b/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py @@ -176,8 +176,24 @@ def __init__(self, config: ASTConfig) -> None: self.attention_probs_dropout_prob = config.attention_probs_dropout_prob def forward( - self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False + self, + hidden_states: torch.FloatTensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: + if output_attentions or head_mask is not None: + logger.warning_once( + "`ASTSdpaAttention` is used but `torch.nn.functional.scaled_dot_product_attention` does not support " + "`output_attentions=True` or `head_mask`. Falling back to the manual attention implementation, but " + "specifying the manual implementation will be required from Transformers version v5.0.0 onwards. " + 'This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' + ) + return super().forward( + hidden_states=hidden_states, + head_mask=head_mask, + output_attentions=output_attentions, + ) + mixed_query_layer = self.query(hidden_states) key_layer = self.transpose_for_scores(self.key(hidden_states)) diff --git a/src/transformers/models/blip_2/modeling_blip_2.py b/src/transformers/models/blip_2/modeling_blip_2.py index 4b0ed4f71d9c95..eba82cd1b3c8e4 100644 --- a/src/transformers/models/blip_2/modeling_blip_2.py +++ b/src/transformers/models/blip_2/modeling_blip_2.py @@ -410,6 +410,7 @@ class Blip2PreTrainedModel(PreTrainedModel): config_class = Blip2Config base_model_prefix = "blip" supports_gradient_checkpointing = True + _no_split_modules = [ "Blip2Attention", "Blip2QFormerMultiHeadAttention", @@ -1455,13 +1456,9 @@ def __init__(self, config: Blip2Config): self.language_projection = nn.Linear(config.qformer_config.hidden_size, config.text_config.hidden_size) if config.use_decoder_only_language_model: - language_model = AutoModelForCausalLM.from_config( - config.text_config, attn_implementation=config._attn_implementation - ) + language_model = AutoModelForCausalLM.from_config(config.text_config) else: - language_model = AutoModelForSeq2SeqLM.from_config( - config.text_config, attn_implementation=config._attn_implementation - ) + language_model = AutoModelForSeq2SeqLM.from_config(config.text_config) # Update _tied_weights_keys using the base model used. 
if language_model._tied_weights_keys is not None: @@ -2020,13 +2017,9 @@ def __init__(self, config: Blip2Config): self.language_projection = nn.Linear(config.qformer_config.hidden_size, config.text_config.hidden_size) if config.use_decoder_only_language_model: - language_model = AutoModelForCausalLM.from_config( - config.text_config, attn_implementation=config._attn_implementation - ) + language_model = AutoModelForCausalLM.from_config(config.text_config) else: - language_model = AutoModelForSeq2SeqLM.from_config( - config.text_config, attn_implementation=config._attn_implementation - ) + language_model = AutoModelForSeq2SeqLM.from_config(config.text_config) # Update _tied_weights_keys using the base model used. if language_model._tied_weights_keys is not None: diff --git a/src/transformers/models/clip/modeling_clip.py b/src/transformers/models/clip/modeling_clip.py index f946f828eec639..04a3a73de0455e 100644 --- a/src/transformers/models/clip/modeling_clip.py +++ b/src/transformers/models/clip/modeling_clip.py @@ -1204,10 +1204,10 @@ def __init__(self, config: CLIPConfig): self.text_embed_dim = text_config.hidden_size self.vision_embed_dim = vision_config.hidden_size - text_model = CLIPTextModel._from_config(text_config, attn_implementation=config._attn_implementation) + text_model = CLIPTextModel._from_config(text_config) self.text_model = text_model.text_model - vision_model = CLIPVisionModel._from_config(vision_config, attn_implementation=config._attn_implementation) + vision_model = CLIPVisionModel._from_config(vision_config) self.vision_model = vision_model.vision_model self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False) @@ -1590,9 +1590,7 @@ def __init__(self, config: CLIPConfig) -> None: super().__init__(config) self.num_labels = config.num_labels - vision_model = CLIPVisionModel._from_config( - config.vision_config, attn_implementation=config._attn_implementation - ) + vision_model = CLIPVisionModel._from_config(config.vision_config) self.vision_model = vision_model.vision_model # Classifier head diff --git a/src/transformers/models/deit/modeling_deit.py b/src/transformers/models/deit/modeling_deit.py index 03194c15d98f1c..e0b053e43906b8 100644 --- a/src/transformers/models/deit/modeling_deit.py +++ b/src/transformers/models/deit/modeling_deit.py @@ -248,8 +248,24 @@ def __init__(self, config: DeiTConfig) -> None: self.attention_probs_dropout_prob = config.attention_probs_dropout_prob def forward( - self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False + self, + hidden_states: torch.FloatTensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: + if output_attentions or head_mask is not None: + logger.warning_once( + "`DeiTSdpaAttention` is used but `torch.nn.functional.scaled_dot_product_attention` does not support " + "`output_attentions=True` or `head_mask`. Falling back to the manual attention implementation, but " + "specifying the manual implementation will be required from Transformers version v5.0.0 onwards. " + 'This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' 
+ ) + return super().forward( + hidden_states=hidden_states, + head_mask=head_mask, + output_attentions=output_attentions, + ) + mixed_query_layer = self.query(hidden_states) key_layer = self.transpose_for_scores(self.key(hidden_states)) diff --git a/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py b/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py index d1029160dd0cc2..9ebedce07fb833 100644 --- a/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py +++ b/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py @@ -180,6 +180,8 @@ class EncoderDecoderModel(PreTrainedModel, GenerationMixin): main_input_name = "input_ids" supports_gradient_checkpointing = True _supports_param_buffer_assignment = False + _supports_flash_attn_2 = True + _supports_sdpa = True def __init__( self, @@ -210,12 +212,12 @@ def __init__( if encoder is None: from ..auto.modeling_auto import AutoModel - encoder = AutoModel.from_config(config.encoder, attn_implementation=config._attn_implementation) + encoder = AutoModel.from_config(config.encoder) if decoder is None: from ..auto.modeling_auto import AutoModelForCausalLM - decoder = AutoModelForCausalLM.from_config(config.decoder, attn_implementation=config._attn_implementation) + decoder = AutoModelForCausalLM.from_config(config.decoder) self.encoder = encoder self.decoder = decoder @@ -233,6 +235,9 @@ def __init__( # make sure that the individual model's config refers to the shared config # so that the updates to the config will be synced + # update `_attn_implementation` because the attn is set in a deepcopied config within PreTrainedModel + self.config.encoder._attn_implementation = self.encoder.config._attn_implementation + self.config.decoder._attn_implementation = self.decoder.config._attn_implementation self.encoder.config = self.config.encoder self.decoder.config = self.config.decoder diff --git a/src/transformers/models/idefics/modeling_idefics.py b/src/transformers/models/idefics/modeling_idefics.py index bc983744559fc9..8bd24728b03885 100644 --- a/src/transformers/models/idefics/modeling_idefics.py +++ b/src/transformers/models/idefics/modeling_idefics.py @@ -933,18 +933,6 @@ def _init_weights(self, module): if module.padding_idx is not None: module.weight.data[module.padding_idx].zero_() - # Adapted from transformers.modeling_utils.PreTrainedModel._check_and_enable_sdpa - @classmethod - def _check_and_enable_sdpa(cls, config, hard_check_only: bool = False) -> PretrainedConfig: - # We remove the checks on `is_torch_sdpa_available()` and `cls._supports_sdpa` as Falcon supports SDPA from torch==2.0.0 (no requirement on 2.1). - _is_bettertransformer = getattr(cls, "use_bettertransformer", False) - if _is_bettertransformer: - return config - - if not hard_check_only: - config._attn_implementation = "sdpa" - return config - LLAMA_INPUTS_DOCSTRING = r""" Args: diff --git a/src/transformers/models/idefics2/configuration_idefics2.py b/src/transformers/models/idefics2/configuration_idefics2.py index 1333895407e6e5..64743d1cd470e7 100644 --- a/src/transformers/models/idefics2/configuration_idefics2.py +++ b/src/transformers/models/idefics2/configuration_idefics2.py @@ -57,7 +57,7 @@ class Idefics2VisionConfig(PretrainedConfig): The epsilon used by the layer normalization layers. attention_dropout (`float`, *optional*, defaults to 0.0): The dropout ratio for the attention probabilities. 
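`EncoderDecoderModel` above (like the speech/vision encoder-decoder wrappers, Musicgen and the dual-encoder model patched further down) now writes the backend each sub-model actually resolved back onto the shared config, because resolution happens on a deep-copied config inside `PreTrainedModel`. A quick way to observe this, with checkpoint names chosen purely for illustration:

```python
from transformers import EncoderDecoderModel

model = EncoderDecoderModel.from_encoder_decoder_pretrained(
    "google-bert/bert-base-uncased", "openai-community/gpt2"
)
# Both sub-configs now carry the implementation their sub-model ended up with,
# which may legitimately differ between encoder and decoder.
print(model.config.encoder._attn_implementation)
print(model.config.decoder._attn_implementation)
```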
- intializer_range (`float`, *optional*, defaults to 0.02): + initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation for initializing all weight matrices in the model. Example: @@ -134,6 +134,10 @@ class Idefics2PerceiverConfig(PretrainedConfig): Args: hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): The non-linear activation function (function or string) in the perceiver block. + hidden_size (`int`, *optional*, defaults to 4096): + Dimension of the hidden representations. + rms_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the rms normalization layers. resampler_n_latents (`int`, *optional*, defaults to 64): Number of latent embeddings to resample ("compress") the input sequence to (usually < 128). resampler_depth (`int`, *optional*, defaults to 3): @@ -153,6 +157,8 @@ class Idefics2PerceiverConfig(PretrainedConfig): def __init__( self, hidden_act="silu", + hidden_size=4096, + rms_norm_eps=1e-06, resampler_n_latents=64, resampler_depth=3, resampler_n_heads=16, @@ -162,6 +168,8 @@ def __init__( **kwargs, ): self.hidden_act = hidden_act + self.hidden_size = hidden_size + self.rms_norm_eps = rms_norm_eps self.resampler_n_latents = resampler_n_latents self.resampler_depth = resampler_depth self.resampler_n_heads = resampler_n_heads @@ -258,5 +266,12 @@ def __init__( ) self.text_config = text_config + if self.text_config.hidden_size != self.perceiver_config.hidden_size: + self.perceiver_config.hidden_size = self.text_config.hidden_size + self.perceiver_config.rms_norm_eps = self.text_config.rms_norm_eps + logger.warning_once( + "Perceiver config has a different `hidden_size` than text config, which means default values were used. " + "In your model's config on the hub, add `hidden_size` and `rms_norm_eps` keys under the `perceiver_config` dict. " + ) super().__init__(**kwargs, tie_word_embeddings=tie_word_embeddings) diff --git a/src/transformers/models/idefics2/modeling_idefics2.py b/src/transformers/models/idefics2/modeling_idefics2.py index daa8bfb055b561..3d46c3bd82e788 100644 --- a/src/transformers/models/idefics2/modeling_idefics2.py +++ b/src/transformers/models/idefics2/modeling_idefics2.py @@ -16,7 +16,7 @@ import math from dataclasses import dataclass -from typing import Dict, List, Optional, Tuple, Union +from typing import List, Optional, Tuple, Union import torch import torch.utils.checkpoint @@ -38,7 +38,7 @@ replace_return_docstrings, ) from ..auto import AutoModel -from .configuration_idefics2 import Idefics2Config, Idefics2VisionConfig +from .configuration_idefics2 import Idefics2Config, Idefics2PerceiverConfig, Idefics2VisionConfig if is_flash_attn_2_available(): @@ -572,9 +572,86 @@ def forward( ) -class Idefics2VisionTransformer(nn.Module): +IDEFICS2_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`Idefics2Config`] or [`Idefics2VisionConfig`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. 
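`Idefics2PerceiverConfig` previously had no notion of `hidden_size` or `rms_norm_eps`; the resampler read them from the text config at runtime. The hunk above moves them onto the perceiver config itself and, for hub configs that predate the keys, copies the text-model values over with a warning. A minimal check of the new fields:

```python
from transformers import Idefics2Config

config = Idefics2Config()
# The perceiver sub-config now owns these values instead of borrowing
# `config.text_config.hidden_size` / `rms_norm_eps` inside the modeling code.
print(config.perceiver_config.hidden_size, config.perceiver_config.rms_norm_eps)
```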
Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + + +@add_start_docstrings( + "The bare Idefics2 Model outputting raw hidden-states without any specific head on top.", + IDEFICS2_START_DOCSTRING, +) +class Idefics2PreTrainedModel(PreTrainedModel): + config_class = Idefics2Config + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["Idefics2VisionAttention", "Idefics2MLP", "Idefics2PerceiverLayer", "Idefics2DecoderLayer"] + _skip_keys_device_placement = "past_key_values" + _supports_flash_attn_2 = True + _supports_sdpa = True + _supports_cache_class = True + + def _init_weights(self, module): + std = ( + self.config.text_config.initializer_range + if hasattr(self.config, "initializer_range") + else self.config.text_config.initializer_range + ) + + if hasattr(module, "class_embedding"): + module.class_embedding.data.normal_(mean=0.0, std=std) + + if isinstance(module, (nn.Linear, nn.Conv2d)): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + +IDEFICS2_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)): + The tensors corresponding to the input images. Pixel values can be obtained using + [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details ([]`LlavaProcessor`] uses + [`CLIPImageProcessor`] for processing images). + pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*): + Mask to avoid performing attention on padding pixel indices. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
+""" + + +@add_start_docstrings( + """Idefics2 vision encoder model that returnss raw image embeddings.""", + IDEFICS2_START_DOCSTRING, +) +class Idefics2VisionTransformer(Idefics2PreTrainedModel): + _supports_sdpa = False + config_class = Idefics2VisionConfig + def __init__(self, config: Idefics2VisionConfig): - super().__init__() + super().__init__(config) embed_dim = config.hidden_size self.config = config @@ -687,12 +764,12 @@ def __init__(self, config, layer_idx: Optional[int] = None) -> None: super().__init__() self.layer_idx = None - self.hidden_size = config.text_config.hidden_size - self.num_heads = config.perceiver_config.resampler_n_heads - self.head_dim = config.perceiver_config.resampler_head_dim - self.num_key_value_heads = config.perceiver_config.num_key_value_heads + self.hidden_size = config.hidden_size + self.num_heads = config.resampler_n_heads + self.head_dim = config.resampler_head_dim + self.num_key_value_heads = config.num_key_value_heads self.num_key_value_groups = self.num_heads // self.num_key_value_heads - self.attention_dropout = config.perceiver_config.attention_dropout + self.attention_dropout = config.attention_dropout self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False) self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) @@ -918,20 +995,20 @@ def forward( class Idefics2PerceiverLayer(nn.Module): def __init__(self, config, layer_idx: int): super().__init__() - self.hidden_size = config.text_config.hidden_size - self.n_latents = config.perceiver_config.resampler_n_latents - self.depth = config.perceiver_config.resampler_depth - self.rms_norm_eps = config.text_config.rms_norm_eps + self.hidden_size = config.hidden_size + self.n_latents = config.resampler_n_latents + self.depth = config.resampler_depth + self.rms_norm_eps = config.rms_norm_eps self.input_latents_norm = Idefics2RMSNorm(self.hidden_size, eps=self.rms_norm_eps) self.input_context_norm = Idefics2RMSNorm(self.hidden_size, eps=self.rms_norm_eps) self.self_attn = IDEFICS2_PERCEIVER_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx=layer_idx) self.post_attention_layernorm = Idefics2RMSNorm(self.hidden_size, eps=self.rms_norm_eps) self.mlp = Idefics2MLP( - hidden_size=config.text_config.hidden_size, - intermediate_size=config.text_config.hidden_size * 4, - output_size=config.text_config.hidden_size, - hidden_act=config.perceiver_config.hidden_act, + hidden_size=config.hidden_size, + intermediate_size=config.hidden_size * 4, + output_size=config.hidden_size, + hidden_act=config.hidden_act, ) def forward( @@ -987,20 +1064,37 @@ def forward( return outputs -class Idefics2PerceiverResampler(nn.Module): +IDEFICS2_INPUTS_DOCSTRING = r""" + Args: + context (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_dim)`): + The hidden states of the image after vision encoder and modality projection. + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) +""" + + +@add_start_docstrings( + "Idefics2 perceiver resampler model that performs `depth` blocks of cross-attention with a fixed ", + "`n_latents` inputs to decrease embedding sequence length. 
The Resampler acts as a form of learned pooling and ", + "is derived from [Perceiver: General Perception with Iterative Attention](https://arxiv.org/abs/2103.03206)", + IDEFICS2_START_DOCSTRING, +) +class Idefics2PerceiverResampler(Idefics2PreTrainedModel): + _supports_sdpa = False + config_class = Idefics2PerceiverConfig + def __init__(self, config) -> None: - """ - Instantiates a Perceiver Resampler that operates over a sequence of embeddings (say from a ResNet or ViT or - MAE) of a given dimension, performs `depth` blocks of cross-attention with a fixed `n_latents` inputs, then - returns a Tensor of shape [bsz, n_latents, embed_dim]. The Resampler acts as a form of learned pooling and - is derived from [Perceiver: General Perception with Iterative Attention](https://arxiv.org/abs/2103.03206). - """ - super().__init__() - self.hidden_size = config.text_config.hidden_size - self.hidden_act = config.perceiver_config.hidden_act - self.n_latents = config.perceiver_config.resampler_n_latents - self.depth = config.perceiver_config.resampler_depth - self.rms_norm_eps = config.text_config.rms_norm_eps + super().__init__(config) + self.hidden_size = config.hidden_size + self.hidden_act = config.hidden_act + self.n_latents = config.resampler_n_latents + self.depth = config.resampler_depth + self.rms_norm_eps = config.rms_norm_eps # Create Latents for Perceiver self.latents = nn.Parameter(torch.ones(self.n_latents, self.hidden_size)) @@ -1014,7 +1108,7 @@ def __init__(self, config) -> None: def forward( self, context: torch.Tensor, - attention_mask, + attention_mask: torch.Tensor, ) -> torch.Tensor: # seq embed -> bsz seq embed latents = self.latents.unsqueeze(0).expand((context.shape[0], *self.latents.size())) @@ -1057,7 +1151,7 @@ def __init__(self, config): output_size=config.text_config.hidden_size, hidden_act=config.text_config.hidden_act, ) - self.perceiver_resampler = Idefics2PerceiverResampler(config) + self.perceiver_resampler = Idefics2PerceiverResampler._from_config(config.perceiver_config) def forward(self, image_hidden_states, attention_mask): image_hidden_states = self.modality_projection(image_hidden_states) @@ -1065,80 +1159,6 @@ def forward(self, image_hidden_states, attention_mask): return image_hidden_states -IDEFICS2_START_DOCSTRING = r""" - This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the - library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads - etc.) - - This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. - Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage - and behavior. - - Parameters: - config ([`Idefics2Config`] or [`Idefics2VisionConfig`]): - Model configuration class with all the parameters of the model. Initializing with a config file does not - load the weights associated with the model, only the configuration. Check out the - [`~PreTrainedModel.from_pretrained`] method to load the model weights. 
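With the refactor above (and the constructor hunk that follows), the vision transformer and the perceiver resampler are real `PreTrainedModel`s built through `_from_config`, each resolving its attention backend from its own sub-config; both explicitly set `_supports_sdpa = False`, so they fall back to eager while the text model can still pick SDPA. A sketch of inspecting the outcome after a load (the 8B repo id is only an illustration and is heavy to download):

```python
from transformers import Idefics2ForConditionalGeneration

model = Idefics2ForConditionalGeneration.from_pretrained("HuggingFaceM4/idefics2-8b")
# Sub-models may legitimately end up on different backends now.
print(model.model.text_model.config._attn_implementation)    # typically "sdpa"
print(model.model.vision_model.config._attn_implementation)  # "eager": opts out of SDPA
```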
-""" - - -@add_start_docstrings( - "The bare Idefics2 Model outputting raw hidden-states without any specific head on top.", - IDEFICS2_START_DOCSTRING, -) -class Idefics2PreTrainedModel(PreTrainedModel): - config_class = Idefics2Config - base_model_prefix = "model" - supports_gradient_checkpointing = True - _no_split_modules = ["Idefics2VisionAttention", "Idefics2MLP", "Idefics2PerceiverLayer", "Idefics2DecoderLayer"] - _skip_keys_device_placement = "past_key_values" - _supports_flash_attn_2 = True - _supports_cache_class = True - - def _init_weights(self, module): - std = ( - self.config.initializer_range - if hasattr(self.config, "initializer_range") - else self.config.text_config.initializer_range - ) - - if hasattr(module, "class_embedding"): - module.class_embedding.data.normal_(mean=0.0, std=std) - - if isinstance(module, (nn.Linear, nn.Conv2d)): - module.weight.data.normal_(mean=0.0, std=std) - if module.bias is not None: - module.bias.data.zero_() - elif isinstance(module, nn.Embedding): - module.weight.data.normal_(mean=0.0, std=std) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - - @classmethod - def _autoset_attn_implementation( - cls, - config, - use_flash_attention_2: bool = False, - torch_dtype: Optional[torch.dtype] = None, - device_map: Optional[Union[str, Dict[str, int]]] = None, - check_device_map: bool = True, - **kwargs, - ): - """ - Overrides the method in `PreTrainedModel` to update the vision config with the correct attention implementation - """ - config = super()._autoset_attn_implementation( - config=config, - use_flash_attention_2=use_flash_attention_2, - torch_dtype=torch_dtype, - device_map=device_map, - check_device_map=check_device_map, - **kwargs, - ) - config.vision_config._attn_implementation = config._attn_implementation - return config - - IDEFICS2_INPUTS_DOCSTRING = r""" Args: input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): @@ -1219,14 +1239,14 @@ def __init__(self, config: Idefics2Config): self.padding_idx = self.config.text_config.pad_token_id self.vocab_size = self.config.text_config.vocab_size - self.vision_model = Idefics2VisionTransformer(config.vision_config) + self.vision_model = Idefics2VisionTransformer._from_config(config.vision_config) self.connector = Idefics2Connector(config) - self.text_model = AutoModel.from_config(config.text_config, attn_implementation=config._attn_implementation) + self.text_model = AutoModel.from_config(config.text_config) self.image_seq_len = config.perceiver_config.resampler_n_latents self.image_token_id = self.config.image_token_id - self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2" + self._use_flash_attention_2 = config.text_config._attn_implementation == "flash_attention_2" self.post_init() diff --git a/src/transformers/models/idefics3/modeling_idefics3.py b/src/transformers/models/idefics3/modeling_idefics3.py index 748eda8c026377..31d43948fbd565 100644 --- a/src/transformers/models/idefics3/modeling_idefics3.py +++ b/src/transformers/models/idefics3/modeling_idefics3.py @@ -621,12 +621,13 @@ class Idefics3PreTrainedModel(PreTrainedModel): _no_split_modules = ["Idefics3VisionAttention", "Idefics3DecoderLayer"] _skip_keys_device_placement = "past_key_values" _supports_flash_attn_2 = True + _supports_sdpa = True _supports_cache_class = True # Copied from transformers.models.idefics2.modeling_idefics2.Idefics2PreTrainedModel._init_weights def _init_weights(self, module): std = ( - self.config.initializer_range + 
self.config.text_config.initializer_range if hasattr(self.config, "initializer_range") else self.config.text_config.initializer_range ) @@ -667,6 +668,7 @@ def _init_weights(self, module): ) class Idefics3VisionTransformer(Idefics3PreTrainedModel): config_class = Idefics3VisionConfig + _supports_sdpa = False def __init__(self, config: Idefics3VisionConfig): super().__init__(config) @@ -824,18 +826,16 @@ def __init__(self, config: Idefics3Config): self.padding_idx = self.config.text_config.pad_token_id self.vocab_size = self.config.text_config.vocab_size - self.vision_model = Idefics3VisionTransformer._from_config( - config.vision_config, attn_implementation=config._attn_implementation - ) + self.vision_model = Idefics3VisionTransformer._from_config(config.vision_config) self.connector = Idefics3Connector(config) - self.text_model = AutoModel.from_config(config.text_config, attn_implementation=config._attn_implementation) + self.text_model = AutoModel.from_config(config.text_config) self.image_seq_len = int( ((config.vision_config.image_size // config.vision_config.patch_size) ** 2) / (config.scale_factor**2) ) self.image_token_id = self.config.image_token_id - self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2" + self._use_flash_attention_2 = config.text_config._attn_implementation == "flash_attention_2" self.post_init() diff --git a/src/transformers/models/instructblip/modeling_instructblip.py b/src/transformers/models/instructblip/modeling_instructblip.py index de4e84b82f8377..5cce774ce0716a 100644 --- a/src/transformers/models/instructblip/modeling_instructblip.py +++ b/src/transformers/models/instructblip/modeling_instructblip.py @@ -315,6 +315,7 @@ class InstructBlipPreTrainedModel(PreTrainedModel): config_class = InstructBlipConfig base_model_prefix = "blip" supports_gradient_checkpointing = True + _no_split_modules = [ "InstructBlipQFormerEmbeddings", "InstructBlipAttention", @@ -1298,13 +1299,9 @@ def __init__(self, config: InstructBlipConfig): self.language_projection = nn.Linear(config.qformer_config.hidden_size, config.text_config.hidden_size) if config.use_decoder_only_language_model: - language_model = AutoModelForCausalLM.from_config( - config.text_config, attn_implementation=config._attn_implementation - ) + language_model = AutoModelForCausalLM.from_config(config.text_config) else: - language_model = AutoModelForSeq2SeqLM.from_config( - config.text_config, attn_implementation=config._attn_implementation - ) + language_model = AutoModelForSeq2SeqLM.from_config(config.text_config) if language_model._no_split_modules is not None: self._no_split_modules.extend(language_model._no_split_modules) diff --git a/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py b/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py index a300268ed71327..c9f12391666c22 100644 --- a/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py +++ b/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py @@ -317,6 +317,7 @@ class InstructBlipVideoPreTrainedModel(PreTrainedModel): config_class = InstructBlipVideoConfig base_model_prefix = "blip" supports_gradient_checkpointing = True + _no_split_modules = [ "InstructBlipVideoQFormerEmbeddings", "InstructBlipVideoAttention", @@ -1292,13 +1293,9 @@ def __init__(self, config: InstructBlipVideoConfig): self.language_projection = nn.Linear(config.qformer_config.hidden_size, config.text_config.hidden_size) if config.use_decoder_only_language_model: - language_model 
= AutoModelForCausalLM.from_config( - config.text_config, attn_implementation=config._attn_implementation - ) + language_model = AutoModelForCausalLM.from_config(config.text_config) else: - language_model = AutoModelForSeq2SeqLM.from_config( - config.text_config, attn_implementation=config._attn_implementation - ) + language_model = AutoModelForSeq2SeqLM.from_config(config.text_config) if language_model._no_split_modules is not None: self._no_split_modules.extend(language_model._no_split_modules) diff --git a/src/transformers/models/llava/modeling_llava.py b/src/transformers/models/llava/modeling_llava.py index 31593bc62d098c..c17d35296a9c77 100644 --- a/src/transformers/models/llava/modeling_llava.py +++ b/src/transformers/models/llava/modeling_llava.py @@ -125,8 +125,9 @@ class LlavaPreTrainedModel(PreTrainedModel): supports_gradient_checkpointing = True _no_split_modules = ["LlavaVisionAttention"] _skip_keys_device_placement = "past_key_values" - _supports_flash_attn_2 = True _supports_cache_class = True + _supports_flash_attn_2 = True + _supports_sdpa = True def _init_weights(self, module): # important: this ported version of Llava isn't meant for training from scratch - only @@ -150,14 +151,6 @@ def _init_weights(self, module): if module.padding_idx is not None: module.weight.data[module.padding_idx].zero_() - @property - def _supports_sdpa(self): - """ - Retrieve language_model's attribute to check whether the model supports - SDPA or not. - """ - return self.language_model._supports_sdpa - LLAVA_INPUTS_DOCSTRING = r""" Args: @@ -245,9 +238,7 @@ def __init__(self, config: LlavaConfig): self.multi_modal_projector = LlavaMultiModalProjector(config) self.vocab_size = config.text_config.vocab_size - self.language_model = AutoModelForCausalLM.from_config( - config.text_config, attn_implementation=config._attn_implementation - ) + self.language_model = AutoModelForCausalLM.from_config(config.text_config) self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1 self.post_init() diff --git a/src/transformers/models/llava_next/modeling_llava_next.py b/src/transformers/models/llava_next/modeling_llava_next.py index 03ab28dfff9cb1..04ff098170b7a3 100644 --- a/src/transformers/models/llava_next/modeling_llava_next.py +++ b/src/transformers/models/llava_next/modeling_llava_next.py @@ -234,8 +234,9 @@ class LlavaNextPreTrainedModel(PreTrainedModel): supports_gradient_checkpointing = True _no_split_modules = ["LlavaNextVisionAttention"] _skip_keys_device_placement = "past_key_values" - _supports_flash_attn_2 = True _supports_cache_class = True + _supports_flash_attn_2 = True + _supports_sdpa = True def _init_weights(self, module): # important: this ported version of LlavaNext isn't meant for training from scratch - only @@ -259,14 +260,6 @@ def _init_weights(self, module): if module.padding_idx is not None: module.weight.data[module.padding_idx].zero_() - @property - def _supports_sdpa(self): - """ - Retrieve language_model's attribute to check whether the model supports - SDPA or not. 
- """ - return self.language_model._supports_sdpa - LLAVA_NEXT_INPUTS_DOCSTRING = r""" Args: @@ -360,9 +353,7 @@ def __init__(self, config: LlavaNextConfig): self.image_newline = nn.Parameter(torch.randn(config.text_config.hidden_size, dtype=self.dtype) * embed_std) self.vocab_size = config.text_config.vocab_size - self.language_model = AutoModelForCausalLM.from_config( - config.text_config, attn_implementation=config._attn_implementation - ) + self.language_model = AutoModelForCausalLM.from_config(config.text_config) self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1 self._padding_side = "left" # set it to left by default, user can use setter to change padding_sides self.post_init() diff --git a/src/transformers/models/llava_next_video/modeling_llava_next_video.py b/src/transformers/models/llava_next_video/modeling_llava_next_video.py index 3fd6bb47fc7661..8d3bfb1efa4e85 100644 --- a/src/transformers/models/llava_next_video/modeling_llava_next_video.py +++ b/src/transformers/models/llava_next_video/modeling_llava_next_video.py @@ -277,8 +277,9 @@ class LlavaNextVideoPreTrainedModel(PreTrainedModel): supports_gradient_checkpointing = True _no_split_modules = ["LlavaNextVideoVisionAttention"] _skip_keys_device_placement = "past_key_values" - _supports_flash_attn_2 = True _supports_cache_class = True + _supports_flash_attn_2 = True + _supports_sdpa = True def _init_weights(self, module): # important: this ported version of LlavaNextVideo isn't meant for training from scratch - only @@ -302,14 +303,6 @@ def _init_weights(self, module): if module.padding_idx is not None: module.weight.data[module.padding_idx].zero_() - @property - def _supports_sdpa(self): - """ - Retrieve language_model's attribute to check whether the model supports - SDPA or not. 
- """ - return self.language_model._supports_sdpa - LLAVA_NEXT_VIDEO_INPUTS_DOCSTRING = r""" Args: @@ -406,9 +399,7 @@ def __init__( self.image_newline = nn.Parameter(torch.randn(config.text_config.hidden_size, dtype=self.dtype) * embed_std) self.vocab_size = config.text_config.vocab_size - self.language_model = AutoModelForCausalLM.from_config( - config.text_config, attn_implementation=config._attn_implementation - ) + self.language_model = AutoModelForCausalLM.from_config(config.text_config) self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1 self._padding_side = "left" # set it to left by default, user can use setter to change padding_sides self.vision_resampler = LlavaNextVideoPooler(config) diff --git a/src/transformers/models/llava_onevision/modeling_llava_onevision.py b/src/transformers/models/llava_onevision/modeling_llava_onevision.py index 7bacd2a54fc97f..2c5fa511467aff 100644 --- a/src/transformers/models/llava_onevision/modeling_llava_onevision.py +++ b/src/transformers/models/llava_onevision/modeling_llava_onevision.py @@ -363,18 +363,14 @@ def _init_weights(self, module): class LlavaOnevisionForConditionalGeneration(LlavaOnevisionPreTrainedModel, GenerationMixin): def __init__(self, config: LlavaOnevisionConfig): super().__init__(config) - self.vision_tower = AutoModel.from_config( - config.vision_config, attn_implementation=config._attn_implementation - ) + self.vision_tower = AutoModel.from_config(config.vision_config) self.multi_modal_projector = LlavaOnevisionMultiModalProjector(config) embed_std = 1 / math.sqrt(config.text_config.hidden_size) self.image_newline = nn.Parameter(torch.randn(config.text_config.hidden_size, dtype=self.dtype) * embed_std) self.vocab_size = config.text_config.vocab_size - self.language_model = AutoModelForCausalLM.from_config( - config.text_config, attn_implementation=config._attn_implementation - ) + self.language_model = AutoModelForCausalLM.from_config(config.text_config) self.post_init() # Copied from transformers.models.llava_next.modeling_llava_next.LlavaNextForConditionalGeneration.get_input_embeddings diff --git a/src/transformers/models/mllama/modeling_mllama.py b/src/transformers/models/mllama/modeling_mllama.py index d1cc3a13bf3cc3..c5ae615a12b5cc 100644 --- a/src/transformers/models/mllama/modeling_mllama.py +++ b/src/transformers/models/mllama/modeling_mllama.py @@ -1979,12 +1979,8 @@ def __init__(self, config: MllamaConfig): self.vision_output_dim = config.vision_config.vision_output_dim self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1 - self.vision_model = MllamaVisionModel._from_config( - config.vision_config, attn_implementation=config._attn_implementation - ) - self.language_model = MllamaForCausalLM._from_config( - config.text_config, attn_implementation=config._attn_implementation - ) + self.vision_model = MllamaVisionModel._from_config(config.vision_config) + self.language_model = MllamaForCausalLM._from_config(config.text_config) self.multi_modal_projector = nn.Linear( config.vision_config.vision_output_dim, config.text_config.hidden_size, diff --git a/src/transformers/models/musicgen/configuration_musicgen.py b/src/transformers/models/musicgen/configuration_musicgen.py index ef2e0244c1406f..0d282355defa96 100644 --- a/src/transformers/models/musicgen/configuration_musicgen.py +++ b/src/transformers/models/musicgen/configuration_musicgen.py @@ -236,20 +236,3 @@ def from_sub_models_config( # This is a property because you might want to 
change the codec model on the fly def sampling_rate(self): return self.audio_encoder.sampling_rate - - @property - def _attn_implementation(self): - # This property is made private for now (as it cannot be changed and a PreTrainedModel.use_attn_implementation method needs to be implemented.) - if hasattr(self, "_attn_implementation_internal"): - if self._attn_implementation_internal is None: - # `config.attn_implementation` should never be None, for backward compatibility. - return "eager" - else: - return self._attn_implementation_internal - else: - return "eager" - - @_attn_implementation.setter - def _attn_implementation(self, value): - self._attn_implementation_internal = value - self.decoder._attn_implementation = value diff --git a/src/transformers/models/musicgen/modeling_musicgen.py b/src/transformers/models/musicgen/modeling_musicgen.py index 626097f5c7cbcc..c18e1d1c9d86b1 100644 --- a/src/transformers/models/musicgen/modeling_musicgen.py +++ b/src/transformers/models/musicgen/modeling_musicgen.py @@ -1713,7 +1713,7 @@ def __init__( audio_encoder = AutoModel.from_config(config.audio_encoder) if decoder is None: - decoder = MusicgenForCausalLM(config.decoder) + decoder = MusicgenForCausalLM._from_config(config.decoder) self.text_encoder = text_encoder self.audio_encoder = audio_encoder @@ -1737,6 +1737,9 @@ def __init__( # make sure that the individual model's config refers to the shared config # so that the updates to the config will be synced + self.config.text_encoder._attn_implementation = self.text_encoder.config._attn_implementation + self.config.audio_encoder._attn_implementation = self.audio_encoder.config._attn_implementation + self.config.decoder._attn_implementation = self.decoder.config._attn_implementation self.text_encoder.config = self.config.text_encoder self.audio_encoder.config = self.config.audio_encoder self.decoder.config = self.config.decoder diff --git a/src/transformers/models/musicgen_melody/configuration_musicgen_melody.py b/src/transformers/models/musicgen_melody/configuration_musicgen_melody.py index b29187facb3d1b..8a77cea0252234 100644 --- a/src/transformers/models/musicgen_melody/configuration_musicgen_melody.py +++ b/src/transformers/models/musicgen_melody/configuration_musicgen_melody.py @@ -250,20 +250,3 @@ def from_sub_models_config( # This is a property because you might want to change the codec model on the fly def sampling_rate(self): return self.audio_encoder.sampling_rate - - @property - def _attn_implementation(self): - # This property is made private for now (as it cannot be changed and a PreTrainedModel.use_attn_implementation method needs to be implemented.) - if hasattr(self, "_attn_implementation_internal"): - if self._attn_implementation_internal is None: - # `config.attn_implementation` should never be None, for backward compatibility. 
- return "eager" - else: - return self._attn_implementation_internal - else: - return "eager" - - @_attn_implementation.setter - def _attn_implementation(self, value): - self._attn_implementation_internal = value - self.decoder._attn_implementation = value diff --git a/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py b/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py index 166623796d65d0..d2f339afc41451 100644 --- a/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py +++ b/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py @@ -1628,7 +1628,7 @@ def __init__( audio_encoder = AutoModel.from_config(config.audio_encoder) if decoder is None: - decoder = MusicgenMelodyForCausalLM(config.decoder) + decoder = MusicgenMelodyForCausalLM._from_config(config.decoder) self.text_encoder = text_encoder self.audio_encoder = audio_encoder @@ -1636,6 +1636,9 @@ def __init__( # make sure that the individual model's config refers to the shared config # so that the updates to the config will be synced + self.config.text_encoder._attn_implementation = self.text_encoder.config._attn_implementation + self.config.audio_encoder._attn_implementation = self.audio_encoder.config._attn_implementation + self.config.decoder._attn_implementation = self.decoder.config._attn_implementation self.text_encoder.config = self.config.text_encoder self.audio_encoder.config = self.config.audio_encoder self.decoder.config = self.config.decoder diff --git a/src/transformers/models/omdet_turbo/modeling_omdet_turbo.py b/src/transformers/models/omdet_turbo/modeling_omdet_turbo.py index bf9dbd951b5b06..0f44e4bd40208c 100644 --- a/src/transformers/models/omdet_turbo/modeling_omdet_turbo.py +++ b/src/transformers/models/omdet_turbo/modeling_omdet_turbo.py @@ -288,7 +288,7 @@ def put(self, key, value) -> None: class OmDetTurboLanguageBackbone(nn.Module): def __init__(self, config: OmDetTurboConfig): super().__init__() - self.model = AutoModel.from_config(config.text_config, attn_implementation=config._attn_implementation) + self.model = AutoModel.from_config(config.text_config) self.text_projection = nn.Parameter(torch.zeros(config.text_projection_in_dim, config.text_projection_out_dim)) def forward(self, hidden_states, mask=None, encode_type="task"): diff --git a/src/transformers/models/paligemma/modeling_paligemma.py b/src/transformers/models/paligemma/modeling_paligemma.py index 1607261eaac673..ffb4b7435f2a2a 100644 --- a/src/transformers/models/paligemma/modeling_paligemma.py +++ b/src/transformers/models/paligemma/modeling_paligemma.py @@ -193,12 +193,12 @@ class PaliGemmaPreTrainedModel(PreTrainedModel): supports_gradient_checkpointing = True _no_split_modules = ["PaliGemmaMultiModalProjector"] _skip_keys_device_placement = "past_key_values" - _supports_flash_attn_2 = False _supports_cache_class = True _supports_quantized_cache = True _supports_static_cache = True - _supports_sdpa = True _supports_cache_class = True + _supports_flash_attn_2 = True + _supports_sdpa = True def _init_weights(self, module): # important: this ported version of PaliGemmaisn't meant for training from scratch - only @@ -221,14 +221,6 @@ def _init_weights(self, module): if module.padding_idx is not None: module.weight.data[module.padding_idx].zero_() - @property - def _supports_sdpa(self): - """ - Retrieve language_model's attribute to check whether the model supports - SDPA or not. 
- """ - return self.language_model._supports_sdpa - PALIGEMMA_INPUTS_DOCSTRING = r""" Args: @@ -310,11 +302,8 @@ def __init__(self, config: PaliGemmaConfig): self.vision_tower = AutoModel.from_config(config=config.vision_config) self.multi_modal_projector = PaliGemmaMultiModalProjector(config) self.vocab_size = config.text_config.vocab_size - self._attn_implementation = config._attn_implementation - language_model = AutoModelForCausalLM.from_config( - config=config.text_config, attn_implementation=self._attn_implementation - ) + language_model = AutoModelForCausalLM.from_config(config=config.text_config) if language_model._tied_weights_keys is not None: self._tied_weights_keys = [f"language_model.{k}" for k in language_model._tied_weights_keys] @@ -354,6 +343,11 @@ def tie_weights(self): def _update_causal_mask( self, attention_mask, token_type_ids, inputs_embeds, past_key_values, cache_position, is_training: bool = False ): + if self.config.text_config._attn_implementation == "flash_attention_2": + if attention_mask is not None and 0.0 in attention_mask: + return attention_mask + return None + using_static_cache = isinstance(past_key_values, StaticCache) dtype = inputs_embeds.dtype min_dtype = torch.finfo(dtype).min diff --git a/src/transformers/models/qwen2_audio/modeling_qwen2_audio.py b/src/transformers/models/qwen2_audio/modeling_qwen2_audio.py index e923e535da8e34..ce0e427048cf23 100644 --- a/src/transformers/models/qwen2_audio/modeling_qwen2_audio.py +++ b/src/transformers/models/qwen2_audio/modeling_qwen2_audio.py @@ -544,6 +544,7 @@ class Qwen2AudioPreTrainedModel(PreTrainedModel): _no_split_modules = ["Qwen2AudioAttention"] _skip_keys_device_placement = "past_key_values" _supports_flash_attn_2 = True + _supports_sdpa = True def _init_weights(self, module): # important: this ported version of Qwen2Audio isn't meant for training from scratch - only @@ -559,14 +560,6 @@ def _init_weights(self, module): if module.padding_idx is not None: module.weight.data[module.padding_idx].zero_() - @property - def _supports_sdpa(self): - """ - Retrieve language_model's attribute to check whether the model supports - SDPA or not. - """ - return self.language_model._supports_sdpa - QWEN2AUDIOENCODER_START_DOCSTRING = r""" This model inherits from [`PreTrainedModel`]. 
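For reference, the branch added to PaliGemma's `_update_causal_mask` just above short-circuits mask construction under flash-attention-2: the kernel consumes the 2D padding mask directly, or no mask at all for fully unpadded batches, so no 4D causal mask needs to be materialised. A standalone sketch of that logic (the helper name is hypothetical):

```python
from typing import Optional

import torch


def fa2_attention_mask(attention_mask: Optional[torch.Tensor]) -> Optional[torch.Tensor]:
    # Mirror of the flash-attention-2 branch: keep the 2D padding mask if any position
    # is masked out, otherwise return None and let the kernel handle causality alone.
    if attention_mask is not None and 0.0 in attention_mask:
        return attention_mask
    return None


print(fa2_attention_mask(torch.tensor([[1.0, 1.0, 0.0]])))  # keeps the padding mask
print(fa2_attention_mask(torch.ones(2, 4)))                 # None
```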
Check the superclass documentation for the generic methods the @@ -859,13 +852,11 @@ def forward(self, audio_features): class Qwen2AudioForConditionalGeneration(Qwen2AudioPreTrainedModel, GenerationMixin): def __init__(self, config: Qwen2AudioConfig): super().__init__(config) - self.audio_tower = AutoModel.from_config(config.audio_config, attn_implementation=config._attn_implementation) + self.audio_tower = AutoModel.from_config(config.audio_config) self.multi_modal_projector = Qwen2AudioMultiModalProjector(config) self.vocab_size = config.text_config.vocab_size - self.language_model = AutoModelForCausalLM.from_config( - config.text_config, attn_implementation=config._attn_implementation - ) + self.language_model = AutoModelForCausalLM.from_config(config.text_config) self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1 self._padding_side = "left" # set it to left by default, user can use setter to change padding_sides self.post_init() diff --git a/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py b/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py index f4cb84a2444eb6..07531248f63b1d 100644 --- a/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py +++ b/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py @@ -1443,9 +1443,7 @@ class Qwen2VLForConditionalGeneration(Qwen2VLPreTrainedModel, GenerationMixin): def __init__(self, config): super().__init__(config) - self.visual = Qwen2VisionTransformerPretrainedModel._from_config( - config.vision_config, attn_implementation=config._attn_implementation - ) + self.visual = Qwen2VisionTransformerPretrainedModel._from_config(config.vision_config) self.model = Qwen2VLModel(config) self.vocab_size = config.vocab_size self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) diff --git a/src/transformers/models/rag/modeling_rag.py b/src/transformers/models/rag/modeling_rag.py index 5e6f13ca478f32..dfc2664b78a3dc 100644 --- a/src/transformers/models/rag/modeling_rag.py +++ b/src/transformers/models/rag/modeling_rag.py @@ -232,6 +232,8 @@ class RagPreTrainedModel(PreTrainedModel): config_class = RagConfig base_model_prefix = "rag" + _supports_flash_attn_2 = True + _supports_sdpa = True @classmethod def from_pretrained(cls, *args, **kwargs): @@ -506,16 +508,12 @@ def __init__( if question_encoder is None: from ..auto.modeling_auto import AutoModel - question_encoder = AutoModel.from_config( - config.question_encoder, attn_implementation=config._attn_implementation - ) + question_encoder = AutoModel.from_config(config.question_encoder) if generator is None: from ..auto.modeling_auto import AutoModelForSeq2SeqLM - generator = AutoModelForSeq2SeqLM.from_config( - config.generator, attn_implementation=config._attn_implementation - ) + generator = AutoModelForSeq2SeqLM.from_config(config.generator) self.retriever = retriever if self.retriever is not None: diff --git a/src/transformers/models/siglip/modeling_siglip.py b/src/transformers/models/siglip/modeling_siglip.py index 507e0768a226ef..a3d06cbb4792b4 100644 --- a/src/transformers/models/siglip/modeling_siglip.py +++ b/src/transformers/models/siglip/modeling_siglip.py @@ -669,6 +669,7 @@ class SiglipPreTrainedModel(PreTrainedModel): config_class = SiglipConfig base_model_prefix = "siglip" supports_gradient_checkpointing = True + _no_split_modules = [ "SiglipTextEmbeddings", "SiglipEncoderLayer", @@ -1218,8 +1219,8 @@ def __init__(self, config: SiglipConfig): vision_config = config.vision_config # First, initialize the text and vision models 
with proper attention implementation - text_model = SiglipTextModel._from_config(text_config, attn_implementation=config._attn_implementation) - vision_model = SiglipVisionModel._from_config(vision_config, attn_implementation=config._attn_implementation) + text_model = SiglipTextModel._from_config(text_config) + vision_model = SiglipVisionModel._from_config(vision_config) # Second, get the text and vision submodules (for backward compatibility) self.text_model = text_model.text_model @@ -1454,9 +1455,7 @@ def __init__(self, config: SiglipConfig) -> None: # Create the vision model with proper attention # and take only vision_model submodule (for backward compatibility) - vision_model = SiglipVisionModel._from_config( - config.vision_config, attn_implementation=config._attn_implementation - ) + vision_model = SiglipVisionModel._from_config(config.vision_config) self.vision_model = vision_model.vision_model # Classifier head diff --git a/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py b/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py index a1caa7cf6da2f7..0d2b911bebe582 100644 --- a/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py +++ b/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py @@ -183,6 +183,8 @@ class SpeechEncoderDecoderModel(PreTrainedModel, GenerationMixin): main_input_name = "inputs" supports_gradient_checkpointing = True _supports_param_buffer_assignment = False + _supports_flash_attn_2 = True + _supports_sdpa = True def __init__( self, @@ -213,10 +215,10 @@ def __init__( super().__init__(config) if encoder is None: - encoder = AutoModel.from_config(config.encoder, attn_implementation=config._attn_implementation) + encoder = AutoModel.from_config(config.encoder) if decoder is None: - decoder = AutoModelForCausalLM.from_config(config.decoder, attn_implementation=config._attn_implementation) + decoder = AutoModelForCausalLM.from_config(config.decoder) self.encoder = encoder self.decoder = decoder @@ -234,6 +236,8 @@ def __init__( # make sure that the individual model's config refers to the shared config # so that the updates to the config will be synced + self.config.encoder._attn_implementation = self.encoder.config._attn_implementation + self.config.decoder._attn_implementation = self.decoder.config._attn_implementation self.encoder.config = self.config.encoder self.decoder.config = self.config.decoder diff --git a/src/transformers/models/video_llava/modeling_video_llava.py b/src/transformers/models/video_llava/modeling_video_llava.py index c9703d263e7d20..b455040059e653 100644 --- a/src/transformers/models/video_llava/modeling_video_llava.py +++ b/src/transformers/models/video_llava/modeling_video_llava.py @@ -126,8 +126,9 @@ class VideoLlavaPreTrainedModel(PreTrainedModel): supports_gradient_checkpointing = True _no_split_modules = ["VideoLlavaVisionAttention"] _skip_keys_device_placement = "past_key_values" - _supports_flash_attn_2 = True _supports_cache_class = True + _supports_flash_attn_2 = True + _supports_sdpa = True def _init_weights(self, module): std = ( @@ -148,14 +149,6 @@ def _init_weights(self, module): if module.padding_idx is not None: module.weight.data[module.padding_idx].zero_() - @property - def _supports_sdpa(self): - """ - Retrieve language_model's attribute to check whether the model supports - SDPA or not. 
- """ - return self.language_model._supports_sdpa - VIDEO_LLAVA_INPUTS_DOCSTRING = r""" Args: @@ -248,9 +241,7 @@ def __init__(self, config: VideoLlavaConfig): self.multi_modal_projector = VideoLlavaMultiModalProjector(config) self.vocab_size = config.text_config.vocab_size - self.language_model = AutoModelForCausalLM.from_config( - config.text_config, attn_implementation=config._attn_implementation - ) + self.language_model = AutoModelForCausalLM.from_config(config.text_config) self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1 self.post_init() diff --git a/src/transformers/models/vipllava/modeling_vipllava.py b/src/transformers/models/vipllava/modeling_vipllava.py index 3af32a9caace0e..10935c0b63e076 100644 --- a/src/transformers/models/vipllava/modeling_vipllava.py +++ b/src/transformers/models/vipllava/modeling_vipllava.py @@ -132,8 +132,9 @@ class VipLlavaPreTrainedModel(PreTrainedModel): supports_gradient_checkpointing = True _no_split_modules = ["VipLlavaVisionAttention"] _skip_keys_device_placement = "past_key_values" - _supports_flash_attn_2 = True _supports_cache_class = True + _supports_flash_attn_2 = True + _supports_sdpa = True def _init_weights(self, module): # important: this ported version of VipLlava isn't meant for training from scratch - only @@ -157,14 +158,6 @@ def _init_weights(self, module): if module.padding_idx is not None: module.weight.data[module.padding_idx].zero_() - @property - def _supports_sdpa(self): - """ - Retrieve language_model's attribute to check whether the model supports - SDPA or not. - """ - return self.language_model._supports_sdpa - VIPLLAVA_INPUTS_DOCSTRING = r""" Args: @@ -248,9 +241,7 @@ def __init__(self, config: VipLlavaConfig): self.multi_modal_projector = VipLlavaMultiModalProjector(config) self.vocab_size = config.text_config.vocab_size - self.language_model = AutoModelForCausalLM.from_config( - config.text_config, attn_implementation=config._attn_implementation - ) + self.language_model = AutoModelForCausalLM.from_config(config.text_config) self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1 self.post_init() diff --git a/src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py b/src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py index b044dda300ab48..152a9601403301 100644 --- a/src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py +++ b/src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py @@ -161,6 +161,8 @@ class VisionEncoderDecoderModel(PreTrainedModel, GenerationMixin): main_input_name = "pixel_values" supports_gradient_checkpointing = True _supports_param_buffer_assignment = False + _supports_flash_attn_2 = True + _supports_sdpa = True def __init__( self, @@ -191,10 +193,10 @@ def __init__( super().__init__(config) if encoder is None: - encoder = AutoModel.from_config(config.encoder, attn_implementation=config._attn_implementation) + encoder = AutoModel.from_config(config.encoder) if decoder is None: - decoder = AutoModelForCausalLM.from_config(config.decoder, attn_implementation=config._attn_implementation) + decoder = AutoModelForCausalLM.from_config(config.decoder) self.encoder = encoder self.decoder = decoder @@ -212,6 +214,8 @@ def __init__( # make sure that the individual model's config refers to the shared config # so that the updates to the config will be synced + self.config.encoder._attn_implementation = 
self.encoder.config._attn_implementation + self.config.decoder._attn_implementation = self.decoder.config._attn_implementation self.encoder.config = self.config.encoder self.decoder.config = self.config.decoder diff --git a/src/transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py b/src/transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py index 5b90faa8862c97..4b39de3df1c882 100755 --- a/src/transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +++ b/src/transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py @@ -161,6 +161,8 @@ def clip_loss(similarity: torch.Tensor) -> torch.Tensor: class VisionTextDualEncoderModel(PreTrainedModel): config_class = VisionTextDualEncoderConfig base_model_prefix = "vision_text_dual_encoder" + _supports_flash_attn_2 = True + _supports_sdpa = True def __init__( self, @@ -184,18 +186,18 @@ def __init__( if isinstance(config.vision_config, CLIPVisionConfig): vision_model = CLIPVisionModel(config.vision_config) else: - vision_model = AutoModel.from_config( - config.vision_config, attn_implementation=config._attn_implementation - ) + vision_model = AutoModel.from_config(config.vision_config) if text_model is None: - text_model = AutoModel.from_config(config.text_config, attn_implementation=config._attn_implementation) + text_model = AutoModel.from_config(config.text_config) self.vision_model = vision_model self.text_model = text_model # make sure that the individual model's config refers to the shared config # so that the updates to the config will be synced + self.config.vision_config._attn_implementation = self.vision_model.config._attn_implementation + self.config.text_config._attn_implementation = self.text_model.config._attn_implementation self.vision_model.config = self.config.vision_config self.text_model.config = self.config.text_config diff --git a/src/transformers/models/vit/modeling_vit.py b/src/transformers/models/vit/modeling_vit.py index 76ebd18ed32d7b..bb08acfc0bba67 100644 --- a/src/transformers/models/vit/modeling_vit.py +++ b/src/transformers/models/vit/modeling_vit.py @@ -250,8 +250,24 @@ def __init__(self, config: ViTConfig) -> None: self.attention_probs_dropout_prob = config.attention_probs_dropout_prob def forward( - self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False + self, + hidden_states: torch.FloatTensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: + if output_attentions or head_mask is not None: + logger.warning_once( + "`ViTSdpaAttention` is used but `torch.nn.functional.scaled_dot_product_attention` does not support " + "`output_attentions=True` or `head_mask`. Falling back to the manual attention implementation, but " + "specifying the manual implementation will be required from Transformers version v5.0.0 onwards. " + 'This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' 
+ ) + return super().forward( + hidden_states=hidden_states, + head_mask=head_mask, + output_attentions=output_attentions, + ) + mixed_query_layer = self.query(hidden_states) key_layer = self.transpose_for_scores(self.key(hidden_states)) diff --git a/src/transformers/models/vit_mae/modeling_vit_mae.py b/src/transformers/models/vit_mae/modeling_vit_mae.py index 0be169a51b276d..e319f2f655aabf 100755 --- a/src/transformers/models/vit_mae/modeling_vit_mae.py +++ b/src/transformers/models/vit_mae/modeling_vit_mae.py @@ -424,8 +424,24 @@ def __init__(self, config: ViTMAEConfig) -> None: self.attention_probs_dropout_prob = config.attention_probs_dropout_prob def forward( - self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False + self, + hidden_states: torch.FloatTensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: + if output_attentions or head_mask is not None: + logger.warning_once( + "`ViTMAESdpaAttention` is used but `torch.nn.functional.scaled_dot_product_attention` does not support " + "`output_attentions=True` or `head_mask`. Falling back to the manual attention implementation, but " + "specifying the manual implementation will be required from Transformers version v5.0.0 onwards. " + 'This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' + ) + return super().forward( + hidden_states=hidden_states, + head_mask=head_mask, + output_attentions=output_attentions, + ) + mixed_query_layer = self.query(hidden_states) key_layer = self.transpose_for_scores(self.key(hidden_states)) diff --git a/src/transformers/models/vit_msn/modeling_vit_msn.py b/src/transformers/models/vit_msn/modeling_vit_msn.py index b962ac597dabb8..39274dd28fef5b 100644 --- a/src/transformers/models/vit_msn/modeling_vit_msn.py +++ b/src/transformers/models/vit_msn/modeling_vit_msn.py @@ -241,8 +241,24 @@ def __init__(self, config: ViTMSNConfig) -> None: self.attention_probs_dropout_prob = config.attention_probs_dropout_prob def forward( - self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False + self, + hidden_states: torch.FloatTensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: + if output_attentions or head_mask is not None: + logger.warning_once( + "`ViTMSNSdpaAttention` is used but `torch.nn.functional.scaled_dot_product_attention` does not support " + "`output_attentions=True` or `head_mask`. Falling back to the manual attention implementation, but " + "specifying the manual implementation will be required from Transformers version v5.0.0 onwards. " + 'This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' 
+ ) + return super().forward( + hidden_states=hidden_states, + head_mask=head_mask, + output_attentions=output_attentions, + ) + mixed_query_layer = self.query(hidden_states) key_layer = self.transpose_for_scores(self.key(hidden_states)) diff --git a/src/transformers/models/yolos/modeling_yolos.py b/src/transformers/models/yolos/modeling_yolos.py index 2d00c973b85c18..f7ef3e55f5f799 100755 --- a/src/transformers/models/yolos/modeling_yolos.py +++ b/src/transformers/models/yolos/modeling_yolos.py @@ -299,8 +299,24 @@ def __init__(self, config: YolosConfig) -> None: self.attention_probs_dropout_prob = config.attention_probs_dropout_prob def forward( - self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False + self, + hidden_states: torch.FloatTensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: + if output_attentions or head_mask is not None: + logger.warning_once( + "`YolosSdpaAttention` is used but `torch.nn.functional.scaled_dot_product_attention` does not support " + "`output_attentions=True` or `head_mask`. Falling back to the manual attention implementation, but " + "specifying the manual implementation will be required from Transformers version v5.0.0 onwards. " + 'This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' + ) + return super().forward( + hidden_states=hidden_states, + head_mask=head_mask, + output_attentions=output_attentions, + ) + mixed_query_layer = self.query(hidden_states) key_layer = self.transpose_for_scores(self.key(hidden_states)) diff --git a/tests/models/blip_2/test_modeling_blip_2.py b/tests/models/blip_2/test_modeling_blip_2.py index f2ccb2da8dba94..e5d04bd85a3404 100644 --- a/tests/models/blip_2/test_modeling_blip_2.py +++ b/tests/models/blip_2/test_modeling_blip_2.py @@ -27,6 +27,7 @@ require_torch_fp16, require_torch_gpu, require_torch_multi_accelerator, + require_torch_sdpa, require_vision, slow, torch_device, @@ -456,6 +457,7 @@ class Blip2ForConditionalGenerationDecoderOnlyTest(ModelTesterMixin, GenerationT test_resize_embeddings = False test_attention_outputs = False test_torchscript = False + _is_composite = True def setUp(self): self.model_tester = Blip2ForConditionalGenerationDecoderOnlyModelTester(self) @@ -488,6 +490,66 @@ def test_save_load_fast_init_from_base(self): def test_save_load_fast_init_to_base(self): pass + @require_torch_sdpa + def test_sdpa_can_dispatch_composite_models(self): + """ + Tests if composite models dispatch correctly on SDPA/eager when requested so when loading the model. + This tests only by looking at layer names, as usually SDPA layers are calles "SDPAAttention". + In contrast to the above test, this one checks if the "config._attn_implamentation" is a dict after the model + is loaded, because we manually replicate requested attn implementation on each sub-config when loading. + See https://github.com/huggingface/transformers/pull/32238 for more info + + The test tries to cover most general cases of composite models, VLMs with vision and text configs. Any model + that has a different set of sub-configs has to overwrite this test. 
+ """ + if not self.has_attentions: + self.skipTest(reason="Model architecture does not support attentions") + + if not self._is_composite: + self.skipTest(f"{self.all_model_classes[0].__name__} does not support SDPA") + + for model_class in self.all_model_classes: + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + model_sdpa = model_class.from_pretrained(tmpdirname) + model_sdpa = model_sdpa.eval().to(torch_device) + + text_attn = "sdpa" if model.language_model._supports_sdpa else "eager" + vision_attn = "sdpa" if model.vision_model._supports_sdpa else "eager" + qformer_attn = "sdpa" if model.qformer._supports_sdpa else "eager" + + # `None` as it is the requested one which will be assigned to each sub-config + # Sub-model will dispatch to SDPA if it can (checked below that `SDPA` layers are present) + self.assertTrue(model.language_model.config._attn_implementation == text_attn) + self.assertTrue(model.vision_model.config._attn_implementation == vision_attn) + self.assertTrue(model.qformer.config._attn_implementation == qformer_attn) + + model_eager = model_class.from_pretrained(tmpdirname, attn_implementation="eager") + model_eager = model_eager.eval().to(torch_device) + self.assertTrue(model_eager.config._attn_implementation == "eager") + self.assertTrue(model_eager.language_model.config._attn_implementation == "eager") + self.assertTrue(model_eager.vision_model.config._attn_implementation == "eager") + self.assertTrue(model_eager.qformer.config._attn_implementation == "eager") + + for name, submodule in model_eager.named_modules(): + class_name = submodule.__class__.__name__ + if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name: + raise ValueError("The eager model should not have SDPA attention layers") + + has_sdpa = False + for name, submodule in model_sdpa.named_modules(): + class_name = submodule.__class__.__name__ + if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name: + has_sdpa = True + break + if not has_sdpa and any( + module_attn == "sdpa" for module_attn in [text_attn, vision_attn, qformer_attn] + ): + raise ValueError("The SDPA model should have SDPA attention layers") + def test_forward_signature(self): config, _ = self.model_tester.prepare_config_and_inputs_for_common() @@ -715,6 +777,7 @@ class Blip2ModelTest(ModelTesterMixin, PipelineTesterMixin, GenerationTesterMixi test_resize_embeddings = False test_attention_outputs = False test_torchscript = False + _is_composite = True # TODO: Fix the failed tests def is_pipeline_test_to_skip( @@ -768,6 +831,66 @@ def test_save_load_fast_init_to_base(self): def test_cpu_offload(self): pass + @require_torch_sdpa + def test_sdpa_can_dispatch_composite_models(self): + """ + Tests if composite models dispatch correctly on SDPA/eager when requested so when loading the model. + This tests only by looking at layer names, as usually SDPA layers are calles "SDPAAttention". + In contrast to the above test, this one checks if the "config._attn_implamentation" is a dict after the model + is loaded, because we manually replicate requested attn implementation on each sub-config when loading. + See https://github.com/huggingface/transformers/pull/32238 for more info + + The test tries to cover most general cases of composite models, VLMs with vision and text configs. Any model + that has a different set of sub-configs has to overwrite this test. 
+ """ + if not self.has_attentions: + self.skipTest(reason="Model architecture does not support attentions") + + if not self._is_composite: + self.skipTest(f"{self.all_model_classes[0].__name__} does not support SDPA") + + for model_class in self.all_model_classes: + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + model_sdpa = model_class.from_pretrained(tmpdirname) + model_sdpa = model_sdpa.eval().to(torch_device) + + text_attn = "sdpa" if model.language_model._supports_sdpa else "eager" + vision_attn = "sdpa" if model.vision_model._supports_sdpa else "eager" + qformer_attn = "sdpa" if model.qformer._supports_sdpa else "eager" + + # `None` as it is the requested one which will be assigned to each sub-config + # Sub-model will dispatch to SDPA if it can (checked below that `SDPA` layers are present) + self.assertTrue(model.language_model.config._attn_implementation == text_attn) + self.assertTrue(model.vision_model.config._attn_implementation == vision_attn) + self.assertTrue(model.qformer.config._attn_implementation == qformer_attn) + + model_eager = model_class.from_pretrained(tmpdirname, attn_implementation="eager") + model_eager = model_eager.eval().to(torch_device) + self.assertTrue(model_eager.config._attn_implementation == "eager") + self.assertTrue(model_eager.language_model.config._attn_implementation == "eager") + self.assertTrue(model_eager.vision_model.config._attn_implementation == "eager") + self.assertTrue(model_eager.qformer.config._attn_implementation == "eager") + + for name, submodule in model_eager.named_modules(): + class_name = submodule.__class__.__name__ + if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name: + raise ValueError("The eager model should not have SDPA attention layers") + + has_sdpa = False + for name, submodule in model_sdpa.named_modules(): + class_name = submodule.__class__.__name__ + if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name: + has_sdpa = True + break + if not has_sdpa and any( + module_attn == "sdpa" for module_attn in [text_attn, vision_attn, qformer_attn] + ): + raise ValueError("The SDPA model should have SDPA attention layers") + def test_forward_signature(self): config, _ = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/tests/models/clip/test_modeling_clip.py b/tests/models/clip/test_modeling_clip.py index 88824756a6fb54..a7c8c8ef8410e8 100644 --- a/tests/models/clip/test_modeling_clip.py +++ b/tests/models/clip/test_modeling_clip.py @@ -191,6 +191,53 @@ class CLIPModelTesterMixin(ModelTesterMixin): different output logits, and are not supposed to be used or tested with padding_side="left". 
""" + def test_sdpa_can_dispatch_composite_models(self): + for model_class in self.all_model_classes: + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + + # Load the model with SDPA + model_sdpa = model_class.from_pretrained(tmpdirname) + model_sdpa = model_sdpa.eval().to(torch_device) + + # Load model with eager attention + model_eager = model_class.from_pretrained( + tmpdirname, + attn_implementation="eager", + ) + model_eager = model_eager.eval().to(torch_device) + + # SigLip has one shared cls attr for all models, so we assign both submodels heer + vision_attn = text_attn = "sdpa" if model._supports_sdpa else "eager" + + # `None` as it is the requested one which will be assigned to each sub-config + # Sub-model will dispatch to SDPA if it can (checked below that `SDPA` layers are present) + if hasattr(model_sdpa, "vision_model") and hasattr(model_sdpa, "text_model"): + self.assertTrue(model_sdpa.vision_model.config._attn_implementation == vision_attn) + self.assertTrue(model_sdpa.text_model.config._attn_implementation == text_attn) + self.assertTrue(model_eager.vision_model.config._attn_implementation == "eager") + self.assertTrue(model_eager.text_model.config._attn_implementation == "eager") + + self.assertTrue(model_sdpa.config._attn_implementation == "sdpa") + self.assertTrue(model_eager.config._attn_implementation == "eager") + + for name, submodule in model_eager.named_modules(): + class_name = submodule.__class__.__name__ + if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name: + raise ValueError("The eager model should not have SDPA attention layers") + + has_sdpa = False + for name, submodule in model_sdpa.named_modules(): + class_name = submodule.__class__.__name__ + if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name: + has_sdpa = True + break + if not has_sdpa and model_sdpa.config.model_type != "falcon": + raise ValueError("The SDPA model should have SDPA attention layers") + def test_eager_matches_sdpa_inference( self, torch_dtype: str, @@ -252,24 +299,6 @@ def get_mean_reldiff(msg, current_case, x, ref, atol, rtol): ) model_eager = model_eager.eval().to(torch_device) - self.assertTrue(model_sdpa.config._attn_implementation == "sdpa") - self.assertTrue(model_eager.config._attn_implementation == "eager") - - for name, submodule in model_eager.named_modules(): - class_name = submodule.__class__.__name__ - if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name: - raise ValueError("The eager model should not have SDPA attention layers") - - has_sdpa = False - for name, submodule in model_sdpa.named_modules(): - class_name = submodule.__class__.__name__ - if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name: - has_sdpa = True - break - - if not has_sdpa: - raise ValueError("The SDPA model should have SDPA attention layers") - # We use these for loops instead of parameterized.expand just for the interest of avoiding loading/saving the model each time, # but it would be nicer to have an efficient way to use parameterized.expand cases = [ @@ -461,6 +490,10 @@ def test_eager_matches_sdpa_inference(self, torch_dtype: str): use_attention_mask_options=(None,), ) + @require_torch_sdpa + def test_sdpa_can_dispatch_composite_models(self): + super().test_sdpa_can_dispatch_composite_models() + class CLIPTextModelTester: def __init__( @@ -639,6 +672,10 @@ def 
test_eager_matches_sdpa_inference(self, torch_dtype: str): use_attention_mask_options=(None, "right"), # "left" is not supported for text model ) + @require_torch_sdpa + def test_sdpa_can_dispatch_composite_models(self): + super().test_sdpa_can_dispatch_composite_models() + @require_torch_sdpa def test_sdpa_can_dispatch_on_flash(self): self.skipTest(reason="CLIPTextModel has two attention masks: `causal_attention_mask` and `attention_mask`") @@ -704,6 +741,7 @@ class CLIPModelTest(CLIPModelTesterMixin, PipelineTesterMixin, unittest.TestCase test_pruning = False test_resize_embeddings = False test_attention_outputs = False + _is_composite = True def setUp(self): self.model_tester = CLIPModelTester(self) @@ -975,6 +1013,10 @@ def test_eager_matches_sdpa_inference(self, torch_dtype: str): use_attention_mask_options=(None, "right"), # "left" is not supported for text model ) + @require_torch_sdpa + def test_sdpa_can_dispatch_composite_models(self): + super().test_sdpa_can_dispatch_composite_models() + @require_torch_sdpa def test_sdpa_can_dispatch_on_flash(self): self.skipTest(reason="CLIP text tower has two attention masks: `causal_attention_mask` and `attention_mask`") @@ -1104,6 +1146,7 @@ class CLIPForImageClassificationModelTest(CLIPModelTesterMixin, PipelineTesterMi test_pruning = False test_resize_embeddings = False test_attention_outputs = False + _is_composite = True def setUp(self): self.model_tester = CLIPForImageClassificationModelTester(self) @@ -1143,6 +1186,10 @@ def test_eager_matches_sdpa_inference(self, torch_dtype: str): use_attention_mask_options=(None,), ) + @require_torch_sdpa + def test_sdpa_can_dispatch_composite_models(self): + super().test_sdpa_can_dispatch_composite_models() + # We will verify our results on an image of cute cats def prepare_img(): diff --git a/tests/models/encoder_decoder/test_modeling_encoder_decoder.py b/tests/models/encoder_decoder/test_modeling_encoder_decoder.py index 5e5263b6afb98c..0ee4b75ed803e3 100644 --- a/tests/models/encoder_decoder/test_modeling_encoder_decoder.py +++ b/tests/models/encoder_decoder/test_modeling_encoder_decoder.py @@ -18,7 +18,14 @@ import unittest from transformers import is_torch_available, logging -from transformers.testing_utils import CaptureLogger, require_deterministic_for_xpu, require_torch, slow, torch_device +from transformers.testing_utils import ( + CaptureLogger, + require_deterministic_for_xpu, + require_torch, + require_torch_sdpa, + slow, + torch_device, +) from ...test_modeling_common import ids_tensor from ..bart.test_modeling_bart import BartStandaloneDecoderModelTester @@ -54,6 +61,8 @@ @require_torch class EncoderDecoderMixin: + supports_sdpa = False + def get_encoder_decoder_model(self, config, decoder_config): raise NotImplementedError @@ -670,6 +679,67 @@ def test_real_model_save_load_from_pretrained(self): max_diff = np.amax(np.abs(out_1 - out_2)) self.assertLessEqual(max_diff, 1e-5) + @require_torch_sdpa + def test_sdpa_can_dispatch_composite_models(self): + if not self.supports_sdpa: + self.skipTest("SDPA is not supported") + + inputs_dict = self.prepare_config_and_inputs() + encoder_config, decoder_config = inputs_dict["config"], inputs_dict["decoder_config"] + config = EncoderDecoderConfig.from_encoder_decoder_configs( + encoder_config=encoder_config, decoder_config=decoder_config + ) + model = EncoderDecoderModel(config=config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + model_sdpa = EncoderDecoderModel.from_pretrained(tmpdirname) + model_sdpa 
= model_sdpa.eval().to(torch_device) + + # see https://github.com/huggingface/transformers/pull/32238 + # Sub-model will dispatch to SDPA if it can (checked below that `SDPA` layers are present) + encoder_attn = "sdpa" if model.encoder._supports_sdpa else "eager" + decoder_attn = "sdpa" if model.decoder._supports_sdpa else "eager" + self.assertTrue(model_sdpa.config._attn_implementation == "sdpa") + self.assertTrue(model_sdpa.encoder.config._attn_implementation == encoder_attn) + self.assertTrue(model_sdpa.decoder.config._attn_implementation == decoder_attn) + + # Also test that nothing break if we request SDPA explicitly, when both sub-parts support it. + # If the model supports sdpa (i.e. all of sub-models supports it) we'll dispatch safely + # Otherwise we should raise error that SDPA is not supported, as some of the sub-models doesn't support + if encoder_attn == "sdpa" and decoder_attn == "sdpa": + model_sdpa_explicit = EncoderDecoderModel.from_pretrained(tmpdirname, attn_implementation="sdpa") + model_sdpa_explicit = model_sdpa_explicit.eval().to(torch_device) + + self.assertTrue(model_sdpa_explicit.config._attn_implementation == "sdpa") + else: + with self.assertRaises(ValueError): + model_sdpa_explicit = EncoderDecoderModel.from_pretrained(tmpdirname, attn_implementation="sdpa") + + model_eager = EncoderDecoderModel.from_pretrained( + tmpdirname, + attn_implementation="eager", + ) + model_eager = model_eager.eval().to(torch_device) + + self.assertTrue(model_eager.config._attn_implementation == "eager") + self.assertTrue(model_eager.encoder.config._attn_implementation == "eager") + self.assertTrue(model_eager.decoder.config._attn_implementation == "eager") + + for name, submodule in model_eager.named_modules(): + class_name = submodule.__class__.__name__ + if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name: + raise ValueError("The eager model should not have SDPA attention layers") + + has_sdpa = False + for name, submodule in model_sdpa.named_modules(): + class_name = submodule.__class__.__name__ + if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name: + has_sdpa = True + break + if not has_sdpa: + raise ValueError("The SDPA model should have SDPA attention layers") + @require_torch class BertEncoderDecoderModelTest(EncoderDecoderMixin, unittest.TestCase): @@ -949,6 +1019,8 @@ def get_pretrained_model(self): @require_torch class GPT2EncoderDecoderModelTest(EncoderDecoderMixin, unittest.TestCase): + supports_sdpa = True + def get_encoder_decoder_model(self, config, decoder_config): encoder_model = BertModel(config) decoder_model = GPT2LMHeadModel(decoder_config) diff --git a/tests/models/gemma2/test_modeling_gemma2.py b/tests/models/gemma2/test_modeling_gemma2.py index 8f9a918dca0082..94670803daa998 100644 --- a/tests/models/gemma2/test_modeling_gemma2.py +++ b/tests/models/gemma2/test_modeling_gemma2.py @@ -88,6 +88,10 @@ def setUp(self): def test_model_outputs_equivalence(self, **kwargs): pass + @unittest.skip("Gemma2's forcefully disables sdpa due to softcapping") + def test_sdpa_can_dispatch_non_composite_models(self): + pass + @parameterized.expand([("float16",), ("bfloat16",), ("float32",)]) @unittest.skip("Gemma2's eager attn/sdpa attn outputs are expected to be different") def test_eager_matches_sdpa_inference(self): diff --git a/tests/models/idefics/test_modeling_idefics.py b/tests/models/idefics/test_modeling_idefics.py index 250c47c3a7e8ce..bbade169550f8c 100644 --- a/tests/models/idefics/test_modeling_idefics.py +++ 
b/tests/models/idefics/test_modeling_idefics.py @@ -580,11 +580,9 @@ def test_model_from_pretrained(self): model = IdeficsModel.from_pretrained(model_name) self.assertIsNotNone(model) - @require_torch_sdpa - @slow - @parameterized.expand([("float16",), ("bfloat16",), ("float32",)]) - def test_eager_matches_sdpa_inference(self, torch_dtype: str): - self.skipTest(reason="Idefics has a hard requirement on SDPA, skipping this test") + @unittest.skip("Idefics has a hard requirement on SDPA") + def test_sdpa_can_dispatch_non_composite_models(self): + pass @unittest.skipIf(not is_torch_greater_or_equal_than_2_0, reason="pytorch 2.0 or higher is required") @@ -806,6 +804,10 @@ def test_training_gradient_checkpointing_use_reentrant(self): def test_training_gradient_checkpointing_use_reentrant_false(self): pass + @unittest.skip("Idefics has a hard requirement on SDPA") + def test_sdpa_can_dispatch_non_composite_models(self): + pass + @unittest.skipIf(not is_torch_greater_or_equal_than_2_0, reason="pytorch 2.0 or higher is required") @require_torch diff --git a/tests/models/idefics2/test_modeling_idefics2.py b/tests/models/idefics2/test_modeling_idefics2.py index 4071fcbb232805..854b8b934578e0 100644 --- a/tests/models/idefics2/test_modeling_idefics2.py +++ b/tests/models/idefics2/test_modeling_idefics2.py @@ -16,6 +16,7 @@ import copy import gc +import tempfile import unittest from io import BytesIO @@ -36,6 +37,7 @@ require_torch, require_torch_gpu, require_torch_multi_gpu, + require_torch_sdpa, slow, torch_device, ) @@ -180,6 +182,7 @@ class Idefics2ModelTest(ModelTesterMixin, unittest.TestCase): test_pruning = False test_resize_embeddings = True test_head_masking = False + _is_composite = True def setUp(self): self.model_tester = Idefics2VisionText2TextModelTester(self) @@ -327,6 +330,43 @@ def test_resize_embeddings_untied(self): # Check that the model can still do a forward pass successfully (every parameter should be resized) model(**self._prepare_for_class(inputs_dict, model_class)) + @require_torch_sdpa + def test_sdpa_can_dispatch_composite_models(self): + for model_class in self.all_model_classes: + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + model_sdpa = model_class.from_pretrained(tmpdirname) + model_sdpa = model_sdpa.eval().to(torch_device) + + vision_attn = None if model.vision_model._supports_sdpa else "eager" + perceiver_attn = None if model.connector.perceiver_resampler._supports_sdpa else "eager" + self.assertTrue(model_sdpa.config._attn_implementation == "sdpa") + self.assertTrue(model_sdpa.vision_model.config._attn_implementation == vision_attn) + self.assertTrue(model_sdpa.connector.perceiver_resampler.config._attn_implementation == perceiver_attn) + + model_eager = model_class.from_pretrained(tmpdirname, attn_implementation="eager") + model_eager = model_eager.eval().to(torch_device) + self.assertTrue(model_eager.config._attn_implementation == "eager") + self.assertTrue(model_eager.vision_model.config._attn_implementation == "eager") + self.assertTrue(model_sdpa.connector.perceiver_resampler.config._attn_implementation == "eager") + + for name, submodule in model_eager.named_modules(): + class_name = submodule.__class__.__name__ + if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name: + raise ValueError("The eager model should not have SDPA attention layers") + + has_sdpa = False + for name, submodule in 
model_sdpa.named_modules(): + class_name = submodule.__class__.__name__ + if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name: + has_sdpa = True + break + if not has_sdpa and model_sdpa.config.model_type != "falcon": + raise ValueError("The SDPA model should have SDPA attention layers") + @require_torch class Idefics2ForConditionalGenerationModelTest(GenerationTesterMixin, ModelTesterMixin, unittest.TestCase): diff --git a/tests/models/instructblip/test_modeling_instructblip.py b/tests/models/instructblip/test_modeling_instructblip.py index 8292567334bf3b..5182ac20cd993e 100644 --- a/tests/models/instructblip/test_modeling_instructblip.py +++ b/tests/models/instructblip/test_modeling_instructblip.py @@ -32,6 +32,7 @@ require_accelerate, require_bitsandbytes, require_torch, + require_torch_sdpa, require_vision, slow, torch_device, @@ -460,6 +461,7 @@ class InstructBlipForConditionalGenerationDecoderOnlyTest(ModelTesterMixin, Gene test_resize_embeddings = False test_attention_outputs = False test_torchscript = False + _is_composite = True def setUp(self): self.model_tester = InstructBlipForConditionalGenerationDecoderOnlyModelTester(self) @@ -529,6 +531,66 @@ def test_model_from_pretrained(self): model = InstructBlipForConditionalGeneration.from_pretrained(model_name) self.assertIsNotNone(model) + @require_torch_sdpa + def test_sdpa_can_dispatch_composite_models(self): + """ + Tests if composite models dispatch correctly on SDPA/eager when requested so when loading the model. + This tests only by looking at layer names, as usually SDPA layers are calles "SDPAAttention". + In contrast to the above test, this one checks if the "config._attn_implamentation" is a dict after the model + is loaded, because we manually replicate requested attn implementation on each sub-config when loading. + See https://github.com/huggingface/transformers/pull/32238 for more info + + The test tries to cover most general cases of composite models, VLMs with vision and text configs. Any model + that has a different set of sub-configs has to overwrite this test. 
+ """ + if not self.has_attentions: + self.skipTest(reason="Model architecture does not support attentions") + + if not self._is_composite: + self.skipTest(f"{self.all_model_classes[0].__name__} does not support SDPA") + + for model_class in self.all_model_classes: + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + model_sdpa = model_class.from_pretrained(tmpdirname) + model_sdpa = model_sdpa.eval().to(torch_device) + + text_attn = "sdpa" if model.language_model._supports_sdpa else "eager" + vision_attn = "sdpa" if model.vision_model._supports_sdpa else "eager" + qformer_attn = "sdpa" if model.qformer._supports_sdpa else "eager" + + # `None` as it is the requested one which will be assigned to each sub-config + # Sub-model will dispatch to SDPA if it can (checked below that `SDPA` layers are present) + self.assertTrue(model.language_model.config._attn_implementation == text_attn) + self.assertTrue(model.vision_model.config._attn_implementation == vision_attn) + self.assertTrue(model.qformer.config._attn_implementation == qformer_attn) + + model_eager = model_class.from_pretrained(tmpdirname, attn_implementation="eager") + model_eager = model_eager.eval().to(torch_device) + self.assertTrue(model_eager.config._attn_implementation == "eager") + self.assertTrue(model_eager.language_model.config._attn_implementation == "eager") + self.assertTrue(model_eager.vision_model.config._attn_implementation == "eager") + self.assertTrue(model_eager.qformer.config._attn_implementation == "eager") + + for name, submodule in model_eager.named_modules(): + class_name = submodule.__class__.__name__ + if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name: + raise ValueError("The eager model should not have SDPA attention layers") + + has_sdpa = False + for name, submodule in model_sdpa.named_modules(): + class_name = submodule.__class__.__name__ + if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name: + has_sdpa = True + break + if not has_sdpa and any( + module_attn == "sdpa" for module_attn in [text_attn, vision_attn, qformer_attn] + ): + raise ValueError("The SDPA model should have SDPA attention layers") + # We will verify our results on an image of cute cats def prepare_img(): diff --git a/tests/models/instructblipvideo/test_modeling_instructblipvideo.py b/tests/models/instructblipvideo/test_modeling_instructblipvideo.py index 8a9326c22ac11c..298c7a8d7ff46f 100644 --- a/tests/models/instructblipvideo/test_modeling_instructblipvideo.py +++ b/tests/models/instructblipvideo/test_modeling_instructblipvideo.py @@ -32,6 +32,7 @@ require_accelerate, require_bitsandbytes, require_torch, + require_torch_sdpa, require_vision, slow, torch_device, @@ -481,6 +482,7 @@ class InstructBlipVideoForConditionalGenerationDecoderOnlyTest( test_resize_embeddings = False test_attention_outputs = False test_torchscript = False + _is_composite = True def setUp(self): self.model_tester = InstructBlipVideoForConditionalGenerationDecoderOnlyModelTester(self) @@ -550,6 +552,66 @@ def test_model_from_pretrained(self): model = InstructBlipVideoForConditionalGeneration.from_pretrained(model_name) self.assertIsNotNone(model) + @require_torch_sdpa + def test_sdpa_can_dispatch_composite_models(self): + """ + Tests if composite models dispatch correctly on SDPA/eager when requested so when loading the model. 
+ This tests only by looking at layer names, as usually SDPA layers are calles "SDPAAttention". + In contrast to the above test, this one checks if the "config._attn_implamentation" is a dict after the model + is loaded, because we manually replicate requested attn implementation on each sub-config when loading. + See https://github.com/huggingface/transformers/pull/32238 for more info + + The test tries to cover most general cases of composite models, VLMs with vision and text configs. Any model + that has a different set of sub-configs has to overwrite this test. + """ + if not self.has_attentions: + self.skipTest(reason="Model architecture does not support attentions") + + if not self._is_composite: + self.skipTest(f"{self.all_model_classes[0].__name__} does not support SDPA") + + for model_class in self.all_model_classes: + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + model_sdpa = model_class.from_pretrained(tmpdirname) + model_sdpa = model_sdpa.eval().to(torch_device) + + text_attn = "sdpa" if model.language_model._supports_sdpa else "eager" + vision_attn = "sdpa" if model.vision_model._supports_sdpa else "eager" + qformer_attn = "sdpa" if model.qformer._supports_sdpa else "eager" + + # `None` as it is the requested one which will be assigned to each sub-config + # Sub-model will dispatch to SDPA if it can (checked below that `SDPA` layers are present) + self.assertTrue(model.language_model.config._attn_implementation == text_attn) + self.assertTrue(model.vision_model.config._attn_implementation == vision_attn) + self.assertTrue(model.qformer.config._attn_implementation == qformer_attn) + + model_eager = model_class.from_pretrained(tmpdirname, attn_implementation="eager") + model_eager = model_eager.eval().to(torch_device) + self.assertTrue(model_eager.config._attn_implementation == "eager") + self.assertTrue(model_eager.language_model.config._attn_implementation == "eager") + self.assertTrue(model_eager.vision_model.config._attn_implementation == "eager") + self.assertTrue(model_eager.qformer.config._attn_implementation == "eager") + + for name, submodule in model_eager.named_modules(): + class_name = submodule.__class__.__name__ + if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name: + raise ValueError("The eager model should not have SDPA attention layers") + + has_sdpa = False + for name, submodule in model_sdpa.named_modules(): + class_name = submodule.__class__.__name__ + if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name: + has_sdpa = True + break + if not has_sdpa and any( + module_attn == "sdpa" for module_attn in [text_attn, vision_attn, qformer_attn] + ): + raise ValueError("The SDPA model should have SDPA attention layers") + # We will verify our results on an image of cute cats def prepare_video(): diff --git a/tests/models/kosmos2/test_modeling_kosmos2.py b/tests/models/kosmos2/test_modeling_kosmos2.py index 22cbffcfdb6b13..de6c0b15d661f9 100644 --- a/tests/models/kosmos2/test_modeling_kosmos2.py +++ b/tests/models/kosmos2/test_modeling_kosmos2.py @@ -25,8 +25,17 @@ from transformers import AutoModelForImageTextToText, AutoProcessor, Kosmos2Config from transformers.models.kosmos2.configuration_kosmos2 import Kosmos2TextConfig, Kosmos2VisionConfig -from transformers.testing_utils import IS_ROCM_SYSTEM, require_torch, require_vision, slow, torch_device -from transformers.utils import 
is_torch_available, is_vision_available +from transformers.testing_utils import ( + IS_ROCM_SYSTEM, + require_torch, + require_vision, + slow, + torch_device, +) +from transformers.utils import ( + is_torch_available, + is_vision_available, +) from ...test_configuration_common import ConfigTester from ...test_modeling_common import ( @@ -257,6 +266,7 @@ class Kosmos2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase) test_pruning = False test_resize_embeddings = False test_attention_outputs = False + _is_composite = True # TODO: `image-to-text` pipeline for this model needs Processor. def is_pipeline_test_to_skip( diff --git a/tests/models/llava/test_modeling_llava.py b/tests/models/llava/test_modeling_llava.py index 07415900bb93db..405fad1bd31c8d 100644 --- a/tests/models/llava/test_modeling_llava.py +++ b/tests/models/llava/test_modeling_llava.py @@ -186,6 +186,7 @@ class LlavaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTesterM pipeline_model_mapping = {"image-to-text": LlavaForConditionalGeneration} if is_torch_available() else {} test_pruning = False test_head_masking = False + _is_composite = True def setUp(self): self.model_tester = LlavaVisionText2TextModelTester(self) @@ -260,6 +261,16 @@ def test_sdpa_can_compile_dynamic(self): def test_sdpa_can_dispatch_on_flash(self): pass + @unittest.skip("FlashAttention only support fp16 and bf16 data type") + def test_flash_attn_2_fp32_ln(self): + pass + + @unittest.skip( + "VLMs need lots of steps to prepare images/mask correctly to get pad-free inputs. Can be tested as part of LLM test" + ) + def test_flash_attention_2_padding_matches_padding_free_with_position_ids(self): + pass + @require_torch class LlavaForConditionalGenerationIntegrationTest(unittest.TestCase): diff --git a/tests/models/llava_next/test_modeling_llava_next.py b/tests/models/llava_next/test_modeling_llava_next.py index a54aeab8a28252..6589bf14d24c65 100644 --- a/tests/models/llava_next/test_modeling_llava_next.py +++ b/tests/models/llava_next/test_modeling_llava_next.py @@ -218,6 +218,7 @@ class LlavaNextForConditionalGenerationModelTest(ModelTesterMixin, GenerationTes all_generative_model_classes = (LlavaNextForConditionalGeneration,) if is_torch_available() else () test_pruning = False test_head_masking = False + _is_composite = True def setUp(self): self.model_tester = LlavaNextVisionText2TextModelTester(self) @@ -316,6 +317,16 @@ def test_sdpa_can_compile_dynamic(self): def test_sdpa_can_dispatch_on_flash(self): pass + @unittest.skip("FlashAttention only support fp16 and bf16 data type") + def test_flash_attn_2_fp32_ln(self): + pass + + @unittest.skip( + "VLMs need lots of steps to prepare images/mask correctly to get pad-free inputs. 
Can be tested as part of LLM test" + ) + def test_flash_attention_2_padding_matches_padding_free_with_position_ids(self): + pass + @require_torch class LlavaNextForConditionalGenerationIntegrationTest(unittest.TestCase): diff --git a/tests/models/llava_next_video/test_modeling_llava_next_video.py b/tests/models/llava_next_video/test_modeling_llava_next_video.py index 30eaa7fb050c7c..05fc8a49e1e9b9 100644 --- a/tests/models/llava_next_video/test_modeling_llava_next_video.py +++ b/tests/models/llava_next_video/test_modeling_llava_next_video.py @@ -236,6 +236,7 @@ class LlavaNextVideoForConditionalGenerationModelTest(ModelTesterMixin, Generati all_generative_model_classes = (LlavaNextVideoForConditionalGeneration,) if is_torch_available() else () test_pruning = False test_head_masking = False + _is_composite = True def setUp(self): self.model_tester = LlavaNextVideoVisionText2TextModelTester(self) @@ -340,6 +341,16 @@ def test_sdpa_can_compile_dynamic(self): def test_sdpa_can_dispatch_on_flash(self): pass + @unittest.skip("FlashAttention only support fp16 and bf16 data type") + def test_flash_attn_2_fp32_ln(self): + pass + + @unittest.skip( + "VLMs need lots of steps to prepare images/mask correctly to get pad-free inputs. Can be tested as part of LLM test" + ) + def test_flash_attention_2_padding_matches_padding_free_with_position_ids(self): + pass + @require_torch class LlavaNextVideoForConditionalGenerationIntegrationTest(unittest.TestCase): diff --git a/tests/models/llava_onevision/test_modeling_llava_onevision.py b/tests/models/llava_onevision/test_modeling_llava_onevision.py index 0e9c88cb3463fd..0a33898b63072b 100644 --- a/tests/models/llava_onevision/test_modeling_llava_onevision.py +++ b/tests/models/llava_onevision/test_modeling_llava_onevision.py @@ -219,6 +219,7 @@ class LlavaOnevisionForConditionalGenerationModelTest(ModelTesterMixin, Generati all_generative_model_classes = (LlavaOnevisionForConditionalGeneration,) if is_torch_available() else () test_pruning = False test_head_masking = False + _is_composite = True def setUp(self): self.model_tester = LlavaOnevisionVisionText2TextModelTester(self) @@ -306,6 +307,16 @@ def test_training_gradient_checkpointing_use_reentrant_false(self): def test_assisted_decoding_with_num_logits_to_keep(self): pass + @unittest.skip("FlashAttention only support fp16 and bf16 data type") + def test_flash_attn_2_fp32_ln(self): + pass + + @unittest.skip( + "VLMs need lots of steps to prepare images/mask correctly to get pad-free inputs. 
Can be tested as part of LLM test" + ) + def test_flash_attention_2_padding_matches_padding_free_with_position_ids(self): + pass + @require_torch class LlavaOnevisionForConditionalGenerationIntegrationTest(unittest.TestCase): diff --git a/tests/models/mllama/test_modeling_mllama.py b/tests/models/mllama/test_modeling_mllama.py index 5c5ca3985ee08f..fafa2f71331ba3 100644 --- a/tests/models/mllama/test_modeling_mllama.py +++ b/tests/models/mllama/test_modeling_mllama.py @@ -274,6 +274,7 @@ class MllamaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTester test_pruning = False test_head_masking = False test_torchscript = False + _is_composite = True def setUp(self): self.model_tester = MllamaVisionText2TextModelTester(self) diff --git a/tests/models/musicgen/test_modeling_musicgen.py b/tests/models/musicgen/test_modeling_musicgen.py index cc30238c8df9f5..438178bfc6faa2 100644 --- a/tests/models/musicgen/test_modeling_musicgen.py +++ b/tests/models/musicgen/test_modeling_musicgen.py @@ -654,8 +654,6 @@ def get_mean_reldiff(failcase, x, ref, atol, rtol): model_sdpa = model_class.from_pretrained(tmpdirname, torch_dtype=torch_dtype) model_sdpa = model_sdpa.eval().to(torch_device) - self.assertTrue(model_sdpa.config._attn_implementation == "sdpa") - model_eager = model_class.from_pretrained( tmpdirname, torch_dtype=torch_dtype, @@ -663,20 +661,6 @@ def get_mean_reldiff(failcase, x, ref, atol, rtol): ) model_eager = model_eager.eval().to(torch_device) - self.assertTrue(model_eager.config._attn_implementation == "eager") - - for name, submodule in model_eager.named_modules(): - if "SdpaAttention" in submodule.__class__.__name__: - raise ValueError("The eager model should not have SDPA attention layers") - - has_sdpa = False - for name, submodule in model_sdpa.named_modules(): - if "SdpaAttention" in submodule.__class__.__name__: - has_sdpa = True - break - if not has_sdpa and model_sdpa.config.model_type != "falcon": - raise ValueError("The SDPA model should have SDPA attention layers") - # We use these for loops instead of parameterized.expand just for the interest of avoiding loading/saving 8 times the model, # but it would be nicer to have an efficient way to use parameterized.expand fail_cases = [] @@ -1042,6 +1026,7 @@ class MusicgenTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, # not to test torchscript as the model tester doesn't prepare `input_values` and `padding_mask` # (and `torchscript` hates `None` values). 
test_torchscript = False + _is_composite = True def setUp(self): self.model_tester = MusicgenTester(self) @@ -1420,7 +1405,7 @@ def test_save_load_fast_init_from_base(self): @require_torch_gpu @mark.flash_attn_test @slow - # Copied from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_inference_equivalence + # Adapted from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_inference_equivalence def test_flash_attn_2_inference_equivalence(self): for model_class in self.all_model_classes: if not model_class._supports_flash_attn_2: @@ -1432,7 +1417,9 @@ def test_flash_attn_2_inference_equivalence(self): with tempfile.TemporaryDirectory() as tmpdirname: model.save_pretrained(tmpdirname) model_fa = model_class.from_pretrained( - tmpdirname, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2" + tmpdirname, + torch_dtype=torch.bfloat16, + attn_implementation={"decoder": "flash_attention_2", "audio_encoder": None, "text_encoder": None}, ) model_fa.to(torch_device) @@ -1505,7 +1492,88 @@ def test_flash_attn_2_inference_equivalence(self): @require_torch_gpu @mark.flash_attn_test @slow - # Copied from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_inference_equivalence_right_padding + def test_flash_attn_2_conversion(self): + if not self.has_attentions: + self.skipTest(reason="Model architecture does not support attentions") + + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + if not model_class._supports_flash_attn_2: + self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") + + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + model = model_class.from_pretrained( + tmpdirname, + torch_dtype=torch.float16, + attn_implementation={"decoder": "flash_attention_2", "audio_encoder": None, "text_encoder": None}, + ).to(torch_device) + + for _, module in model.named_modules(): + if "FlashAttention" in module.__class__.__name__: + return + + self.assertTrue(False, "FlashAttention2 modules not found in model") + + @require_torch_sdpa + @require_torch_gpu + @slow + def test_sdpa_can_dispatch_on_flash(self): + if not self.has_attentions: + self.skipTest(reason="Model architecture does not support attentions") + + torch.compiler.reset() + compute_capability = torch.cuda.get_device_capability() + major, _ = compute_capability + + if not torch.version.cuda or major < 8: + self.skipTest(reason="This test requires an NVIDIA GPU with compute capability >= 8.0") + + for model_class in self.all_model_classes: + if not model_class._supports_sdpa: + self.skipTest(f"{model_class.__name__} does not support SDPA") + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + inputs_dict = self._prepare_for_class(inputs_dict, model_class) + if config.model_type in ["llava", "llava_next", "vipllava", "video_llava"]: + self.skipTest( + reason="Llava-like models currently (transformers==4.39.1) requires an attention_mask input" + ) + if config.model_type in ["paligemma"]: + self.skipTest( + "PaliGemma-like models currently (transformers==4.41.0) requires an attention_mask input" + ) + if config.model_type in ["idefics", "idefics2", "idefics3"]: + self.skipTest(reason="Idefics currently (transformers==4.39.1) requires an image_attention_mask input") + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + model = 
model_class.from_pretrained( + tmpdirname, + torch_dtype=torch.float16, + attn_implementation={"decoder": "sdpa", "audio_encoder": None, "text_encoder": None}, + ) + model.to(torch_device) + + inputs_dict.pop("attention_mask", None) + inputs_dict.pop("decoder_attention_mask", None) + + for name, inp in inputs_dict.items(): + if isinstance(inp, torch.Tensor) and inp.dtype in [torch.float32, torch.float16]: + inputs_dict[name] = inp.to(torch.float16) + + with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False): + _ = model(**inputs_dict) + + @require_flash_attn + @require_torch_gpu + @mark.flash_attn_test + @slow + # Adapted from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_inference_equivalence_right_padding def test_flash_attn_2_inference_equivalence_right_padding(self): for model_class in self.all_model_classes: if not model_class._supports_flash_attn_2: @@ -1517,7 +1585,9 @@ def test_flash_attn_2_inference_equivalence_right_padding(self): with tempfile.TemporaryDirectory() as tmpdirname: model.save_pretrained(tmpdirname) model_fa = model_class.from_pretrained( - tmpdirname, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2" + tmpdirname, + torch_dtype=torch.bfloat16, + attn_implementation={"decoder": "flash_attention_2", "audio_encoder": None, "text_encoder": None}, ) model_fa.to(torch_device) @@ -1587,7 +1657,7 @@ def test_flash_attn_2_inference_equivalence_right_padding(self): @require_torch_gpu @mark.flash_attn_test @slow - # Copied from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_generate_left_padding + # Adapted from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_generate_left_padding def test_flash_attn_2_generate_left_padding(self): # Ignore copy for model_class in self.greedy_sample_model_classes: @@ -1622,7 +1692,7 @@ def test_flash_attn_2_generate_left_padding(self): model = model_class.from_pretrained( tmpdirname, torch_dtype=torch.float16, - attn_implementation="flash_attention_2", + attn_implementation={"decoder": "flash_attention_2", "audio_encoder": None, "text_encoder": None}, low_cpu_mem_usage=True, ).to(torch_device) @@ -1636,7 +1706,7 @@ def test_flash_attn_2_generate_left_padding(self): @require_torch_gpu @mark.flash_attn_test @slow - # Copied from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_generate_padding_right + # Adapted from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_generate_padding_right def test_flash_attn_2_generate_padding_right(self): # Ignore copy for model_class in self.greedy_sample_model_classes: @@ -1670,7 +1740,7 @@ def test_flash_attn_2_generate_padding_right(self): model = model_class.from_pretrained( tmpdirname, torch_dtype=torch.float16, - attn_implementation="flash_attention_2", + attn_implementation={"decoder": "flash_attention_2", "audio_encoder": None, "text_encoder": None}, low_cpu_mem_usage=True, ).to(torch_device) @@ -1684,7 +1754,7 @@ def test_flash_attn_2_generate_padding_right(self): @require_torch_gpu @mark.flash_attn_test @slow - # Copied from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_generate_use_cache + # Adapted from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_generate_use_cache def test_flash_attn_2_generate_use_cache(self): max_new_tokens = 30 @@ -1713,7 +1783,7 @@ def test_flash_attn_2_generate_use_cache(self): model = model_class.from_pretrained( tmpdirname, torch_dtype=torch.float16, - attn_implementation="flash_attention_2", + 
attn_implementation={"decoder": "flash_attention_2", "audio_encoder": None, "text_encoder": None}, low_cpu_mem_usage=True, ).to(torch_device) @@ -1726,6 +1796,53 @@ def test_flash_attn_2_generate_use_cache(self): use_cache=True, ) + @require_torch_sdpa + def test_sdpa_can_dispatch_composite_models(self): + if not self.has_attentions: + self.skipTest(reason="Model architecture does not support attentions") + + if not self._is_composite: + self.skipTest(f"{self.all_model_classes[0].__name__} does not support SDPA") + + for model_class in self.all_model_classes: + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + model_sdpa = model_class.from_pretrained(tmpdirname) + model_sdpa = model_sdpa.eval().to(torch_device) + + audio_encoder_attn = "sdpa" if model.audio_encoder._supports_sdpa else "eager" + text_encoder_attn = "sdpa" if model.text_encoder._supports_sdpa else "eager" + decoder_attn = "sdpa" if model.decoder._supports_sdpa else "eager" + + # `None` as it is the requested one which will be assigned to each sub-config + # Sub-model will dispatch to SDPA if it can (checked below that `SDPA` layers are present) + self.assertTrue(model_sdpa.audio_encoder.config._attn_implementation == audio_encoder_attn) + self.assertTrue(model_sdpa.text_encoder.config._attn_implementation == text_encoder_attn) + self.assertTrue(model_sdpa.decoder.config._attn_implementation == decoder_attn) + self.assertTrue(model_sdpa.config._attn_implementation == "sdpa") + model_eager = model_class.from_pretrained(tmpdirname, attn_implementation="eager") + model_eager = model_eager.eval().to(torch_device) + + self.assertTrue(model_eager.audio_encoder.config._attn_implementation == "eager") + self.assertTrue(model_eager.text_encoder.config._attn_implementation == "eager") + self.assertTrue(model_eager.decoder.config._attn_implementation == "eager") + self.assertTrue(model_eager.config._attn_implementation == "eager") + + for name, submodule in model_eager.named_modules(): + if "SdpaAttention" in submodule.__class__.__name__: + raise ValueError("The eager model should not have SDPA attention layers") + + has_sdpa = False + for name, submodule in model_sdpa.named_modules(): + if "SdpaAttention" in submodule.__class__.__name__: + has_sdpa = True + break + if not has_sdpa and model_sdpa.config.model_type != "falcon": + raise ValueError("The SDPA model should have SDPA attention layers") + @parameterized.expand([("float16",), ("bfloat16",), ("float32",)]) @require_torch_sdpa @slow @@ -1792,8 +1909,6 @@ def get_mean_reldiff(failcase, x, ref, atol, rtol): model_sdpa = model_class.from_pretrained(tmpdirname, torch_dtype=torch_dtype) model_sdpa = model_sdpa.eval().to(torch_device) - self.assertTrue(model_sdpa.config._attn_implementation == "sdpa") - model_eager = model_class.from_pretrained( tmpdirname, torch_dtype=torch_dtype, @@ -1801,20 +1916,6 @@ def get_mean_reldiff(failcase, x, ref, atol, rtol): ) model_eager = model_eager.eval().to(torch_device) - self.assertTrue(model_eager.config._attn_implementation == "eager") - - for name, submodule in model_eager.named_modules(): - if "SdpaAttention" in submodule.__class__.__name__: - raise ValueError("The eager model should not have SDPA attention layers") - - has_sdpa = False - for name, submodule in model_sdpa.named_modules(): - if "SdpaAttention" in submodule.__class__.__name__: - has_sdpa = True - break - if not has_sdpa and 
model_sdpa.config.model_type != "falcon": - raise ValueError("The SDPA model should have SDPA attention layers") - # We use these for loops instead of parameterized.expand just for the interest of avoiding loading/saving 8 times the model, # but it would be nicer to have an efficient way to use parameterized.expand fail_cases = [] diff --git a/tests/models/musicgen_melody/test_modeling_musicgen_melody.py b/tests/models/musicgen_melody/test_modeling_musicgen_melody.py index 35af9fe0768da8..f53fc21ba80c09 100644 --- a/tests/models/musicgen_melody/test_modeling_musicgen_melody.py +++ b/tests/models/musicgen_melody/test_modeling_musicgen_melody.py @@ -311,7 +311,9 @@ def test_flash_attn_2_inference_equivalence(self): with tempfile.TemporaryDirectory() as tmpdirname: model.save_pretrained(tmpdirname) model_fa = model_class.from_pretrained( - tmpdirname, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2" + tmpdirname, + torch_dtype=torch.bfloat16, + attn_implementation="flash_attention_2", ) model_fa.to(torch_device) @@ -391,7 +393,9 @@ def test_flash_attn_2_inference_equivalence_right_padding(self): with tempfile.TemporaryDirectory() as tmpdirname: model.save_pretrained(tmpdirname) model_fa = model_class.from_pretrained( - tmpdirname, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2" + tmpdirname, + torch_dtype=torch.bfloat16, + attn_implementation="flash_attention_2", ) model_fa.to(torch_device) @@ -454,148 +458,10 @@ def test_flash_attn_2_inference_equivalence_right_padding(self): assert torch.allclose(logits_fa[:-1], logits[:-1], atol=4e-2, rtol=4e-2) - @require_flash_attn - @require_torch_gpu - @mark.flash_attn_test - @slow - # Copied from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_generate_left_padding - def test_flash_attn_2_generate_left_padding(self): - # Ignore copy - for model_class in self.greedy_sample_model_classes: - if not model_class._supports_flash_attn_2: - self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True).to( - torch_device - ) - - dummy_input = inputs_dict[model.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) - # make sure we do left padding - dummy_attention_mask[:, :-1] = 0 - dummy_attention_mask[:, -1:] = 1 - - out = model.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=8, do_sample=False - ) - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - out_fa = model.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=8, do_sample=False - ) - - self.assertTrue(torch.allclose(out, out_fa)) - - @require_flash_attn - @require_torch_gpu - @mark.flash_attn_test - @slow - # Copied from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_generate_padding_right - def test_flash_attn_2_generate_padding_right(self): - # Ignore copy - for model_class in self.greedy_sample_model_classes: - if not model_class._supports_flash_attn_2: - 
self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True).to( - torch_device - ) - - dummy_input = inputs_dict[model.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) - # make sure we do right padding - dummy_attention_mask[:, :-1] = 1 - dummy_attention_mask[:, -1:] = 0 - - out = model.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=8, do_sample=False - ) - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - out_fa = model.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=8, do_sample=False - ) - - self.assertTrue(torch.allclose(out, out_fa)) - - @require_flash_attn - @require_torch_gpu - @mark.flash_attn_test - @slow - # Copied from tests.models.musicgen.test_modeling_musicgen.MusicgenDecoderTest.test_flash_attn_2_generate_use_cache - def test_flash_attn_2_generate_use_cache(self): - max_new_tokens = 30 - - # Ignore copy - for model_class in self.greedy_sample_model_classes: - if not model_class._supports_flash_attn_2: - self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - dummy_input = inputs_dict[model_class.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - # make sure that all models have enough positions for generation - if hasattr(config, "max_position_embeddings"): - config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1 - - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - - dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - # Just test that a large cache works as expected - _ = model.generate( - dummy_input, - attention_mask=dummy_attention_mask, - max_new_tokens=max_new_tokens, - do_sample=False, - use_cache=True, - ) - @parameterized.expand([("float16",), ("bfloat16",), ("float32",)]) @require_torch_sdpa @slow - # Copied from tests.models.musicgen.test_modeling_musicgen.MusicgenDecoderTest.test_eager_matches_sdpa_inference + # Copied from tests.test_modeling_common.ModelTesterMixin.test_eager_matches_sdpa_inference def test_eager_matches_sdpa_inference(self, torch_dtype: str): if not self.has_attentions: self.skipTest(reason="Model architecture does not support attentions") @@ -658,8 +524,6 @@ def get_mean_reldiff(failcase, x, ref, atol, rtol): model_sdpa = model_class.from_pretrained(tmpdirname, torch_dtype=torch_dtype) model_sdpa = model_sdpa.eval().to(torch_device) - self.assertTrue(model_sdpa.config._attn_implementation == "sdpa") - model_eager = model_class.from_pretrained( tmpdirname, torch_dtype=torch_dtype, @@ -667,20 +531,6 @@ def 
get_mean_reldiff(failcase, x, ref, atol, rtol): ) model_eager = model_eager.eval().to(torch_device) - self.assertTrue(model_eager.config._attn_implementation == "eager") - - for name, submodule in model_eager.named_modules(): - if "SdpaAttention" in submodule.__class__.__name__: - raise ValueError("The eager model should not have SDPA attention layers") - - has_sdpa = False - for name, submodule in model_sdpa.named_modules(): - if "SdpaAttention" in submodule.__class__.__name__: - has_sdpa = True - break - if not has_sdpa and model_sdpa.config.model_type != "falcon": - raise ValueError("The SDPA model should have SDPA attention layers") - # We use these for loops instead of parameterized.expand just for the interest of avoiding loading/saving 8 times the model, # but it would be nicer to have an efficient way to use parameterized.expand fail_cases = [] @@ -839,74 +689,6 @@ def get_mean_reldiff(failcase, x, ref, atol, rtol): self.assertTrue(len(fail_cases) == 0, "\n".join(fail_cases)) - @require_torch_sdpa - @slow - # Copied from tests.models.musicgen.test_modeling_musicgen.MusicgenDecoderTest.test_eager_matches_sdpa_generate - def test_eager_matches_sdpa_generate(self): - max_new_tokens = 30 - - # Ignore copy - for model_class in self.greedy_sample_model_classes: - if not model_class._supports_sdpa: - self.skipTest(f"{model_class.__name__} does not support SDPA") - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - dummy_input = inputs_dict[model_class.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - # make sure that all models have enough positions for generation - if hasattr(config, "max_position_embeddings"): - config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1 - - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - - dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) - - model_sdpa = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - low_cpu_mem_usage=True, - ).to(torch_device) - - self.assertTrue(model_sdpa.config._attn_implementation == "sdpa") - - model_eager = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - low_cpu_mem_usage=True, - attn_implementation="eager", - ).to(torch_device) - - self.assertTrue(model_eager.config._attn_implementation == "eager") - - for name, submodule in model_eager.named_modules(): - if "SdpaAttention" in submodule.__class__.__name__: - raise ValueError("The eager model should not have SDPA attention layers") - - has_sdpa = False - for name, submodule in model_sdpa.named_modules(): - if "SdpaAttention" in submodule.__class__.__name__: - has_sdpa = True - break - if not has_sdpa: - raise ValueError("The SDPA model should have SDPA attention layers") - - # Just test that a large cache works as expected - res_eager = model_eager.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=max_new_tokens, do_sample=False - ) - - res_sdpa = model_sdpa.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=max_new_tokens, do_sample=False - ) - - self.assertTrue(torch.allclose(res_eager, res_sdpa)) - def prepare_musicgen_melody_inputs_dict( config, @@ -1048,6 +830,7 @@ class MusicgenMelodyTest(ModelTesterMixin, GenerationTesterMixin, PipelineTester # not to test torchscript as the model tester doesn't prepare `input_features` and 
`padding_mask` # (and `torchscript` hates `None` values). test_torchscript = False + _is_composite = True def setUp(self): self.model_tester = MusicgenMelodyTester(self) @@ -1406,7 +1189,7 @@ def test_save_load_fast_init_from_base(self): @require_torch_gpu @mark.flash_attn_test @slow - # Copied from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_inference_equivalence + # Adapted from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_inference_equivalence def test_flash_attn_2_inference_equivalence(self): for model_class in self.all_model_classes: if not model_class._supports_flash_attn_2: @@ -1418,7 +1201,9 @@ def test_flash_attn_2_inference_equivalence(self): with tempfile.TemporaryDirectory() as tmpdirname: model.save_pretrained(tmpdirname) model_fa = model_class.from_pretrained( - tmpdirname, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2" + tmpdirname, + torch_dtype=torch.bfloat16, + attn_implementation={"decoder": "flash_attention_2", "audio_encoder": None, "text_encoder": None}, ) model_fa.to(torch_device) @@ -1491,7 +1276,88 @@ def test_flash_attn_2_inference_equivalence(self): @require_torch_gpu @mark.flash_attn_test @slow - # Copied from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_inference_equivalence_right_padding + def test_flash_attn_2_conversion(self): + if not self.has_attentions: + self.skipTest(reason="Model architecture does not support attentions") + + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + if not model_class._supports_flash_attn_2: + self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") + + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + model = model_class.from_pretrained( + tmpdirname, + torch_dtype=torch.float16, + attn_implementation={"decoder": "flash_attention_2", "audio_encoder": None, "text_encoder": None}, + ).to(torch_device) + + for _, module in model.named_modules(): + if "FlashAttention" in module.__class__.__name__: + return + + self.assertTrue(False, "FlashAttention2 modules not found in model") + + @require_torch_sdpa + @require_torch_gpu + @slow + def test_sdpa_can_dispatch_on_flash(self): + if not self.has_attentions: + self.skipTest(reason="Model architecture does not support attentions") + + torch.compiler.reset() + compute_capability = torch.cuda.get_device_capability() + major, _ = compute_capability + + if not torch.version.cuda or major < 8: + self.skipTest(reason="This test requires an NVIDIA GPU with compute capability >= 8.0") + + for model_class in self.all_model_classes: + if not model_class._supports_sdpa: + self.skipTest(f"{model_class.__name__} does not support SDPA") + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + inputs_dict = self._prepare_for_class(inputs_dict, model_class) + if config.model_type in ["llava", "llava_next", "vipllava", "video_llava"]: + self.skipTest( + reason="Llava-like models currently (transformers==4.39.1) requires an attention_mask input" + ) + if config.model_type in ["paligemma"]: + self.skipTest( + "PaliGemma-like models currently (transformers==4.41.0) requires an attention_mask input" + ) + if config.model_type in ["idefics", "idefics2", "idefics3"]: + self.skipTest(reason="Idefics currently (transformers==4.39.1) requires an image_attention_mask input") + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + 
model.save_pretrained(tmpdirname) + model = model_class.from_pretrained( + tmpdirname, + torch_dtype=torch.float16, + attn_implementation={"decoder": "sdpa", "audio_encoder": None, "text_encoder": None}, + ) + model.to(torch_device) + + inputs_dict.pop("attention_mask", None) + inputs_dict.pop("decoder_attention_mask", None) + + for name, inp in inputs_dict.items(): + if isinstance(inp, torch.Tensor) and inp.dtype in [torch.float32, torch.float16]: + inputs_dict[name] = inp.to(torch.float16) + + with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False): + _ = model(**inputs_dict) + + @require_flash_attn + @require_torch_gpu + @mark.flash_attn_test + @slow + # Adapted from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_inference_equivalence_right_padding def test_flash_attn_2_inference_equivalence_right_padding(self): for model_class in self.all_model_classes: if not model_class._supports_flash_attn_2: @@ -1503,7 +1369,9 @@ def test_flash_attn_2_inference_equivalence_right_padding(self): with tempfile.TemporaryDirectory() as tmpdirname: model.save_pretrained(tmpdirname) model_fa = model_class.from_pretrained( - tmpdirname, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2" + tmpdirname, + torch_dtype=torch.bfloat16, + attn_implementation={"decoder": "flash_attention_2", "audio_encoder": None, "text_encoder": None}, ) model_fa.to(torch_device) @@ -1573,7 +1441,7 @@ def test_flash_attn_2_inference_equivalence_right_padding(self): @require_torch_gpu @mark.flash_attn_test @slow - # Copied from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_generate_left_padding + # Adapted from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_generate_left_padding def test_flash_attn_2_generate_left_padding(self): # Ignore copy for model_class in self.greedy_sample_model_classes: @@ -1608,7 +1476,7 @@ def test_flash_attn_2_generate_left_padding(self): model = model_class.from_pretrained( tmpdirname, torch_dtype=torch.float16, - attn_implementation="flash_attention_2", + attn_implementation={"decoder": "flash_attention_2", "audio_encoder": None, "text_encoder": None}, low_cpu_mem_usage=True, ).to(torch_device) @@ -1622,7 +1490,7 @@ def test_flash_attn_2_generate_left_padding(self): @require_torch_gpu @mark.flash_attn_test @slow - # Copied from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_generate_padding_right + # Adapted from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_generate_padding_right def test_flash_attn_2_generate_padding_right(self): # Ignore copy for model_class in self.greedy_sample_model_classes: @@ -1656,7 +1524,7 @@ def test_flash_attn_2_generate_padding_right(self): model = model_class.from_pretrained( tmpdirname, torch_dtype=torch.float16, - attn_implementation="flash_attention_2", + attn_implementation={"decoder": "flash_attention_2", "audio_encoder": None, "text_encoder": None}, low_cpu_mem_usage=True, ).to(torch_device) @@ -1670,7 +1538,7 @@ def test_flash_attn_2_generate_padding_right(self): @require_torch_gpu @mark.flash_attn_test @slow - # Copied from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_generate_use_cache + # Adapted from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_generate_use_cache def test_flash_attn_2_generate_use_cache(self): max_new_tokens = 30 @@ -1699,7 +1567,7 @@ def test_flash_attn_2_generate_use_cache(self): model = model_class.from_pretrained( tmpdirname, torch_dtype=torch.float16, - 
attn_implementation="flash_attention_2", + attn_implementation={"decoder": "flash_attention_2", "audio_encoder": None, "text_encoder": None}, low_cpu_mem_usage=True, ).to(torch_device) @@ -1712,6 +1580,53 @@ def test_flash_attn_2_generate_use_cache(self): use_cache=True, ) + @require_torch_sdpa + def test_sdpa_can_dispatch_composite_models(self): + if not self.has_attentions: + self.skipTest(reason="Model architecture does not support attentions") + + if not self._is_composite: + self.skipTest(f"{self.all_model_classes[0].__name__} does not support SDPA") + + for model_class in self.all_model_classes: + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + model_sdpa = model_class.from_pretrained(tmpdirname) + model_sdpa = model_sdpa.eval().to(torch_device) + + audio_encoder_attn = "sdpa" if model.audio_encoder._supports_sdpa else "eager" + text_encoder_attn = "sdpa" if model.text_encoder._supports_sdpa else "eager" + decoder_attn = "sdpa" if model.decoder._supports_sdpa else "eager" + + # `None` as it is the requested one which will be assigned to each sub-config + # Sub-model will dispatch to SDPA if it can (checked below that `SDPA` layers are present) + self.assertTrue(model_sdpa.audio_encoder.config._attn_implementation == audio_encoder_attn) + self.assertTrue(model_sdpa.text_encoder.config._attn_implementation == text_encoder_attn) + self.assertTrue(model_sdpa.decoder.config._attn_implementation == decoder_attn) + self.assertTrue(model_sdpa.config._attn_implementation == "sdpa") + model_eager = model_class.from_pretrained(tmpdirname, attn_implementation="eager") + model_eager = model_eager.eval().to(torch_device) + + self.assertTrue(model_eager.audio_encoder.config._attn_implementation == "eager") + self.assertTrue(model_eager.text_encoder.config._attn_implementation == "eager") + self.assertTrue(model_eager.decoder.config._attn_implementation == "eager") + self.assertTrue(model_eager.config._attn_implementation == "eager") + + for name, submodule in model_eager.named_modules(): + if "SdpaAttention" in submodule.__class__.__name__: + raise ValueError("The eager model should not have SDPA attention layers") + + has_sdpa = False + for name, submodule in model_sdpa.named_modules(): + if "SdpaAttention" in submodule.__class__.__name__: + has_sdpa = True + break + if not has_sdpa and model_sdpa.config.model_type != "falcon": + raise ValueError("The SDPA model should have SDPA attention layers") + @parameterized.expand([("float16",), ("bfloat16",), ("float32",)]) @require_torch_sdpa @slow @@ -1775,8 +1690,6 @@ def get_mean_reldiff(failcase, x, ref, atol, rtol): model_sdpa = model_class.from_pretrained(tmpdirname, torch_dtype=torch_dtype) model_sdpa = model_sdpa.eval().to(torch_device) - self.assertTrue(model_sdpa.config._attn_implementation == "sdpa") - model_eager = model_class.from_pretrained( tmpdirname, torch_dtype=torch_dtype, @@ -1784,20 +1697,6 @@ def get_mean_reldiff(failcase, x, ref, atol, rtol): ) model_eager = model_eager.eval().to(torch_device) - self.assertTrue(model_eager.config._attn_implementation == "eager") - - for name, submodule in model_eager.named_modules(): - if "SdpaAttention" in submodule.__class__.__name__: - raise ValueError("The eager model should not have SDPA attention layers") - - has_sdpa = False - for name, submodule in model_sdpa.named_modules(): - if "SdpaAttention" in submodule.__class__.__name__: - has_sdpa = 
True - break - if not has_sdpa and model_sdpa.config.model_type != "falcon": - raise ValueError("The SDPA model should have SDPA attention layers") - # We use these for loops instead of parameterized.expand just for the interest of avoiding loading/saving 8 times the model, # but it would be nicer to have an efficient way to use parameterized.expand fail_cases = [] diff --git a/tests/models/paligemma/test_modeling_paligemma.py b/tests/models/paligemma/test_modeling_paligemma.py index 644ac2cc5bd1b4..cfc2a2c29b1d70 100644 --- a/tests/models/paligemma/test_modeling_paligemma.py +++ b/tests/models/paligemma/test_modeling_paligemma.py @@ -187,6 +187,7 @@ class PaliGemmaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTes test_pruning = False test_torchscript = False test_head_masking = False + _is_composite = True def setUp(self): self.model_tester = PaliGemmaVisionText2TextModelTester(self) @@ -319,6 +320,16 @@ def test_generate_from_inputs_embeds_with_static_cache(self): def test_static_cache_matches_dynamic(self): pass + @unittest.skip("FlashAttention only support fp16 and bf16 data type") + def test_flash_attn_2_fp32_ln(self): + pass + + @unittest.skip( + "VLMs need lots of steps to prepare images/mask correctly to get pad-free inputs. Can be tested as part of LLM test" + ) + def test_flash_attention_2_padding_matches_padding_free_with_position_ids(self): + pass + @slow @require_torch diff --git a/tests/models/qwen2_audio/test_modeling_qwen2_audio.py b/tests/models/qwen2_audio/test_modeling_qwen2_audio.py index 4054055082c781..314f870f5d9096 100644 --- a/tests/models/qwen2_audio/test_modeling_qwen2_audio.py +++ b/tests/models/qwen2_audio/test_modeling_qwen2_audio.py @@ -15,6 +15,7 @@ """Testing suite for the PyTorch Qwen2Audio model.""" import gc +import tempfile import unittest from io import BytesIO from urllib.request import urlopen @@ -29,6 +30,7 @@ ) from transformers.testing_utils import ( require_torch, + require_torch_sdpa, slow, torch_device, ) @@ -152,6 +154,7 @@ class Qwen2AudioForConditionalGenerationModelTest(ModelTesterMixin, unittest.Tes all_model_classes = (Qwen2AudioForConditionalGeneration,) if is_torch_available() else () test_pruning = False test_head_masking = False + _is_composite = True def setUp(self): self.model_tester = Qwen2AudioModelTester(self) @@ -165,6 +168,53 @@ def test_sdpa_can_compile_dynamic(self): def test_sdpa_can_dispatch_on_flash(self): pass + @require_torch_sdpa + def test_sdpa_can_dispatch_composite_models(self): + # overwrite because Qwen2 is audio+text model (not vision+text) + if not self.has_attentions: + self.skipTest(reason="Model architecture does not support attentions") + + if not self._is_composite: + self.skipTest(f"{self.all_model_classes[0].__name__} does not support SDPA") + + for model_class in self.all_model_classes: + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + model_sdpa = model_class.from_pretrained(tmpdirname) + model_sdpa = model_sdpa.eval().to(torch_device) + + text_attn = "sdpa" if model.language_model._supports_sdpa else "eager" + vision_attn = "sdpa" if model.audio_tower._supports_sdpa else "eager" + + # `None` as it is the requested one which will be assigned to each sub-config + # Sub-model will dispatch to SDPA if it can (checked below that `SDPA` layers are present) + self.assertTrue(model_sdpa.config._attn_implementation == "sdpa") + 
self.assertTrue(model.language_model.config._attn_implementation == text_attn) + self.assertTrue(model.audio_tower.config._attn_implementation == vision_attn) + + model_eager = model_class.from_pretrained(tmpdirname, attn_implementation="eager") + model_eager = model_eager.eval().to(torch_device) + self.assertTrue(model_eager.config._attn_implementation == "eager") + self.assertTrue(model_eager.language_model.config._attn_implementation == "eager") + self.assertTrue(model_eager.audio_tower.config._attn_implementation == "eager") + + for name, submodule in model_eager.named_modules(): + class_name = submodule.__class__.__name__ + if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name: + raise ValueError("The eager model should not have SDPA attention layers") + + has_sdpa = False + for name, submodule in model_sdpa.named_modules(): + class_name = submodule.__class__.__name__ + if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name: + has_sdpa = True + break + if not has_sdpa and model_sdpa.config.model_type != "falcon": + raise ValueError("The SDPA model should have SDPA attention layers") + @require_torch class Qwen2AudioForConditionalGenerationIntegrationTest(unittest.TestCase): diff --git a/tests/models/siglip/test_modeling_siglip.py b/tests/models/siglip/test_modeling_siglip.py index 9d1e3109b313c3..2fe06b1511a471 100644 --- a/tests/models/siglip/test_modeling_siglip.py +++ b/tests/models/siglip/test_modeling_siglip.py @@ -71,6 +71,51 @@ class SiglipModelTesterMixin(ModelTesterMixin): + def test_sdpa_can_dispatch_composite_models(self): + for model_class in self.all_model_classes: + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + + # Load the model with SDPA + model_sdpa = model_class.from_pretrained(tmpdirname) + model_sdpa = model_sdpa.eval().to(torch_device) + + # Load model with eager attention + model_eager = model_class.from_pretrained( + tmpdirname, + attn_implementation="eager", + ) + model_eager = model_eager.eval().to(torch_device) + + # SigLip has one shared cls attr for all models, so we assign both submodels heer + vision_attn = text_attn = "sdpa" if model._supports_sdpa else "eager" + + if hasattr(model_sdpa, "vision_model") and hasattr(model_sdpa, "text_model"): + self.assertTrue(model_sdpa.vision_model.config._attn_implementation == vision_attn) + self.assertTrue(model_sdpa.text_model.config._attn_implementation == text_attn) + self.assertTrue(model_eager.vision_model.config._attn_implementation == "eager") + self.assertTrue(model_eager.text_model.config._attn_implementation == "eager") + + self.assertTrue(model_sdpa.config._attn_implementation == "sdpa") + self.assertTrue(model_eager.config._attn_implementation == "eager") + + for name, submodule in model_eager.named_modules(): + class_name = submodule.__class__.__name__ + if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name: + raise ValueError("The eager model should not have SDPA attention layers") + + has_sdpa = False + for name, submodule in model_sdpa.named_modules(): + class_name = submodule.__class__.__name__ + if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name: + has_sdpa = True + break + if not has_sdpa and model_sdpa.config.model_type != "falcon": + raise ValueError("The SDPA model should have SDPA attention layers") + def test_eager_matches_sdpa_inference( self, torch_dtype: str, @@ -132,23 
+177,6 @@ def get_mean_reldiff(msg, current_case, x, ref, atol, rtol): ) model_eager = model_eager.eval().to(torch_device) - self.assertTrue(model_sdpa.config._attn_implementation == "sdpa") - self.assertTrue(model_eager.config._attn_implementation == "eager") - - for name, submodule in model_eager.named_modules(): - class_name = submodule.__class__.__name__ - if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name: - raise ValueError("The eager model should not have SDPA attention layers") - - has_sdpa = False - for name, submodule in model_sdpa.named_modules(): - class_name = submodule.__class__.__name__ - if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name: - has_sdpa = True - break - if not has_sdpa and model_sdpa.config.model_type != "falcon": - raise ValueError("The SDPA model should have SDPA attention layers") - # We use these for loops instead of parameterized.expand just for the interest of avoiding loading/saving the model each time, # but it would be nicer to have an efficient way to use parameterized.expand cases = [ @@ -400,6 +428,10 @@ def test_eager_matches_sdpa_inference(self, torch_dtype: str): use_attention_mask_options=(False,), ) + @require_torch_sdpa + def test_sdpa_can_dispatch_composite_models(self): + super().test_sdpa_can_dispatch_composite_models() + class SiglipTextModelTester: def __init__( @@ -562,6 +594,10 @@ def test_eager_matches_sdpa_inference(self, torch_dtype: str): use_attention_mask_options=(False, True), ) + @require_torch_sdpa + def test_sdpa_can_dispatch_composite_models(self): + super().test_sdpa_can_dispatch_composite_models() + class SiglipModelTester: def __init__(self, parent, text_kwargs=None, vision_kwargs=None, is_training=True): @@ -629,6 +665,7 @@ class SiglipModelTest(SiglipModelTesterMixin, PipelineTesterMixin, unittest.Test test_cpu_offload = False test_disk_offload_safetensors = False test_disk_offload_bin = False + _is_composite = True # Copied from tests.models.clip.test_modeling_clip.CLIPModelTest.setUp with CLIP->Siglip def setUp(self): @@ -851,6 +888,10 @@ def test_eager_matches_sdpa_inference(self, torch_dtype: str): use_attention_mask_options=(False, True), ) + @require_torch_sdpa + def test_sdpa_can_dispatch_composite_models(self): + super().test_sdpa_can_dispatch_composite_models() + class SiglipForImageClassificationModelTester(SiglipModelTester): def __init__(self, parent): @@ -888,6 +929,7 @@ class SiglipForImageClassificationModelTest(SiglipModelTesterMixin, PipelineTest test_cpu_offload = False test_disk_offload_safetensors = False test_disk_offload_bin = False + _is_composite = True def setUp(self): self.model_tester = SiglipForImageClassificationModelTester(self) @@ -925,6 +967,10 @@ def test_eager_matches_sdpa_inference(self, torch_dtype: str): torch_dtype=torch_dtype, logit_keys=("logits",), use_attention_mask_options=(False,) ) + @require_torch_sdpa + def test_sdpa_can_dispatch_composite_models(self): + super().test_sdpa_can_dispatch_composite_models() + # We will verify our results on an image of cute cats def prepare_img(): diff --git a/tests/models/speech_encoder_decoder/test_modeling_speech_encoder_decoder.py b/tests/models/speech_encoder_decoder/test_modeling_speech_encoder_decoder.py index b193cacfb40042..6e0b7fa9782fbc 100644 --- a/tests/models/speech_encoder_decoder/test_modeling_speech_encoder_decoder.py +++ b/tests/models/speech_encoder_decoder/test_modeling_speech_encoder_decoder.py @@ -18,7 +18,13 @@ import unittest from transformers import is_torch_available -from 
transformers.testing_utils import require_deterministic_for_xpu, require_torch, slow, torch_device +from transformers.testing_utils import ( + require_deterministic_for_xpu, + require_torch, + require_torch_sdpa, + slow, + torch_device, +) from ...test_modeling_common import floats_tensor, ids_tensor, random_attention_mask from ..bert.test_modeling_bert import BertModelTester @@ -441,6 +447,66 @@ def test_real_model_save_load_from_pretrained(self): max_diff = np.amax(np.abs(out_1 - out_2)) self.assertLessEqual(max_diff, 1e-5) + @require_torch_sdpa + def test_sdpa_can_dispatch_composite_models(self): + inputs_dict = self.prepare_config_and_inputs() + encoder_config, decoder_config = inputs_dict["config"], inputs_dict["decoder_config"] + config = SpeechEncoderDecoderConfig.from_encoder_decoder_configs( + encoder_config=encoder_config, decoder_config=decoder_config + ) + model = SpeechEncoderDecoderModel(config=config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + model_sdpa = SpeechEncoderDecoderModel.from_pretrained(tmpdirname) + model_sdpa = model_sdpa.eval().to(torch_device) + + # see https://github.com/huggingface/transformers/pull/32238 + # Sub-model will dispatch to SDPA if it can (checked below that `SDPA` layers are present) + encoder_attn = "sdpa" if model.encoder._supports_sdpa else "eager" + decoder_attn = "sdpa" if model.decoder._supports_sdpa else "eager" + self.assertTrue(model_sdpa.config._attn_implementation == "sdpa") + self.assertTrue(model_sdpa.encoder.config._attn_implementation == encoder_attn) + self.assertTrue(model_sdpa.decoder.config._attn_implementation == decoder_attn) + + # Also test that nothing break if we request SDPA explicitly, when both sub-parts support it. + # If the model supports sdpa (i.e. 
all of sub-models supports it) we'll dispatch safely + # Otherwise we should raise error that SDPA is not supported, as some of the sub-models doesn't support + if encoder_attn == "sdpa" and decoder_attn == "sdpa": + model_sdpa_explicit = SpeechEncoderDecoderModel.from_pretrained(tmpdirname, attn_implementation="sdpa") + model_sdpa_explicit = model_sdpa_explicit.eval().to(torch_device) + + self.assertTrue(model_sdpa_explicit.config._attn_implementation == "sdpa") + else: + with self.assertRaises(ValueError): + model_sdpa_explicit = SpeechEncoderDecoderModel.from_pretrained( + tmpdirname, attn_implementation="sdpa" + ) + + model_eager = SpeechEncoderDecoderModel.from_pretrained( + tmpdirname, + attn_implementation="eager", + ) + model_eager = model_eager.eval().to(torch_device) + + self.assertTrue(model_eager.config._attn_implementation == "eager") + self.assertTrue(model_eager.encoder.config._attn_implementation == "eager") + self.assertTrue(model_eager.decoder.config._attn_implementation == "eager") + + for name, submodule in model_eager.named_modules(): + class_name = submodule.__class__.__name__ + if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name: + raise ValueError("The eager model should not have SDPA attention layers") + + has_sdpa = False + for name, submodule in model_sdpa.named_modules(): + class_name = submodule.__class__.__name__ + if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name: + has_sdpa = True + break + if not has_sdpa: + raise ValueError("The SDPA model should have SDPA attention layers") + @require_torch class Wav2Vec2BertModelTest(EncoderDecoderMixin, unittest.TestCase): diff --git a/tests/models/video_llava/test_modeling_video_llava.py b/tests/models/video_llava/test_modeling_video_llava.py index 492dcb9bae1f92..1bd01843981deb 100644 --- a/tests/models/video_llava/test_modeling_video_llava.py +++ b/tests/models/video_llava/test_modeling_video_llava.py @@ -206,6 +206,7 @@ class VideoLlavaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTe test_pruning = False test_resize_embeddings = True test_head_masking = False + _is_composite = True def setUp(self): self.model_tester = VideoLlavaVisionText2TextModelTester(self) @@ -237,6 +238,16 @@ def test_sdpa_can_compile_dynamic(self): def test_sdpa_can_dispatch_on_flash(self): pass + @unittest.skip("FlashAttention only support fp16 and bf16 data type") + def test_flash_attn_2_fp32_ln(self): + pass + + @unittest.skip( + "VLMs need lots of steps to prepare images/mask correctly to get pad-free inputs. 
Can be tested as part of LLM test" + ) + def test_flash_attention_2_padding_matches_padding_free_with_position_ids(self): + pass + @unittest.skip( reason="After #33533, this still passes, but many subsequential tests fail with `device-side assert triggered`" ) diff --git a/tests/models/vipllava/test_modeling_vipllava.py b/tests/models/vipllava/test_modeling_vipllava.py index 862e144ecdd7d8..2c241c23f26158 100644 --- a/tests/models/vipllava/test_modeling_vipllava.py +++ b/tests/models/vipllava/test_modeling_vipllava.py @@ -168,6 +168,7 @@ class VipLlavaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTest test_pruning = False test_resize_embeddings = True test_head_masking = False + _is_composite = True def setUp(self): self.model_tester = VipLlavaVisionText2TextModelTester(self) @@ -242,6 +243,16 @@ def test_sdpa_can_compile_dynamic(self): def test_sdpa_can_dispatch_on_flash(self): pass + @unittest.skip("FlashAttention only support fp16 and bf16 data type") + def test_flash_attn_2_fp32_ln(self): + pass + + @unittest.skip( + "VLMs need lots of steps to prepare images/mask correctly to get pad-free inputs. Can be tested as part of LLM test" + ) + def test_flash_attention_2_padding_matches_padding_free_with_position_ids(self): + pass + @require_torch class VipLlavaForConditionalGenerationIntegrationTest(unittest.TestCase): diff --git a/tests/models/vision_encoder_decoder/test_modeling_vision_encoder_decoder.py b/tests/models/vision_encoder_decoder/test_modeling_vision_encoder_decoder.py index e5bc88d5bfb272..7def8a9ac96507 100644 --- a/tests/models/vision_encoder_decoder/test_modeling_vision_encoder_decoder.py +++ b/tests/models/vision_encoder_decoder/test_modeling_vision_encoder_decoder.py @@ -27,17 +27,24 @@ require_nltk, require_sentencepiece, require_torch, + require_torch_sdpa, require_vision, slow, to_2tuple, torch_device, ) -from transformers.utils import cached_property, is_torch_available, is_vision_available +from transformers.utils import ( + cached_property, + is_torch_available, + is_vision_available, +) from ...test_modeling_common import floats_tensor, ids_tensor, random_attention_mask from ..bart.test_modeling_bart import BartModelTester from ..bert.test_modeling_bert import BertModelTester from ..deit.test_modeling_deit import DeiTModelTester +from ..donut.test_modeling_donut_swin import DonutSwinModelTester +from ..gpt2.test_modeling_gpt2 import GPT2ModelTester from ..layoutlmv3.test_modeling_layoutlmv3 import LayoutLMv3ModelTester from ..swin.test_modeling_swin import SwinModelTester from ..trocr.test_modeling_trocr import TrOCRStandaloneDecoderModelTester @@ -53,6 +60,8 @@ BartForCausalLM, BertLMHeadModel, DeiTModel, + DonutSwinModel, + GPT2LMHeadModel, LayoutLMv3Model, SwinModel, TrOCRForCausalLM, @@ -72,6 +81,8 @@ @require_torch class EncoderDecoderMixin: + supports_sdpa = False + def get_encoder_decoder_model(self, config, decoder_config): pass @@ -374,6 +385,69 @@ def test_real_model_save_load_from_pretrained(self): max_diff = np.amax(np.abs(out_1 - out_2)) self.assertLessEqual(max_diff, 1e-5) + @require_torch_sdpa + def test_sdpa_can_dispatch_composite_models(self): + if not self.supports_sdpa: + self.skipTest("SDPA is not supported") + + inputs_dict = self.prepare_config_and_inputs() + encoder_config, decoder_config = inputs_dict["config"], inputs_dict["decoder_config"] + config = VisionEncoderDecoderConfig.from_encoder_decoder_configs( + encoder_config=encoder_config, decoder_config=decoder_config + ) + model = VisionEncoderDecoderModel(config=config) + 
+ with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + model_sdpa = VisionEncoderDecoderModel.from_pretrained(tmpdirname) + model_sdpa = model_sdpa.eval().to(torch_device) + + # see https://github.com/huggingface/transformers/pull/32238 + # Sub-model will dispatch to SDPA if it can (checked below that `SDPA` layers are present) + encoder_attn = "sdpa" if model.encoder._supports_sdpa else "eager" + decoder_attn = "sdpa" if model.decoder._supports_sdpa else "eager" + self.assertTrue(model_sdpa.config._attn_implementation == "sdpa") + self.assertTrue(model_sdpa.encoder.config._attn_implementation == encoder_attn) + self.assertTrue(model_sdpa.decoder.config._attn_implementation == decoder_attn) + + # Also test that nothing break if we request SDPA explicitly, when both sub-parts support it. + # If the model supports sdpa (i.e. all of sub-models supports it) we'll dispatch safely + # Otherwise we should raise error that SDPA is not supported, as some of the sub-models doesn't support + if encoder_attn == "sdpa" and decoder_attn == "sdpa": + model_sdpa_explicit = VisionEncoderDecoderModel.from_pretrained(tmpdirname, attn_implementation="sdpa") + model_sdpa_explicit = model_sdpa_explicit.eval().to(torch_device) + + self.assertTrue(model_sdpa_explicit.config._attn_implementation == "sdpa") + else: + with self.assertRaises(ValueError): + model_sdpa_explicit = VisionEncoderDecoderModel.from_pretrained( + tmpdirname, attn_implementation="sdpa" + ) + + model_eager = VisionEncoderDecoderModel.from_pretrained( + tmpdirname, + attn_implementation="eager", + ) + model_eager = model_eager.eval().to(torch_device) + + self.assertTrue(model_eager.config._attn_implementation == "eager") + self.assertTrue(model_eager.encoder.config._attn_implementation == "eager") + self.assertTrue(model_eager.decoder.config._attn_implementation == "eager") + + for name, submodule in model_eager.named_modules(): + class_name = submodule.__class__.__name__ + if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name: + raise ValueError("The eager model should not have SDPA attention layers") + + has_sdpa = False + for name, submodule in model_sdpa.named_modules(): + class_name = submodule.__class__.__name__ + if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name: + has_sdpa = True + break + if not has_sdpa: + raise ValueError("The SDPA model should have SDPA attention layers") + @require_torch class DeiT2RobertaModelTest(EncoderDecoderMixin, unittest.TestCase): @@ -497,6 +571,8 @@ def prepare_config_and_inputs(self): @require_torch class ViT2BertModelTest(EncoderDecoderMixin, unittest.TestCase): + supports_sdpa = True # one submodel support SDPA + def get_pretrained_model_and_inputs(self): model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained( "hf-internal-testing/tiny-random-vit", "hf-internal-testing/tiny-bert" @@ -649,6 +725,8 @@ def test_real_model_save_load_from_pretrained(self): @require_torch class ViT2TrOCR(EncoderDecoderMixin, unittest.TestCase): + supports_sdpa = True # one submodel support SDPA + def get_encoder_decoder_model(self, config, decoder_config): encoder_model = ViTModel(config).eval() decoder_model = TrOCRForCausalLM(decoder_config).eval() @@ -804,6 +882,240 @@ def test_real_model_save_load_from_pretrained(self): pass +@require_torch +class VIT2GPT2Test(EncoderDecoderMixin, unittest.TestCase): + supports_sdpa = True # both submodels support SDPA + + def get_encoder_decoder_model(self, config, decoder_config): + encoder_model = 
ViTModel(config).eval() + decoder_model = GPT2LMHeadModel(decoder_config).eval() + return encoder_model, decoder_model + + def prepare_config_and_inputs(self): + model_tester_encoder = ViTModelTester(self, batch_size=13) + model_tester_decoder = GPT2ModelTester(self, batch_size=13, hidden_size=32, max_position_embeddings=512) + encoder_config_and_inputs = model_tester_encoder.prepare_config_and_inputs() + decoder_config_and_inputs = model_tester_decoder.prepare_config_and_inputs() + config, pixel_values, labels = encoder_config_and_inputs + ( + decoder_config, + decoder_input_ids, + decoder_attention_mask, + decoder_head_mask, + decoder_token_type_ids, + mc_token_ids, + sequence_labels, + token_labels, + choice_labels, + ) = decoder_config_and_inputs + + # make sure that cross attention layers are added + decoder_config.add_cross_attention = True + # disable cache for now + decoder_config.use_cache = False + return { + "config": config, + "pixel_values": pixel_values, + "decoder_config": decoder_config, + "decoder_input_ids": decoder_input_ids, + "decoder_attention_mask": decoder_attention_mask, + "decoder_head_mask": decoder_head_mask, + "labels": decoder_input_ids, + } + + def check_encoder_decoder_model_output_attentions( + self, + config, + decoder_config, + decoder_input_ids, + decoder_attention_mask, + pixel_values, + labels=None, + **kwargs, + ): + # make the decoder inputs a different shape from the encoder inputs to harden the test + decoder_input_ids = decoder_input_ids[:, :-1] + decoder_attention_mask = decoder_attention_mask[:, :-1] + encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) + enc_dec_model = VisionEncoderDecoderModel(encoder=encoder_model, decoder=decoder_model) + enc_dec_model.to(torch_device) + outputs_encoder_decoder = enc_dec_model( + pixel_values=pixel_values, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + output_attentions=True, + **kwargs, + ) + + encoder_attentions = outputs_encoder_decoder["encoder_attentions"] + self.assertEqual(len(encoder_attentions), config.num_hidden_layers) + + seq_len = (encoder_model.config.image_size // encoder_model.config.patch_size) ** 2 + 1 + + decoder_attentions = outputs_encoder_decoder["decoder_attentions"] + num_decoder_layers = ( + decoder_config.num_decoder_layers + if hasattr(decoder_config, "num_decoder_layers") + else decoder_config.num_hidden_layers + ) + self.assertEqual(len(decoder_attentions), num_decoder_layers) + + self.assertEqual( + decoder_attentions[0].shape[-3:], + (decoder_config.num_attention_heads, decoder_input_ids.shape[-1], decoder_input_ids.shape[-1]), + ) + + cross_attentions = outputs_encoder_decoder["cross_attentions"] + self.assertEqual(len(cross_attentions), num_decoder_layers) + + cross_attention_input_seq_len = decoder_input_ids.shape[-1] + self.assertEqual( + cross_attentions[0].shape[-3:], + (decoder_config.num_attention_heads, cross_attention_input_seq_len, seq_len), # 4 6 16 + ) + + def check_encoder_decoder_model_generate(self, config, decoder_config, pixel_values=None, **kwargs): + encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) + enc_dec_model = VisionEncoderDecoderModel(encoder=encoder_model, decoder=decoder_model) + + # Generate until max length + if hasattr(enc_dec_model.config, "eos_token_id"): + enc_dec_model.config.eos_token_id = None + if hasattr(enc_dec_model.config, "decoder") and hasattr(enc_dec_model.config.decoder, "eos_token_id"): + 
enc_dec_model.config.decoder.eos_token_id = None + if hasattr(enc_dec_model.generation_config, "eos_token_id"): + enc_dec_model.generation_config.eos_token_id = None + enc_dec_model.to(torch_device) + + generated_output = enc_dec_model.generate( + pixel_values=pixel_values, + decoder_start_token_id=enc_dec_model.config.decoder.bos_token_id, + **kwargs, + ) + self.assertEqual(generated_output.shape, (pixel_values.shape[0],) + (decoder_config.max_length,)) + + @unittest.skip(reason="VIT2GPT2 also has an integration test for testinf save-load") + def test_real_model_save_load_from_pretrained(self): + pass + + +@require_torch +class Donut2GPT2Test(EncoderDecoderMixin, unittest.TestCase): + supports_sdpa = True # one submodel (GPT2) support SDPA + + def get_encoder_decoder_model(self, config, decoder_config): + encoder_model = DonutSwinModel(config).eval() + decoder_model = GPT2LMHeadModel(decoder_config).eval() + return encoder_model, decoder_model + + def prepare_config_and_inputs(self): + model_tester_encoder = DonutSwinModelTester(self, batch_size=13) + model_tester_decoder = GPT2ModelTester(self, batch_size=13, hidden_size=32, max_position_embeddings=512) + encoder_config_and_inputs = model_tester_encoder.prepare_config_and_inputs() + decoder_config_and_inputs = model_tester_decoder.prepare_config_and_inputs() + config, pixel_values, labels = encoder_config_and_inputs + ( + decoder_config, + decoder_input_ids, + decoder_attention_mask, + decoder_head_mask, + decoder_token_type_ids, + mc_token_ids, + sequence_labels, + token_labels, + choice_labels, + ) = decoder_config_and_inputs + + # make sure that cross attention layers are added + decoder_config.add_cross_attention = True + # disable cache for now + decoder_config.use_cache = False + return { + "config": config, + "pixel_values": pixel_values, + "decoder_config": decoder_config, + "decoder_input_ids": decoder_input_ids, + "decoder_attention_mask": decoder_attention_mask, + "decoder_head_mask": decoder_head_mask, + "labels": decoder_input_ids, + } + + def check_encoder_decoder_model_output_attentions( + self, + config, + decoder_config, + decoder_input_ids, + decoder_attention_mask, + pixel_values, + labels=None, + **kwargs, + ): + # make the decoder inputs a different shape from the encoder inputs to harden the test + decoder_input_ids = decoder_input_ids[:, :-1] + decoder_attention_mask = decoder_attention_mask[:, :-1] + encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) + enc_dec_model = VisionEncoderDecoderModel(encoder=encoder_model, decoder=decoder_model) + enc_dec_model.to(torch_device) + outputs_encoder_decoder = enc_dec_model( + pixel_values=pixel_values, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + output_attentions=True, + **kwargs, + ) + + encoder_attentions = outputs_encoder_decoder["encoder_attentions"] + self.assertEqual(len(encoder_attentions), config.num_hidden_layers) + + seq_len = encoder_model.config.image_size // encoder_model.config.patch_size + + decoder_attentions = outputs_encoder_decoder["decoder_attentions"] + num_decoder_layers = ( + decoder_config.num_decoder_layers + if hasattr(decoder_config, "num_decoder_layers") + else decoder_config.num_hidden_layers + ) + self.assertEqual(len(decoder_attentions), num_decoder_layers) + + self.assertEqual( + decoder_attentions[0].shape[-3:], + (decoder_config.num_attention_heads, decoder_input_ids.shape[-1], decoder_input_ids.shape[-1]), + ) + + cross_attentions = 
outputs_encoder_decoder["cross_attentions"] + self.assertEqual(len(cross_attentions), num_decoder_layers) + + cross_attention_input_seq_len = decoder_input_ids.shape[-1] + self.assertEqual( + cross_attentions[0].shape[-3:], + (decoder_config.num_attention_heads, cross_attention_input_seq_len, seq_len), # 4 6 16 + ) + + def check_encoder_decoder_model_generate(self, config, decoder_config, pixel_values=None, **kwargs): + encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) + enc_dec_model = VisionEncoderDecoderModel(encoder=encoder_model, decoder=decoder_model) + + # Generate until max length + if hasattr(enc_dec_model.config, "eos_token_id"): + enc_dec_model.config.eos_token_id = None + if hasattr(enc_dec_model.config, "decoder") and hasattr(enc_dec_model.config.decoder, "eos_token_id"): + enc_dec_model.config.decoder.eos_token_id = None + if hasattr(enc_dec_model.generation_config, "eos_token_id"): + enc_dec_model.generation_config.eos_token_id = None + enc_dec_model.to(torch_device) + + generated_output = enc_dec_model.generate( + pixel_values=pixel_values, + decoder_start_token_id=enc_dec_model.config.decoder.bos_token_id, + **kwargs, + ) + self.assertEqual(generated_output.shape, (pixel_values.shape[0],) + (decoder_config.max_length,)) + + @unittest.skip(reason="Donut has an Integration test for that") + def test_real_model_save_load_from_pretrained(self): + pass + + @require_vision @require_torch class TrOCRModelIntegrationTest(unittest.TestCase): diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index 104923957568aa..dec1482f562a33 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -207,6 +207,7 @@ class ModelTesterMixin: test_model_parallel = False is_encoder_decoder = False has_attentions = True + _is_composite = False model_split_percents = [0.5, 0.7, 0.9] def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): @@ -3006,6 +3007,7 @@ def test_inputs_embeds_matches_input_ids_with_generate(self): *get_values(MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES), ]: continue + model = model_class(config) model.to(torch_device) model.eval() @@ -3950,6 +3952,147 @@ def test_flash_attn_2_generate_padding_right(self): self.assertTrue(torch.allclose(out, out_fa)) + def test_attn_implementation_composite_models(self): + """ + Tests if composite models can receive a dict object as attn_implementation, where each key should be + one of the sub-configs from the model's config. 
+        """
+        if not self.has_attentions:
+            self.skipTest(reason="Model architecture does not support attentions")
+
+        for model_class in self.all_model_classes:
+            if not self._is_composite:
+                self.skipTest("Model is not a composite model.")
+
+            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+            sub_configs = {
+                key: getattr(config, key) for key in config if isinstance(getattr(config, key), PretrainedConfig)
+            }
+
+            # set eager as it will be the one supported in all models
+            # we just need to test if passing 'attn_implementation' as a dict fails or not
+            attn_implementation_per_subconfig = {}
+            for key, sub_config in sub_configs.items():
+                attn_implementation_per_subconfig[key] = "eager"
+
+            config._attn_implementation = attn_implementation_per_subconfig
+            model = model_class(config)
+            for key in model.config:
+                if isinstance(getattr(model.config, key), PretrainedConfig):
+                    sub_config = getattr(model.config, key)
+                    self.assertTrue(sub_config._attn_implementation == "eager")
+
+            for name, submodule in model.named_modules():
+                class_name = submodule.__class__.__name__
+                if (
+                    "SdpaAttention" in class_name
+                    or "SdpaSelfAttention" in class_name
+                    or "FlashAttention" in class_name
+                ):
+                    raise ValueError("The eager model should not have SDPA/FA2 attention layers")
+
+    @require_torch_sdpa
+    def test_sdpa_can_dispatch_non_composite_models(self):
+        """
+        Tests whether non-composite models dispatch correctly to SDPA/eager when requested at load time.
+        The check only looks at layer names, as SDPA layers are usually called "SdpaAttention".
+        """
+        if not self.has_attentions:
+            self.skipTest(reason="Model architecture does not support attentions")
+
+        if not self.all_model_classes[0]._supports_sdpa or self._is_composite:
+            self.skipTest(f"{self.all_model_classes[0].__name__} does not support SDPA")
+
+        for model_class in self.all_model_classes:
+            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+            model = model_class(config)
+
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                model.save_pretrained(tmpdirname)
+                model_sdpa = model_class.from_pretrained(tmpdirname)
+                model_sdpa = model_sdpa.eval().to(torch_device)
+
+                self.assertTrue(model_sdpa.config._attn_implementation == "sdpa")
+
+                model_eager = model_class.from_pretrained(tmpdirname, attn_implementation="eager")
+                model_eager = model_eager.eval().to(torch_device)
+                self.assertTrue(model_eager.config._attn_implementation == "eager")
+
+                for name, submodule in model_eager.named_modules():
+                    class_name = submodule.__class__.__name__
+                    if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name:
+                        raise ValueError("The eager model should not have SDPA attention layers")
+
+                has_sdpa = False
+                for name, submodule in model_sdpa.named_modules():
+                    class_name = submodule.__class__.__name__
+                    if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name:
+                        has_sdpa = True
+                        break
+                if not has_sdpa and model_sdpa.config.model_type != "falcon":
+                    raise ValueError("The SDPA model should have SDPA attention layers")
+
+    @require_torch_sdpa
+    def test_sdpa_can_dispatch_composite_models(self):
+        """
+        Tests whether composite models dispatch correctly to SDPA/eager when requested at load time.
+        The check only looks at layer names, as SDPA layers are usually called "SdpaAttention".
+        In contrast to the above test, this one checks that `config._attn_implementation` is a dict after the
+        model is loaded, because the requested attn implementation is manually replicated on each sub-config
+        when loading. See https://github.com/huggingface/transformers/pull/32238 for more info.
+
+        The test tries to cover the most general cases of composite models, i.e. VLMs with vision and text
+        configs. Any model that has a different set of sub-configs has to override this test.
+        """
+        if not self.has_attentions:
+            self.skipTest(reason="Model architecture does not support attentions")
+
+        if not self._is_composite:
+            self.skipTest(f"{self.all_model_classes[0].__name__} does not support SDPA")
+
+        for model_class in self.all_model_classes:
+            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+            model = model_class(config)
+
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                model.save_pretrained(tmpdirname)
+                model_sdpa = model_class.from_pretrained(tmpdirname)
+                model_sdpa = model_sdpa.eval().to(torch_device)
+
+                vision_model_names = {"visual", "image_tower", "vision_tower", "vision_model"}
+                language_model_names = {"language_model", "model", "text_model"}
+                vision_model_name = [name for name in vision_model_names if hasattr(model_sdpa, name)][0]
+                language_model_name = [name for name in language_model_names if hasattr(model_sdpa, name)][0]
+
+                vision_model_sdpa = getattr(model, vision_model_name)
+                language_model_sdpa = getattr(model, language_model_name)
+                text_attn = "sdpa" if language_model_sdpa._supports_sdpa else "eager"
+                vision_attn = "sdpa" if vision_model_sdpa._supports_sdpa else "eager"
+
+                # `None` as it is the requested one which will be assigned to each sub-config
+                # Sub-model will dispatch to SDPA if it can (checked below that `SDPA` layers are present)
+                self.assertTrue(language_model_sdpa.config._attn_implementation == text_attn)
+                self.assertTrue(vision_model_sdpa.config._attn_implementation == vision_attn)
+
+                model_eager = model_class.from_pretrained(tmpdirname, attn_implementation="eager")
+                model_eager = model_eager.eval().to(torch_device)
+                self.assertTrue(getattr(model_eager, language_model_name).config._attn_implementation == "eager")
+                self.assertTrue(getattr(model_eager, vision_model_name).config._attn_implementation == "eager")
+
+                for name, submodule in model_eager.named_modules():
+                    class_name = submodule.__class__.__name__
+                    if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name:
+                        raise ValueError("The eager model should not have SDPA attention layers")
+
+                has_sdpa = False
+                for name, submodule in model_sdpa.named_modules():
+                    class_name = submodule.__class__.__name__
+                    if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name:
+                        has_sdpa = True
+                        break
+                if not has_sdpa and any(module_attn == "sdpa" for module_attn in [text_attn, vision_attn]):
+                    raise ValueError("The SDPA model should have SDPA attention layers")
+
     @parameterized.expand([("float16",), ("bfloat16",), ("float32",)])
     @require_torch_sdpa
     @slow
@@ -4012,7 +4155,6 @@ def get_mean_reldiff(failcase, x, ref, atol, rtol):
         # This means that the class needs to be instantiated much later, after `use_mask` is set, which means a significant refactor of the code.
         # However masking there is not done at any layers that matters (i.e self-attention), therefore we can safely deactivate it.
deactivate_mask = "use_mask_token" in inspect.signature(model_class).parameters - is_encoder_decoder = model.config.is_encoder_decoder with tempfile.TemporaryDirectory() as tmpdirname: @@ -4020,8 +4162,6 @@ def get_mean_reldiff(failcase, x, ref, atol, rtol): model_sdpa = model_class.from_pretrained(tmpdirname, torch_dtype=torch_dtype) model_sdpa = model_sdpa.eval().to(torch_device) - self.assertTrue(model_sdpa.config._attn_implementation == "sdpa") - model_eager = model_class.from_pretrained( tmpdirname, torch_dtype=torch_dtype, @@ -4029,22 +4169,6 @@ def get_mean_reldiff(failcase, x, ref, atol, rtol): ) model_eager = model_eager.eval().to(torch_device) - self.assertTrue(model_eager.config._attn_implementation == "eager") - - for name, submodule in model_eager.named_modules(): - class_name = submodule.__class__.__name__ - if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name: - raise ValueError("The eager model should not have SDPA attention layers") - - has_sdpa = False - for name, submodule in model_sdpa.named_modules(): - class_name = submodule.__class__.__name__ - if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name: - has_sdpa = True - break - if not has_sdpa and model_sdpa.config.model_type != "falcon": - raise ValueError("The SDPA model should have SDPA attention layers") - # We use these for loops instead of parameterized.expand just for the interest of avoiding loading/saving 16 times the model, # but it would be nicer to have an efficient way to use parameterized.expand fail_cases = [] @@ -4279,7 +4403,7 @@ def test_sdpa_can_dispatch_on_flash(self): self.skipTest( "PaliGemma-like models currently (transformers==4.41.0) requires an attention_mask input" ) - if config.model_type in ["idefics"]: + if config.model_type in ["idefics", "idefics2", "idefics3"]: self.skipTest(reason="Idefics currently (transformers==4.39.1) requires an image_attention_mask input") model = model_class(config) @@ -4382,8 +4506,6 @@ def test_eager_matches_sdpa_generate(self): low_cpu_mem_usage=True, ).to(torch_device) - self.assertTrue(model_sdpa.config._attn_implementation == "sdpa") - model_eager = model_class.from_pretrained( tmpdirname, torch_dtype=torch.float16, @@ -4391,22 +4513,6 @@ def test_eager_matches_sdpa_generate(self): attn_implementation="eager", ).to(torch_device) - self.assertTrue(model_eager.config._attn_implementation == "eager") - - for name, submodule in model_eager.named_modules(): - class_name = submodule.__class__.__name__ - if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name: - raise ValueError("The eager model should not have SDPA attention layers") - - has_sdpa = False - for name, submodule in model_sdpa.named_modules(): - class_name = submodule.__class__.__name__ - if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name: - has_sdpa = True - break - if not has_sdpa: - raise ValueError("The SDPA model should have SDPA attention layers") - # Just test that a large cache works as expected res_eager = model_eager.generate( dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=max_new_tokens, do_sample=False @@ -4429,6 +4535,8 @@ def test_sdpa_matches_eager_sliding_window(self): self.skipTest(f"No generative model classes for {self.__class__.__name__}") for model_class in self.all_generative_model_classes: + if model_class._supports_sdpa: + self.skipTest(reason="Model architecture does not support attentions") config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() if config.model_type 
not in WINDOW_ATTENTION_MODELS: @@ -4531,6 +4639,62 @@ def test_flash_attn_2_generate_use_cache(self): use_cache=True, ) + @require_flash_attn + @require_torch_gpu + @mark.flash_attn_test + def test_flash_attn_2_can_dispatch_composite_models(self): + """ + Tests if composite models can dispatch on FA2 if the sub-models support FA2. + The tests is needed as we handle differently composite models and we cannot check them + with above tests. If any of the sub-models does not support FA2, we'll raise an error when dispatching + that particular sub-model. Otherwise we dispatch safely in all sub-models, where "sub-models" are specific + backbone models (LM/vision/audio/etc) + """ + if not self.has_attentions: + self.skipTest(reason="Model architecture does not support attentions") + + if not is_torch_fp16_available_on_device(torch_device): + self.skipTest(f"float16 not supported on {torch_device} (on the specific device currently used)") + + torch_dtype = torch.float16 + for model_class in self.all_model_classes: + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + model = model_class(config) + if not self._is_composite: + self.skipTest("This model is not a composte model!") + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + model = model_class.from_pretrained(tmpdirname, torch_dtype=torch_dtype) + + supports_fa2_all_modules = all( + module._supports_flash_attn_2 + for name, module in model.named_modules() + if isinstance(module, PreTrainedModel) and name != "" + ) + if not supports_fa2_all_modules: + with self.assertRaises(ValueError): + model_fa2 = model_class.from_pretrained( + tmpdirname, torch_dtype=torch_dtype, attn_implementation="flash_attention_2" + ) + else: + model_fa2 = model_class.from_pretrained( + tmpdirname, torch_dtype=torch_dtype, attn_implementation="flash_attention_2" + ) + for key in model_fa2.config: + if isinstance(getattr(model_fa2.config, key), PretrainedConfig): + sub_config = getattr(model_fa2.config, key) + self.assertTrue(sub_config._attn_implementation == "flash_attention_2") + + has_fa2 = False + for name, submodule in model_fa2.named_modules(): + class_name = submodule.__class__.__name__ + if "FlashAttention" in class_name: + has_fa2 = True + break + if not has_fa2: + raise ValueError("The FA2 model should have FA2 layers") + @require_flash_attn @require_torch_gpu @mark.flash_attn_test @@ -4679,7 +4843,7 @@ def test_flash_attention_2_padding_matches_padding_free_with_position_ids(self): if 0 in inputs_dict["attention_mask"][:, -1]: inputs_dict["attention_mask"] = inputs_dict["attention_mask"].flip(1) dummy_attention_mask = inputs_dict["attention_mask"] - inputs_dict["input_ids"][~dummy_attention_mask.bool()] = config.pad_token_id + inputs_dict["input_ids"][~dummy_attention_mask.bool()] = config.get_text_config().pad_token_id model = ( model_class.from_pretrained( diff --git a/tests/utils/test_configuration_utils.py b/tests/utils/test_configuration_utils.py index d2701bf35e6603..35a651d0e59873 100644 --- a/tests/utils/test_configuration_utils.py +++ b/tests/utils/test_configuration_utils.py @@ -228,6 +228,7 @@ def test_config_common_kwargs_is_complete(self): "_name_or_path", "_commit_hash", "_attn_implementation_internal", + "_attn_implementation_autoset", "transformers_version", ], ) diff --git a/utils/check_repo.py b/utils/check_repo.py index 6872dada3d9384..10be5cdcd26230 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -82,6 +82,8 @@ "SeamlessM4Tv2TextToUnitModel", 
"SeamlessM4Tv2CodeHifiGan", "SeamlessM4Tv2TextToUnitForConditionalGeneration", + "Idefics2PerceiverResampler", + "Idefics2VisionTransformer", "Idefics3VisionTransformer", ] @@ -225,7 +227,6 @@ "BeitForMaskedImageModeling", "ChineseCLIPTextModel", "ChineseCLIPVisionModel", - "CLIPTextModel", "CLIPTextModelWithProjection", "CLIPVisionModelWithProjection", "ClvpForCausalLM", @@ -327,6 +328,7 @@ "SiglipVisionModel", "SiglipTextModel", "ChameleonVQVAE", # no autoclass for VQ-VAE models + "CLIPTextModel", "MoshiForConditionalGeneration", # no auto class for speech-to-speech ] From 5077bc034fd806a8b0155e71b963596a765a7758 Mon Sep 17 00:00:00 2001 From: Raushan Turganbay Date: Tue, 22 Oct 2024 07:56:35 +0200 Subject: [PATCH 060/385] VLM: add more modularity (#34175) * update * fix tests + fix copies * fix tests once more --- .../models/llava/modeling_llava.py | 14 + .../models/llava_next/modeling_llava_next.py | 85 ++++-- .../modeling_llava_next_video.py | 129 ++++++--- .../modular_llava_next_video.py | 83 ++++-- .../modeling_llava_onevision.py | 142 +++++++--- .../models/paligemma/modeling_paligemma.py | 21 +- .../video_llava/modeling_video_llava.py | 262 +++++++++--------- .../models/vipllava/modeling_vipllava.py | 11 + 8 files changed, 491 insertions(+), 256 deletions(-) diff --git a/src/transformers/models/llava/modeling_llava.py b/src/transformers/models/llava/modeling_llava.py index c17d35296a9c77..50b3d4c6a89533 100644 --- a/src/transformers/models/llava/modeling_llava.py +++ b/src/transformers/models/llava/modeling_llava.py @@ -273,6 +273,20 @@ def resize_token_embeddings(self, new_num_tokens: Optional[int] = None, pad_to_m def get_image_features( self, pixel_values: torch.FloatTensor, vision_feature_layer: int, vision_feature_select_strategy: str ): + """ + Obtains image last hidden states from the vision tower and apply multimodal projection. + + Args: + pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`) + The tensors corresponding to the input images. + vision_feature_layer (`int`): + The index of the layer to select the vision feature. + vision_feature_select_strategy (`str`): + The feature selection strategy used to select the vision feature from the vision backbone. + Can be one of `"default"` or `"full"` + Returns: + image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`). + """ image_outputs = self.vision_tower(pixel_values, output_hidden_states=True) # this is not memory efficient at all (output_hidden_states=True) will save all the hidden stated. selected_image_feature = image_outputs.hidden_states[vision_feature_layer] diff --git a/src/transformers/models/llava_next/modeling_llava_next.py b/src/transformers/models/llava_next/modeling_llava_next.py index 04ff098170b7a3..0cbda9cfd64b74 100644 --- a/src/transformers/models/llava_next/modeling_llava_next.py +++ b/src/transformers/models/llava_next/modeling_llava_next.py @@ -705,6 +705,57 @@ def pack_image_features(self, image_features, image_sizes, vision_feature_select feature_lens = torch.tensor(feature_lens, dtype=torch.long, device=image_features.device) return image_features, feature_lens + def get_image_features( + self, + pixel_values: torch.FloatTensor, + image_sizes: torch.Tensor, + vision_feature_layer: int, + vision_feature_select_strategy: str, + ): + """ + Obtains image last hidden states from the vision tower and apply multimodal projection. 
+ + Args: + pixel_values (`torch.FloatTensor]` of shape `(batch_size, num_patches, channels, height, width)`) + The tensors corresponding to the input images. + image_sizes (`torch.Tensor` of shape `(num_images, 2)`) + Actual image size of each images (H, W). + vision_feature_layer (`int`): + The index of the layer to select the vision feature. + vision_feature_select_strategy (`str`): + The feature selection strategy used to select the vision feature from the vision backbone. + Can be one of `"default"` or `"full"` + Returns: + image_features (List[`torch.Tensor`]): List of image feature tensor, each contains all the visual feature of all patches + and are of shape `(num_patches, image_length, embed_dim)`). + """ + # ! infer image_num_patches from image_sizes + image_num_patches = [ + image_size_to_num_patches( + image_size=imsize, + grid_pinpoints=self.config.image_grid_pinpoints, + patch_size=self.config.vision_config.image_size, + ) + for imsize in image_sizes + ] + if pixel_values.dim() == 5: + # stacked if input is (batch_size, num_patches, num_channels, height, width) + _pixel_values_list = [pix_val[:num_patch] for pix_val, num_patch in zip(pixel_values, image_num_patches)] + pixel_values = torch.cat(_pixel_values_list, dim=0) + elif pixel_values.dim() != 4: + # otherwise has to be stacked from list of (num_patches, num_channels, height, width) + raise ValueError(f"pixel_values of shape {pixel_values.shape}, expect to be of 4 or 5 dimensions") + + image_features = self.vision_tower(pixel_values, output_hidden_states=True) + selected_image_feature = image_features.hidden_states[vision_feature_layer] + if vision_feature_select_strategy == "default": + selected_image_feature = selected_image_feature[:, 1:] + elif vision_feature_select_strategy == "full": + selected_image_feature = selected_image_feature + image_features = self.multi_modal_projector(selected_image_feature) + image_features = torch.split(image_features, image_num_patches, dim=0) + return image_features + @add_start_docstrings_to_model_forward(LLAVA_NEXT_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=LlavaNextCausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) def forward( @@ -796,34 +847,12 @@ def forward( ) or (input_ids.shape[-1] == 1 and pixel_values is not None) if pixel_values is not None and pixel_values.size(0) > 0: - # ! 
infer image_num_patches from image_sizes - image_num_patches = [ - image_size_to_num_patches( - image_size=imsize, - grid_pinpoints=self.config.image_grid_pinpoints, - patch_size=self.config.vision_config.image_size, - ) - for imsize in image_sizes - ] - # figure out if pixel_values is concatenated or stacked - if pixel_values.dim() == 5: - # stacking when input is (batch_size, num_patches, num_channels, height, width) - _pixel_values_list = [ - pix_val[:num_patch] for pix_val, num_patch in zip(pixel_values, image_num_patches) - ] - pixel_values = torch.cat(_pixel_values_list, dim=0) - elif pixel_values.dim() != 4: - # otherwise has to be stacked from list of (num_patches, num_channels, height, width) - raise ValueError(f"pixel_values of shape {pixel_values.shape}, expect to be of 4 or 5 dimensions") - - image_features = self.vision_tower(pixel_values, output_hidden_states=True) - selected_image_feature = image_features.hidden_states[vision_feature_layer] - if vision_feature_select_strategy == "default": - selected_image_feature = selected_image_feature[:, 1:] - elif vision_feature_select_strategy == "full": - selected_image_feature = selected_image_feature - image_features = self.multi_modal_projector(selected_image_feature) - image_features = torch.split(image_features, image_num_patches, dim=0) + image_features = self.get_image_features( + pixel_values, + image_sizes, + vision_feature_layer=vision_feature_layer, + vision_feature_select_strategy=vision_feature_select_strategy, + ) # NOTE we only support multimodal_patch_merge_type == "spatial_unpad" image_features, feature_lens = self.pack_image_features( diff --git a/src/transformers/models/llava_next_video/modeling_llava_next_video.py b/src/transformers/models/llava_next_video/modeling_llava_next_video.py index 8d3bfb1efa4e85..96f4373afd9ec6 100644 --- a/src/transformers/models/llava_next_video/modeling_llava_next_video.py +++ b/src/transformers/models/llava_next_video/modeling_llava_next_video.py @@ -744,6 +744,57 @@ def pack_image_features(self, image_features, image_sizes, vision_feature_select feature_lens = torch.tensor(feature_lens, dtype=torch.long, device=image_features.device) return image_features, feature_lens + def get_image_features( + self, + pixel_values: torch.FloatTensor, + image_sizes: torch.Tensor, + vision_feature_layer: int, + vision_feature_select_strategy: str, + ): + """ + Obtains image last hidden states from the vision tower and apply multimodal projection. + + Args: + pixel_values (`torch.FloatTensor]` of shape `(batch_size, num_patches, channels, height, width)`) + The tensors corresponding to the input images. + image_sizes (`torch.Tensor` of shape `(num_images, 2)`) + Actual image size of each images (H, W). + vision_feature_layer (`int`): + The index of the layer to select the vision feature. + vision_feature_select_strategy (`str`): + The feature selection strategy used to select the vision feature from the vision backbone. + Can be one of `"default"` or `"full"` + Returns: + image_features (List[`torch.Tensor`]): List of image feature tensor, each contains all the visual feature of all patches + and are of shape `(num_patches, image_length, embed_dim)`). + """ + # ! 
infer image_num_patches from image_sizes + image_num_patches = [ + image_size_to_num_patches( + image_size=imsize, + grid_pinpoints=self.config.image_grid_pinpoints, + patch_size=self.config.vision_config.image_size, + ) + for imsize in image_sizes + ] + if pixel_values.dim() == 5: + # stacked if input is (batch_size, num_patches, num_channels, height, width) + _pixel_values_list = [pix_val[:num_patch] for pix_val, num_patch in zip(pixel_values, image_num_patches)] + pixel_values = torch.cat(_pixel_values_list, dim=0) + elif pixel_values.dim() != 4: + # otherwise has to be stacked from list of (num_patches, num_channels, height, width) + raise ValueError(f"pixel_values of shape {pixel_values.shape}, expect to be of 4 or 5 dimensions") + + image_features = self.vision_tower(pixel_values, output_hidden_states=True) + selected_image_feature = image_features.hidden_states[vision_feature_layer] + if vision_feature_select_strategy == "default": + selected_image_feature = selected_image_feature[:, 1:] + elif vision_feature_select_strategy == "full": + selected_image_feature = selected_image_feature + image_features = self.multi_modal_projector(selected_image_feature) + image_features = torch.split(image_features, image_num_patches, dim=0) + return image_features + @add_start_docstrings_to_model_forward(LLAVA_NEXT_VIDEO_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=LlavaNextVideoCausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) def forward( @@ -883,7 +934,12 @@ def forward( image_features = feature_lens = None if pixel_values is not None and pixel_values.size(0) > 0: - image_features = self._get_image_features(pixel_values, image_sizes) + image_features = self.get_image_features( + pixel_values, + image_sizes, + vision_feature_layer=self.vision_feature_layer, + vision_feature_select_strategy=self.vision_feature_select_strategy, + ) image_features, feature_lens = self.pack_image_features( image_features, image_sizes, @@ -893,7 +949,11 @@ def forward( video_features = video_feature_lens = None if pixel_values_videos is not None and pixel_values_videos.size(0) > 0: - video_features = self._get_video_features(pixel_values_videos) + video_features = self.get_video_features( + pixel_values_videos, + vision_feature_layer=self.vision_feature_layer, + vision_feature_select_strategy=self.vision_feature_select_strategy, + ) video_features = [feature.flatten(0, 1) for feature in video_features] video_feature_lens = [feature.size(0) for feature in video_features] video_features = torch.cat(video_features, dim=0) @@ -1080,46 +1140,35 @@ def prepare_inputs_for_generation( return model_inputs - def _get_image_features(self, pixel_values, image_sizes): - # ! 
infer image_num_patches from image_sizes - image_num_patches = [ - image_size_to_num_patches( - image_size=imsize, - grid_pinpoints=self.config.image_grid_pinpoints, - patch_size=self.config.vision_config.image_size, - ) - for imsize in image_sizes - ] - if pixel_values.dim() == 5: - # stacked if input is (batch_size, num_patches, num_channels, height, width) - _pixel_values_list = [pix_val[:num_patch] for pix_val, num_patch in zip(pixel_values, image_num_patches)] - pixel_values = torch.cat(_pixel_values_list, dim=0) - elif pixel_values.dim() != 4: - # otherwise has to be stacked from list of (num_patches, num_channels, height, width) - raise ValueError(f"pixel_values of shape {pixel_values.shape}, expect to be of 4 or 5 dimensions") - - image_features = self.vision_tower(pixel_values, output_hidden_states=True) - selected_image_feature = image_features.hidden_states[self.vision_feature_layer] - if self.vision_feature_select_strategy == "default": - selected_image_feature = selected_image_feature[:, 1:] - elif self.vision_feature_select_strategy == "full": - selected_image_feature = selected_image_feature - image_features = self.multi_modal_projector(selected_image_feature) - image_features = torch.split(image_features, image_num_patches, dim=0) - return image_features + def get_video_features( + self, pixel_values: torch.FloatTensor, vision_feature_layer: int, vision_feature_select_strategy: str + ): + """ + Obtains video last hidden states from the vision tower and apply multimodal projection. - def _get_video_features(self, pixel_values): + Args: + pixel_values (`torch.FloatTensor]` of shape `(batch_size, num_frames, channels, height, width)`) + The tensors corresponding to the input video. + vision_feature_layer (`int`): + The index of the layer to select the vision feature. + vision_feature_select_strategy (`str`): + The feature selection strategy used to select the vision feature from the vision backbone. + Can be one of `"default"` or `"full"` + Returns: + video_features (List[`torch.Tensor`]): List of video feature tensor, each contains all the visual feature of all patches + and are of shape `(num_videos, video_length, embed_dim)`). 
+ """ batch_size, frames, channels, height, width = pixel_values.shape pixel_values = pixel_values.reshape(batch_size * frames, channels, height, width) - image_features = self.vision_tower(pixel_values, output_hidden_states=True) - selected_image_feature = image_features.hidden_states[self.vision_feature_layer] - if self.vision_feature_select_strategy == "default": - selected_image_feature = selected_image_feature[:, 1:] - elif self.vision_feature_select_strategy == "full": - selected_image_feature = selected_image_feature + video_features = self.vision_tower(pixel_values, output_hidden_states=True) + selected_video_features = video_features.hidden_states[vision_feature_layer] + if vision_feature_select_strategy == "default": + selected_video_features = selected_video_features[:, 1:] + elif vision_feature_select_strategy == "full": + selected_video_features = selected_video_features # Same as image features except that video has pooling layer - image_features = self.vision_resampler(selected_image_feature) - image_features = self.multi_modal_projector(image_features) - image_features = torch.split(image_features, frames, dim=0) - return image_features + video_features = self.vision_resampler(selected_video_features) + video_features = self.multi_modal_projector(video_features) + video_features = torch.split(video_features, frames, dim=0) + return video_features diff --git a/src/transformers/models/llava_next_video/modular_llava_next_video.py b/src/transformers/models/llava_next_video/modular_llava_next_video.py index ec5a05733ec878..c1ed7571941b9e 100644 --- a/src/transformers/models/llava_next_video/modular_llava_next_video.py +++ b/src/transformers/models/llava_next_video/modular_llava_next_video.py @@ -225,7 +225,30 @@ def __init__(self, config: LlavaNextVideoConfig, **super_kwargs): self.vision_resampler = LlavaNextVideoPooler(config) self.post_init() - def _get_image_features(self, pixel_values, image_sizes): + def get_image_features( + self, + pixel_values: torch.FloatTensor, + image_sizes: torch.Tensor, + vision_feature_layer: int, + vision_feature_select_strategy: str, + ): + """ + Obtains image last hidden states from the vision tower and apply multimodal projection. + + Args: + pixel_values (`torch.FloatTensor]` of shape `(batch_size, num_patches, channels, height, width)`) + The tensors corresponding to the input images. + image_sizes (`torch.Tensor` of shape `(num_images, 2)`) + Actual image size of each images (H, W). + vision_feature_layer (`int`): + The index of the layer to select the vision feature. + vision_feature_select_strategy (`str`): + The feature selection strategy used to select the vision feature from the vision backbone. + Can be one of `"default"` or `"full"` + Returns: + image_features (List[`torch.Tensor`]): List of image feature tensor, each contains all the visual feature of all patches + and are of shape `(num_patches, image_length, embed_dim)`). + """ # ! 
infer image_num_patches from image_sizes image_num_patches = [ image_size_to_num_patches( @@ -244,30 +267,47 @@ def _get_image_features(self, pixel_values, image_sizes): raise ValueError(f"pixel_values of shape {pixel_values.shape}, expect to be of 4 or 5 dimensions") image_features = self.vision_tower(pixel_values, output_hidden_states=True) - selected_image_feature = image_features.hidden_states[self.vision_feature_layer] - if self.vision_feature_select_strategy == "default": + selected_image_feature = image_features.hidden_states[vision_feature_layer] + if vision_feature_select_strategy == "default": selected_image_feature = selected_image_feature[:, 1:] - elif self.vision_feature_select_strategy == "full": + elif vision_feature_select_strategy == "full": selected_image_feature = selected_image_feature image_features = self.multi_modal_projector(selected_image_feature) image_features = torch.split(image_features, image_num_patches, dim=0) return image_features - def _get_video_features(self, pixel_values): + def get_video_features( + self, pixel_values: torch.FloatTensor, vision_feature_layer: int, vision_feature_select_strategy: str + ): + """ + Obtains video last hidden states from the vision tower and apply multimodal projection. + + Args: + pixel_values (`torch.FloatTensor]` of shape `(batch_size, num_frames, channels, height, width)`) + The tensors corresponding to the input video. + vision_feature_layer (`int`): + The index of the layer to select the vision feature. + vision_feature_select_strategy (`str`): + The feature selection strategy used to select the vision feature from the vision backbone. + Can be one of `"default"` or `"full"` + Returns: + video_features (List[`torch.Tensor`]): List of video feature tensor, each contains all the visual feature of all patches + and are of shape `(num_videos, video_length, embed_dim)`). 
+ """ batch_size, frames, channels, height, width = pixel_values.shape pixel_values = pixel_values.reshape(batch_size * frames, channels, height, width) - image_features = self.vision_tower(pixel_values, output_hidden_states=True) - selected_image_feature = image_features.hidden_states[self.vision_feature_layer] - if self.vision_feature_select_strategy == "default": - selected_image_feature = selected_image_feature[:, 1:] - elif self.vision_feature_select_strategy == "full": - selected_image_feature = selected_image_feature + video_features = self.vision_tower(pixel_values, output_hidden_states=True) + selected_video_features = video_features.hidden_states[vision_feature_layer] + if vision_feature_select_strategy == "default": + selected_video_features = selected_video_features[:, 1:] + elif vision_feature_select_strategy == "full": + selected_video_features = selected_video_features # Same as image features except that video has pooling layer - image_features = self.vision_resampler(selected_image_feature) - image_features = self.multi_modal_projector(image_features) - image_features = torch.split(image_features, frames, dim=0) - return image_features + video_features = self.vision_resampler(selected_video_features) + video_features = self.multi_modal_projector(video_features) + video_features = torch.split(video_features, frames, dim=0) + return video_features @replace_return_docstrings(output_type=LlavaNextVideoCausalLMOutputWithPast, config_class="LlavaNextVideoConfig") def forward( @@ -407,7 +447,12 @@ def forward( image_features = feature_lens = None if pixel_values is not None and pixel_values.size(0) > 0: - image_features = self._get_image_features(pixel_values, image_sizes) + image_features = self.get_image_features( + pixel_values, + image_sizes, + vision_feature_layer=self.vision_feature_layer, + vision_feature_select_strategy=self.vision_feature_select_strategy, + ) image_features, feature_lens = self.pack_image_features( image_features, image_sizes, @@ -417,7 +462,11 @@ def forward( video_features = video_feature_lens = None if pixel_values_videos is not None and pixel_values_videos.size(0) > 0: - video_features = self._get_video_features(pixel_values_videos) + video_features = self.get_video_features( + pixel_values_videos, + vision_feature_layer=self.vision_feature_layer, + vision_feature_select_strategy=self.vision_feature_select_strategy, + ) video_features = [feature.flatten(0, 1) for feature in video_features] video_feature_lens = [feature.size(0) for feature in video_features] video_features = torch.cat(video_features, dim=0) diff --git a/src/transformers/models/llava_onevision/modeling_llava_onevision.py b/src/transformers/models/llava_onevision/modeling_llava_onevision.py index 2c5fa511467aff..946688bfcf07f4 100644 --- a/src/transformers/models/llava_onevision/modeling_llava_onevision.py +++ b/src/transformers/models/llava_onevision/modeling_llava_onevision.py @@ -481,6 +481,91 @@ def apply_pooling(self, image_features): image_features = image_features.view(batch_frames, -1, dim) return image_features + def get_image_features( + self, + pixel_values: torch.FloatTensor, + image_sizes: torch.Tensor, + vision_feature_layer: int, + vision_feature_select_strategy: str, + ): + """ + Obtains image last hidden states from the vision tower and apply multimodal projection. + + Args: + pixel_values (`torch.FloatTensor]` of shape `(batch_size, num_patches, channels, height, width)`) + The tensors corresponding to the input images. 
+ image_sizes (`torch.Tensor` of shape `(num_images, 2)`) + Actual image size of each images (H, W). + vision_feature_layer (`int`): + The index of the layer to select the vision feature. + vision_feature_select_strategy (`str`): + The feature selection strategy used to select the vision feature from the vision backbone. + Can be one of `"default"` or `"full"` + Returns: + image_features (List[`torch.Tensor`]): List of image feature tensor, each contains all the visual feature of all patches + and are of shape `(num_patches, image_length, embed_dim)`). + """ + # ! infer image_num_patches from image_sizes + image_num_patches = [ + image_size_to_num_patches( + image_size=imsize, + grid_pinpoints=self.config.image_grid_pinpoints, + patch_size=self.config.vision_config.image_size, + ) + for imsize in image_sizes + ] + if pixel_values.dim() == 5: + # stacked if input is (batch_size, num_patches, num_channels, height, width) + _pixel_values_list = [pix_val[:num_patch] for pix_val, num_patch in zip(pixel_values, image_num_patches)] + pixel_values = torch.cat(_pixel_values_list, dim=0) + elif pixel_values.dim() != 4: + # otherwise has to be stacked from list of (num_patches, num_channels, height, width) + raise ValueError(f"pixel_values of shape {pixel_values.shape}, expect to be of 4 or 5 dimensions") + + image_features = self.vision_tower(pixel_values, output_hidden_states=True) + selected_image_feature = image_features.hidden_states[vision_feature_layer] + if vision_feature_select_strategy == "default": + selected_image_feature = selected_image_feature[:, 1:] + elif vision_feature_select_strategy == "full": + selected_image_feature = selected_image_feature + image_features = self.multi_modal_projector(selected_image_feature) + image_features = torch.split(image_features, image_num_patches, dim=0) + return image_features + + def get_video_features( + self, pixel_values: torch.FloatTensor, vision_feature_layer: int, vision_feature_select_strategy: str + ): + """ + Obtains video last hidden states from the vision tower, apply multimodal projection and pooling. + + Args: + pixel_values (`torch.FloatTensor]` of shape `(batch_size, num_frames, channels, height, width)`) + The tensors corresponding to the input video. + vision_feature_layer (`int`): + The index of the layer to select the vision feature. + vision_feature_select_strategy (`str`): + The feature selection strategy used to select the vision feature from the vision backbone. + Can be one of `"default"` or `"full"` + Returns: + video_features (List[`torch.Tensor`]): List of video feature tensor, each contains all the visual feature of all patches + and are of shape `(num_videos, video_length, embed_dim)`). 
+ """ + batch_size, frames, channels, height, width = pixel_values.shape + pixel_values = pixel_values.view(batch_size * frames, channels, height, width) + video_features = self.vision_tower(pixel_values, output_hidden_states=True) + selected_video_feature = video_features.hidden_states[vision_feature_layer] + + if vision_feature_select_strategy == "default": + selected_video_feature = selected_video_feature[:, 1:] + elif vision_feature_select_strategy == "full": + selected_video_feature = selected_video_feature + video_features = self.multi_modal_projector(selected_video_feature) + + video_features = self.apply_pooling(video_features) + video_features = video_features.reshape(batch_size, frames * video_features.shape[1], -1) + + return video_features + @add_start_docstrings(LLAVA_ONEVISION_INPUTS_DOCSTRING) def forward( self, @@ -580,35 +665,12 @@ def forward( # Images are processed with Anyres if pixel_values is not None: - image_num_patches = [ - image_size_to_num_patches( - image_size=imsize, - grid_pinpoints=self.config.image_grid_pinpoints, - patch_size=self.config.vision_config.image_size, - ) - for imsize in image_sizes - ] - - # unpad extra patches and concatenate them - if pixel_values.dim() == 5: - _pixel_values_list = [ - pix_val[:num_patch] for pix_val, num_patch in zip(pixel_values, image_num_patches) - ] - # [batch_size*frames*num_patches, num_channels, height, width] where frames=1 for images - pixel_values = torch.cat(_pixel_values_list, dim=0) - elif pixel_values.dim() != 4: - raise ValueError(f"pixel_values of shape {pixel_values.shape}, expect to be of 4 or 5 dimensions") - - image_features = self.vision_tower(pixel_values, output_hidden_states=True) - selected_image_feature = image_features.hidden_states[vision_feature_layer] - - if vision_feature_select_strategy == "default": - selected_image_feature = selected_image_feature[:, 1:] - elif vision_feature_select_strategy == "full": - selected_image_feature = selected_image_feature - image_features = self.multi_modal_projector(selected_image_feature) - - image_features = torch.split(image_features, image_num_patches, dim=0) + image_features = self.get_image_features( + pixel_values, + image_sizes, + vision_feature_layer=vision_feature_layer, + vision_feature_select_strategy=vision_feature_select_strategy, + ) image_features, feature_lens = self.pack_image_features( image_features, image_sizes, @@ -632,20 +694,14 @@ def forward( # Video are simply embedded and further pooled to decrease seq len if pixel_values_videos is not None: - batch_size, frames, channels, height, width = pixel_values_videos.shape - pixel_values_videos = pixel_values_videos.view(batch_size * frames, channels, height, width) - video_features = self.vision_tower(pixel_values_videos, output_hidden_states=True) - selected_video_feature = video_features.hidden_states[vision_feature_layer] - - if vision_feature_select_strategy == "default": - selected_video_feature = selected_video_feature[:, 1:] - elif vision_feature_select_strategy == "full": - selected_video_feature = selected_video_feature - video_features = self.multi_modal_projector(selected_video_feature) - - video_features = self.apply_pooling(video_features) - video_features = video_features.reshape(batch_size, frames * video_features.shape[1], -1) - image_newline = self.image_newline[None, None, :].repeat(batch_size, 1, 1).to(video_features.device) + video_features = self.get_video_features( + pixel_values_videos, + vision_feature_layer=vision_feature_layer, + 
vision_feature_select_strategy=vision_feature_select_strategy, + ) + image_newline = ( + self.image_newline[None, None, :].repeat(video_features.shape[0], 1, 1).to(video_features.device) + ) video_features = torch.cat((video_features, image_newline), dim=1) video_features = video_features.flatten(0, 1) n_video_tokens = (input_ids == self.config.video_token_index).sum().item() diff --git a/src/transformers/models/paligemma/modeling_paligemma.py b/src/transformers/models/paligemma/modeling_paligemma.py index ffb4b7435f2a2a..e198dab420abe8 100644 --- a/src/transformers/models/paligemma/modeling_paligemma.py +++ b/src/transformers/models/paligemma/modeling_paligemma.py @@ -392,6 +392,22 @@ def _update_causal_mask( ) return causal_mask + def get_image_features(self, pixel_values: torch.FloatTensor): + """ + Obtains image last hidden states from the vision tower and apply multimodal projection. + + Args: + pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`) + The tensors corresponding to the input images. + Returns: + image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`). + """ + image_outputs = self.vision_tower(pixel_values) + selected_image_feature = image_outputs.last_hidden_state + image_features = self.multi_modal_projector(selected_image_feature) + image_features = image_features / (self.config.hidden_size**0.5) + return image_features + @add_start_docstrings_to_model_forward(PALIGEMMA_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=PaliGemmaCausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) def forward( @@ -477,10 +493,7 @@ def forward( # Merge text and images if pixel_values is not None: - image_outputs = self.vision_tower(pixel_values.to(inputs_embeds.dtype)) - selected_image_feature = image_outputs.last_hidden_state - image_features = self.multi_modal_projector(selected_image_feature) - image_features = image_features / (self.config.hidden_size**0.5) + image_features = self.get_image_features(pixel_values) special_image_mask = (input_ids == self.config.image_token_index).unsqueeze(-1) special_image_mask = special_image_mask.expand_as(inputs_embeds).to(inputs_embeds.device) diff --git a/src/transformers/models/video_llava/modeling_video_llava.py b/src/transformers/models/video_llava/modeling_video_llava.py index b455040059e653..c4ec1b5196929a 100644 --- a/src/transformers/models/video_llava/modeling_video_llava.py +++ b/src/transformers/models/video_llava/modeling_video_llava.py @@ -23,7 +23,7 @@ from ...activations import ACT2FN from ...generation import GenerationMixin -from ...modeling_outputs import BaseModelOutputWithPooling, ModelOutput +from ...modeling_outputs import ModelOutput from ...modeling_utils import PreTrainedModel from ...utils import ( add_start_docstrings, @@ -355,41 +355,59 @@ def _merge_input_ids_with_visual_features( return final_embedding, final_attention_mask, final_labels, position_ids, final_input_ids - def _get_vision_features( - self, - pixel_values_images: Optional[torch.FloatTensor] = None, - pixel_values_videos: Optional[torch.FloatTensor] = None, - vision_feature_layer: Optional[int] = None, - vision_feature_select_strategy: Optional[str] = None, - ) -> Union[Tuple, BaseModelOutputWithPooling]: - if pixel_values_images is None and pixel_values_videos is None: - raise ValueError("You have to specify `pixel_values_images` or `pixel_values_videos`") + def get_image_features( + self, pixel_values_images: torch.FloatTensor, vision_feature_layer: int, 
vision_feature_select_strategy: str + ): + """ + Obtains image last hidden states from the vision tower and apply multimodal projection. - # videos do not need to select features and it's always "full" (as it is done in the orig implementation) - if pixel_values_videos is not None: - batch_size_vid, num_frames, channels, height, width = pixel_values_videos.shape + Args: + pixel_values_images (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`) + The tensors corresponding to the input images. + vision_feature_layer (`int`): + The index of the layer to select the vision feature. + vision_feature_select_strategy (`str`): + The feature selection strategy used to select the vision feature from the vision backbone. + Can be one of `"default"` or `"full"` + Returns: + image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`). + """ - pixel_values = pixel_values_videos.reshape(batch_size_vid * num_frames, channels, height, width) - video_outputs = self.video_tower(pixel_values, output_hidden_states=True) - video_outputs = video_outputs.hidden_states[vision_feature_layer].squeeze(1) + image_outputs = self.image_tower(pixel_values_images, output_hidden_states=True) + image_outputs = image_outputs.hidden_states[vision_feature_layer].squeeze(1) + + if vision_feature_select_strategy == "default": + image_outputs = image_outputs[:, 1:] + elif vision_feature_select_strategy == "full": + image_outputs = image_outputs else: - video_outputs = None - num_frames = 0 + raise ValueError(f"Unexpected select feature strategy: {self.config.vision_feature_select_strategy}") - if pixel_values_images is not None: - image_outputs = self.image_tower(pixel_values_images, output_hidden_states=True) - image_outputs = image_outputs.hidden_states[vision_feature_layer].squeeze(1) + image_features = self.multi_modal_projector(image_outputs) - if vision_feature_select_strategy == "default": - image_outputs = image_outputs[:, 1:] - elif vision_feature_select_strategy == "full": - image_outputs = image_outputs - else: - raise ValueError(f"Unexpected select feature strategy: {self.config.vision_feature_select_strategy}") - else: - image_outputs = None + return image_features - return image_outputs, video_outputs, num_frames + def get_video_features(self, pixel_values_videos: torch.FloatTensor, vision_feature_layer: int): + """ + Obtains video last hidden states from the vision tower and apply multimodal projection. + + Args: + pixel_values_videos (`torch.FloatTensor]` of shape `(batch_size, num_frames, channels, height, width)`) + The tensors corresponding to the input videos. + vision_feature_layer (`int`): + The index of the layer to select the vision feature. + Returns: + video_features (`torch.Tensor`): Video feature tensor of shape `(num_videos * num_frames, image_length, embed_dim)`). + frames (`int`): Number of frames the videos have. 
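For Video-LLaVA the refactor splits the old joint vision path into the two helpers above. A minimal sketch with dummy tensors, assuming the `LanguageBind/Video-LLaVA-7B-hf` checkpoint and 8-frame clips at 224x224 (checkpoint name, frame count, and resolution are assumptions, not part of the patch):

    import torch
    from transformers import VideoLlavaForConditionalGeneration

    model = VideoLlavaForConditionalGeneration.from_pretrained("LanguageBind/Video-LLaVA-7B-hf")

    pixel_values_images = torch.randn(1, 3, 224, 224)     # (batch, channels, height, width)
    pixel_values_videos = torch.randn(1, 8, 3, 224, 224)  # (batch, num_frames, channels, height, width)

    image_features = model.get_image_features(
        pixel_values_images,
        vision_feature_layer=model.config.vision_feature_layer,
        vision_feature_select_strategy=model.config.vision_feature_select_strategy,
    )
    # Videos skip the feature-selection strategy, and the helper also reports the frame count.
    video_features, num_frames = model.get_video_features(
        pixel_values_videos, vision_feature_layer=model.config.vision_feature_layer
    )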
+ """ + batch_size_vid, num_frames, channels, height, width = pixel_values_videos.shape + + pixel_values = pixel_values_videos.reshape(batch_size_vid * num_frames, channels, height, width) + video_outputs = self.video_tower(pixel_values, output_hidden_states=True) + video_features = video_outputs.hidden_states[vision_feature_layer].squeeze(1) + video_features = self.multi_modal_projector(video_features) + + return video_features, num_frames @add_start_docstrings_to_model_forward(VIDEO_LLAVA_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=VideoLlavaCausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) @@ -534,110 +552,106 @@ def forward( ) legacy_processing = inputs_not_expanded or pixels_present - if pixel_values_images is not None or pixel_values_videos is not None: - image_outputs, video_outputs, num_frames = self._get_vision_features( - pixel_values_images=pixel_values_images, - pixel_values_videos=pixel_values_videos, + image_features = None + if pixel_values_images is not None: + image_features = self.get_image_features( + pixel_values_images, vision_feature_layer=vision_feature_layer, vision_feature_select_strategy=vision_feature_select_strategy, ) - image_features = video_features = None - if image_outputs is not None: - image_features = self.multi_modal_projector(image_outputs) - if video_outputs is not None: - video_features = self.multi_modal_projector(video_outputs) - - if legacy_processing: - logger.warning_once( - "Expanding inputs for image tokens in Video-LLaVa should be done in processing. " - "Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly " - "with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. " - "Using processors without these attributes in the config is deprecated and will throw an error in v4.47." 
- ) - if input_ids.shape[1] != 1: - for features, frames in ((image_features, 1), (video_features, num_frames)): - if features is not None: - ( - inputs_embeds, - attention_mask, - labels, - position_ids, - input_ids, - ) = self._merge_input_ids_with_visual_features( - features, - inputs_embeds, - input_ids, - attention_mask, - labels, - num_frames=frames, - ) - cache_position = torch.arange(attention_mask.shape[1], device=attention_mask.device) - else: - # Retrieve the first layer to inspect the logits and mask out the hidden states - # that are set to 0 - first_layer_past_key_value = past_key_values[0][0][:, :, :, 0] - - # Sum all dimensions of head_dim (-2) to avoid random errors such as: https://github.com/huggingface/transformers/pull/28032#issuecomment-1863691941 - batch_index, non_attended_tokens = torch.where(first_layer_past_key_value.float().sum(-2) == 0) - - target_length = input_ids.shape[1] - past_length = first_layer_past_key_value.shape[-1] - - extended_attention_mask = torch.ones( - (attention_mask.shape[0], past_length), - dtype=attention_mask.dtype, - device=attention_mask.device, - ) + video_features = None + if pixel_values_videos is not None: + video_features, num_frames = self.get_video_features( + pixel_values_videos=pixel_values_videos, vision_feature_layer=vision_feature_layer + ) - # Filter out only the tokens that can be un-attended, this can happen - # if one uses Llava + Fused modules where the cache on the - # first iteration is already big enough, or if one passes custom cache - valid_indices = non_attended_tokens < extended_attention_mask.size(-1) - new_batch_index = batch_index[valid_indices] - new_non_attended_tokens = non_attended_tokens[valid_indices] + if legacy_processing: + logger.warning_once( + "Expanding inputs for image tokens in Video-LLaVa should be done in processing. " + "Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly " + "with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. " + "Using processors without these attributes in the config is deprecated and will throw an error in v4.47." 
+ ) + if input_ids.shape[1] != 1: + for features, frames in ((image_features, 1), (video_features, num_frames)): + if features is not None: + ( + inputs_embeds, + attention_mask, + labels, + position_ids, + input_ids, + ) = self._merge_input_ids_with_visual_features( + features, + inputs_embeds, + input_ids, + attention_mask, + labels, + num_frames=frames, + ) + cache_position = torch.arange(attention_mask.shape[1], device=attention_mask.device) + else: + # Retrieve the first layer to inspect the logits and mask out the hidden states + # that are set to 0 + first_layer_past_key_value = past_key_values[0][0][:, :, :, 0] + + # Sum all dimensions of head_dim (-2) to avoid random errors such as: https://github.com/huggingface/transformers/pull/28032#issuecomment-1863691941 + batch_index, non_attended_tokens = torch.where(first_layer_past_key_value.float().sum(-2) == 0) + target_length = input_ids.shape[1] + past_length = first_layer_past_key_value.shape[-1] + extended_attention_mask = torch.ones( + (attention_mask.shape[0], past_length), + dtype=attention_mask.dtype, + device=attention_mask.device, + ) - # Zero-out the places where we don't need to attend - extended_attention_mask[new_batch_index, new_non_attended_tokens] = 0 + # Filter out only the tokens that can be un-attended, this can happen + # if one uses Llava + Fused modules where the cache on the + # first iteration is already big enough, or if one passes custom cache + valid_indices = non_attended_tokens < extended_attention_mask.size(-1) + new_batch_index = batch_index[valid_indices] + new_non_attended_tokens = non_attended_tokens[valid_indices] - attention_mask = torch.cat((extended_attention_mask, attention_mask[:, -target_length:]), dim=1) - position_ids = torch.sum(attention_mask, dim=1).unsqueeze(-1) - 1 - cache_position = torch.arange(attention_mask.shape[1], device=attention_mask.device)[ - -target_length: - ] + # Zero-out the places where we don't need to attend + extended_attention_mask[new_batch_index, new_non_attended_tokens] = 0 + attention_mask = torch.cat((extended_attention_mask, attention_mask[:, -target_length:]), dim=1) + position_ids = torch.sum(attention_mask, dim=1).unsqueeze(-1) - 1 + cache_position = torch.arange(attention_mask.shape[1], device=attention_mask.device)[-target_length:] - # TODO: @raushan retain only the new behavior after v4.47 - else: - if image_outputs is not None: - n_image_tokens = (input_ids == self.config.image_token_index).sum(dim=-1)[0].item() - n_image_features = image_features.shape[1] - if n_image_tokens != n_image_features: - raise ValueError( - f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}" - ) - special_image_mask = ( - (input_ids == self.config.image_token_index) - .unsqueeze(-1) - .expand_as(inputs_embeds) - .to(inputs_embeds.device) + # TODO: @raushan retain only the new behavior after v4.47 + else: + if pixel_values_images is not None: + n_image_tokens = (input_ids == self.config.image_token_index).sum(dim=-1)[0].item() + n_image_features = image_features.shape[1] + if n_image_tokens != n_image_features: + raise ValueError( + f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}" ) - image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) - inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features) - if video_outputs is not None: - n_video_tokens = (input_ids == self.config.video_token_index).sum(dim=-1)[0].item() - 
n_video_features = video_features.shape[1] - if n_video_tokens != n_video_features: - raise ValueError( - f"Video features and video tokens do not match: tokens: {n_video_tokens}, features {n_video_features}" - ) - special_image_mask = ( - (input_ids == self.config.video_token_index) - .unsqueeze(-1) - .expand_as(inputs_embeds) - .to(inputs_embeds.device) + special_image_mask = ( + (input_ids == self.config.image_token_index) + .unsqueeze(-1) + .expand_as(inputs_embeds) + .to(inputs_embeds.device) + ) + image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) + inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features) + + if pixel_values_videos is not None: + n_video_tokens = (input_ids == self.config.video_token_index).sum(dim=-1)[0].item() + n_video_features = video_features.shape[1] + if n_video_tokens != n_video_features: + raise ValueError( + f"Video features and video tokens do not match: tokens: {n_video_tokens}, features {n_video_features}" ) - video_features = video_features.to(inputs_embeds.device, inputs_embeds.dtype) - inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, video_features) + special_image_mask = ( + (input_ids == self.config.video_token_index) + .unsqueeze(-1) + .expand_as(inputs_embeds) + .to(inputs_embeds.device) + ) + video_features = video_features.to(inputs_embeds.device, inputs_embeds.dtype) + inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, video_features) outputs = self.language_model( attention_mask=attention_mask, diff --git a/src/transformers/models/vipllava/modeling_vipllava.py b/src/transformers/models/vipllava/modeling_vipllava.py index 10935c0b63e076..dd7baa34406fb0 100644 --- a/src/transformers/models/vipllava/modeling_vipllava.py +++ b/src/transformers/models/vipllava/modeling_vipllava.py @@ -275,6 +275,17 @@ def resize_token_embeddings(self, new_num_tokens: Optional[int] = None, pad_to_m # Ignore copy def get_image_features(self, pixel_values: torch.FloatTensor, vision_feature_layers: List[int]): + """ + Obtains image last hidden states from the vision tower and apply multimodal projection. + + Args: + pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`) + The tensors corresponding to the input images. + vision_feature_layers (`List[int]`): + The list og indexes of the layers to select the vision feature. + Returns: + image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`). 
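VipLlava is the one variant in this refactor that takes a list of layer indexes, since it selects features from several vision-tower layers before projection. A minimal sketch, assuming the `llava-hf/vip-llava-7b-hf` checkpoint and a dummy 336x336 input (both illustrative):

    import torch
    from transformers import VipLlavaForConditionalGeneration

    model = VipLlavaForConditionalGeneration.from_pretrained("llava-hf/vip-llava-7b-hf")
    pixel_values = torch.randn(1, 3, 336, 336)  # dummy image batch

    # Features from every configured layer are gathered before the multimodal projector.
    image_features = model.get_image_features(
        pixel_values, vision_feature_layers=model.config.vision_feature_layers
    )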
+ """ image_outputs = self.vision_tower(pixel_values, output_hidden_states=True) # For VIP-llava, the image features are computed this way From 73d65e637b63193289dbf6727297cb9ecdf4ff29 Mon Sep 17 00:00:00 2001 From: Raushan Turganbay Date: Tue, 22 Oct 2024 08:23:53 +0200 Subject: [PATCH 061/385] T5 compile compatibilty (#34089) * this worked in normal generation, needs more tests * fix almost all tests in t5 * nit * longt5, umt5, mt5 * style * udop, pix2struct * more models * fix some tests * fix onnx tests * tracing tests fixed * compile enabled and tested for t5 models * fix small bug in slow tests * [run-slow] t5 * uncomment * style * update with new generation refactoring * nit * fix copies * this is the fix, had to change t5 to fix copies * update * [run-slow] t5 * [run-slow] t5 * update * add test for encoder only T5 * clean up after rebase * fix pop2piano * add comment * style * fix copies after rebase * fix copies missed this one --- src/transformers/cache_utils.py | 6 +- src/transformers/generation/utils.py | 8 +- .../models/longt5/configuration_longt5.py | 7 +- .../models/longt5/modeling_longt5.py | 455 ++++++++++++------ .../models/mt5/configuration_mt5.py | 7 +- src/transformers/models/mt5/modeling_mt5.py | 449 +++++++++++------ .../models/pix2struct/modeling_pix2struct.py | 407 +++++++++++----- .../models/pop2piano/modeling_pop2piano.py | 451 +++++++++++------ .../modeling_switch_transformers.py | 454 +++++++++++------ .../models/t5/configuration_t5.py | 7 +- src/transformers/models/t5/modeling_t5.py | 452 +++++++++++------ src/transformers/models/udop/modeling_udop.py | 434 +++++++++++------ .../models/umt5/configuration_umt5.py | 7 +- src/transformers/models/umt5/modeling_umt5.py | 401 +++++++++++---- tests/models/longt5/test_modeling_longt5.py | 38 +- tests/models/mt5/test_modeling_mt5.py | 46 +- .../pop2piano/test_modeling_pop2piano.py | 2 +- .../test_modeling_switch_transformers.py | 36 ++ tests/models/t5/test_modeling_t5.py | 114 ++++- tests/models/udop/test_modeling_udop.py | 74 +++ tests/models/umt5/test_modeling_umt5.py | 39 ++ tests/test_modeling_common.py | 27 +- 22 files changed, 2743 insertions(+), 1178 deletions(-) diff --git a/src/transformers/cache_utils.py b/src/transformers/cache_utils.py index 4e4a1ee26c12d7..0f696cc3ac6a4d 100644 --- a/src/transformers/cache_utils.py +++ b/src/transformers/cache_utils.py @@ -1475,11 +1475,7 @@ def from_legacy_cache( def get_seq_length(self, layer_idx: Optional[int] = 0) -> int: """Returns the sequence length of the cached states. 
A layer index can be optionally passed.""" # check if empty list because in case of static cache it will be a tensors and we can't check `if not torch.Tensor` - if self.self_attention_cache.key_cache == []: - return 0 - if len(self.self_attention_cache.key_cache) > 1 and self.self_attention_cache.key_cache[layer_idx] == []: - return 0 - return (self.self_attention_cache.key_cache[layer_idx][0, 0].any(dim=-1)).sum() + return self.self_attention_cache.get_seq_length(layer_idx) def reset(self): if hasattr(self.self_attention_cache, "reset"): diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py index 9ede527ecb7b80..c399a8a2c829c7 100644 --- a/src/transformers/generation/utils.py +++ b/src/transformers/generation/utils.py @@ -1535,8 +1535,12 @@ def _prepare_generation_config( def _get_initial_cache_position(self, input_ids, model_kwargs): """Calculates `cache_position` for the pre-fill stage based on `input_ids` and optionally past length""" # `torch.compile`-friendly `torch.arange` from a shape -- the lines below are equivalent to `torch.arange` - if "inputs_embeds" in model_kwargs: + if "inputs_embeds" in model_kwargs and not self.config.is_encoder_decoder: cache_position = torch.ones_like(model_kwargs["inputs_embeds"][0, :, 0], dtype=torch.int64).cumsum(0) - 1 + elif "decoder_inputs_embeds" in model_kwargs and self.config.is_encoder_decoder: + cache_position = ( + torch.ones_like(model_kwargs["decoder_inputs_embeds"][0, :, 0], dtype=torch.int64).cumsum(0) - 1 + ) else: cache_position = torch.ones_like(input_ids[0, :], dtype=torch.int64).cumsum(0) - 1 @@ -1633,7 +1637,7 @@ def get_layer_device_map(execution_device_map: Optional[dict] = None): cache_kwargs = { "config": self.config.get_text_config(), - "max_batch_size": batch_size, + "batch_size": batch_size, "max_cache_len": max_cache_len, "device": device, "dtype": cache_dtype, diff --git a/src/transformers/models/longt5/configuration_longt5.py b/src/transformers/models/longt5/configuration_longt5.py index 0e541ae2a1b4fa..b6e7d21b3d677b 100644 --- a/src/transformers/models/longt5/configuration_longt5.py +++ b/src/transformers/models/longt5/configuration_longt5.py @@ -79,7 +79,12 @@ class LongT5Config(PretrainedConfig): model_type = "longt5" keys_to_ignore_at_inference = ["past_key_values"] - attribute_map = {"hidden_size": "d_model", "num_attention_heads": "num_heads", "num_hidden_layers": "num_layers"} + attribute_map = { + "hidden_size": "d_model", + "num_attention_heads": "num_heads", + "num_hidden_layers": "num_layers", + "head_dim": "d_kv", + } def __init__( self, diff --git a/src/transformers/models/longt5/modeling_longt5.py b/src/transformers/models/longt5/modeling_longt5.py index d351e798ac7f88..29536d9ad6f284 100644 --- a/src/transformers/models/longt5/modeling_longt5.py +++ b/src/transformers/models/longt5/modeling_longt5.py @@ -24,7 +24,9 @@ from torch.nn import CrossEntropyLoss from ...activations import ACT2FN +from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache, StaticCache from ...generation import GenerationMixin +from ...modeling_attn_mask_utils import AttentionMaskConverter from ...modeling_outputs import ( BaseModelOutput, BaseModelOutputWithPastAndCrossAttentions, @@ -39,6 +41,7 @@ add_start_docstrings, add_start_docstrings_to_model_forward, is_torch_fx_proxy, + is_torchdynamo_compiling, logging, replace_return_docstrings, ) @@ -317,7 +320,12 @@ def forward(self, hidden_states): # Copied from transformers.models.t5.modeling_t5.T5Attention with T5->LongT5 class 
LongT5Attention(nn.Module): - def __init__(self, config: LongT5Config, has_relative_attention_bias=False): + def __init__( + self, + config: LongT5Config, + has_relative_attention_bias=False, + layer_idx: Optional[int] = None, + ): super().__init__() self.is_decoder = config.is_decoder self.has_relative_attention_bias = has_relative_attention_bias @@ -328,6 +336,13 @@ def __init__(self, config: LongT5Config, has_relative_attention_bias=False): self.n_heads = config.num_heads self.dropout = config.dropout_rate self.inner_dim = self.n_heads * self.key_value_proj_dim + self.layer_idx = layer_idx + if layer_idx is None and self.is_decoder: + logger.warning_once( + f"Instantiating a decoder {self.__class__.__name__} without passing `layer_idx` is not recommended and " + "will to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` " + "when creating this class." + ) # Mesh TensorFlow initialization to avoid scaling before softmax self.q = nn.Linear(self.d_model, self.inner_dim, bias=False) @@ -404,11 +419,14 @@ def _relative_position_bucket(relative_position, bidirectional=True, num_buckets relative_buckets += torch.where(is_small, relative_position, relative_position_if_large) return relative_buckets - def compute_bias(self, query_length, key_length, device=None): + def compute_bias(self, query_length, key_length, device=None, cache_position=None): """Compute binned relative position bias""" if device is None: device = self.relative_attention_bias.weight.device - context_position = torch.arange(query_length, dtype=torch.long, device=device)[:, None] + if cache_position is None: + context_position = torch.arange(query_length, dtype=torch.long, device=device)[:, None] + else: + context_position = cache_position[:, None].to(device) memory_position = torch.arange(key_length, dtype=torch.long, device=device)[None, :] relative_position = memory_position - context_position # shape (query_length, key_length) relative_position_bucket = self._relative_position_bucket( @@ -432,94 +450,72 @@ def forward( query_length=None, use_cache=False, output_attentions=False, + cache_position=None, ): """ Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states). """ # Input is (batch_size, seq_length, dim) - # Mask is (batch_size, key_length) (non-causal) or (batch_size, key_length, key_length) - # past_key_value[0] is (batch_size, n_heads, q_len - 1, dim_per_head) + # Mask is (batch_size, 1, 1, key_length) (non-causal encoder) or (batch_size, 1, seq_length, key_length) (causal decoder) batch_size, seq_length = hidden_states.shape[:2] - real_seq_length = seq_length + # if key_value_states are provided this layer is used as a cross-attention layer for the decoder + is_cross_attention = key_value_states is not None - if past_key_value is not None: - if len(past_key_value) != 2: - raise ValueError( - f"past_key_value should have 2 past states: keys and values. 
Got { len(past_key_value)} past states" - ) - real_seq_length += past_key_value[0].shape[2] if query_length is None else query_length + query_states = self.q(hidden_states) + query_states = query_states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) - key_length = real_seq_length if key_value_states is None else key_value_states.shape[1] - - def shape(states): - """projection""" - return states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) + if past_key_value is not None: + is_updated = past_key_value.is_updated.get(self.layer_idx) + if is_cross_attention: + # after the first generated id, we can subsequently re-use all key/value_states from cache + curr_past_key_value = past_key_value.cross_attention_cache + else: + curr_past_key_value = past_key_value.self_attention_cache - def unshape(states): - """reshape""" - return states.transpose(1, 2).contiguous().view(batch_size, -1, self.inner_dim) - - def project(hidden_states, proj_layer, key_value_states, past_key_value): - """projects hidden states correctly to key/query states""" - if key_value_states is None: - # self-attn - # (batch_size, n_heads, seq_length, dim_per_head) - hidden_states = shape(proj_layer(hidden_states)) - elif past_key_value is None: - # cross-attn - # (batch_size, n_heads, seq_length, dim_per_head) - hidden_states = shape(proj_layer(key_value_states)) + current_states = key_value_states if is_cross_attention else hidden_states + if is_cross_attention and past_key_value is not None and is_updated: + # reuse k,v, cross_attentions + key_states = curr_past_key_value.key_cache[self.layer_idx] + value_states = curr_past_key_value.value_cache[self.layer_idx] + else: + key_states = self.k(current_states) + value_states = self.v(current_states) + key_states = key_states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) + value_states = value_states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) if past_key_value is not None: - if key_value_states is None: - # self-attn - # (batch_size, n_heads, key_length, dim_per_head) - hidden_states = torch.cat([past_key_value, hidden_states], dim=2) - elif past_key_value.shape[2] != key_value_states.shape[1]: - # checking that the `sequence_length` of the `past_key_value` is the same as - # the provided `key_value_states` to support prefix tuning - # cross-attn - # (batch_size, n_heads, seq_length, dim_per_head) - hidden_states = shape(proj_layer(key_value_states)) - else: - # cross-attn - hidden_states = past_key_value - return hidden_states - - # get query states - query_states = shape(self.q(hidden_states)) # (batch_size, n_heads, seq_length, dim_per_head) - - # get key/value states - key_states = project( - hidden_states, self.k, key_value_states, past_key_value[0] if past_key_value is not None else None - ) - value_states = project( - hidden_states, self.v, key_value_states, past_key_value[1] if past_key_value is not None else None - ) + # save all key/value_states to cache to be re-used for fast auto-regressive generation + cache_position = cache_position if not is_cross_attention else None + key_states, value_states = curr_past_key_value.update( + key_states, value_states, self.layer_idx, {"cache_position": cache_position} + ) + # set flag that curr layer for cross-attn is already updated so we can re-use in subsequent calls + if is_cross_attention: + past_key_value.is_updated[self.layer_idx] = True - # compute scores - scores = torch.matmul( - query_states, 
key_states.transpose(3, 2) - ) # equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states), compatible with onnx op>9 + # compute scores, equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states), compatible with onnx op>9 + scores = torch.matmul(query_states, key_states.transpose(3, 2)) if position_bias is None: + key_length = key_states.shape[-2] + # cache position is 0-indexed so we add 1 to get the real length of queries (aka with past) + real_seq_length = query_length if query_length is not None else cache_position[-1] + 1 if not self.has_relative_attention_bias: position_bias = torch.zeros( - (1, self.n_heads, real_seq_length, key_length), device=scores.device, dtype=scores.dtype + (1, self.n_heads, seq_length, key_length), device=scores.device, dtype=scores.dtype ) if self.gradient_checkpointing and self.training: position_bias.requires_grad = True else: - position_bias = self.compute_bias(real_seq_length, key_length, device=scores.device) - - # if key and values are already calculated - # we want only the last query position bias - if past_key_value is not None: - position_bias = position_bias[:, :, -hidden_states.size(1) :, :] + position_bias = self.compute_bias( + real_seq_length, key_length, device=scores.device, cache_position=cache_position + ) + position_bias = position_bias[:, :, -seq_length:, :] if mask is not None: - position_bias = position_bias + mask # (batch_size, n_heads, seq_length, key_length) + causal_mask = mask[:, :, :, : key_states.shape[-2]] + position_bias = position_bias + causal_mask if self.pruned_heads: mask = torch.ones(position_bias.shape[1]) @@ -529,22 +525,22 @@ def project(hidden_states, proj_layer, key_value_states, past_key_value): position_bias_masked = position_bias scores += position_bias_masked - attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as( - scores - ) # (batch_size, n_heads, seq_length, key_length) - attn_weights = nn.functional.dropout( - attn_weights, p=self.dropout, training=self.training - ) # (batch_size, n_heads, seq_length, key_length) + + # (batch_size, n_heads, seq_length, key_length) + attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as(scores) + attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) # Mask heads if we want to if layer_head_mask is not None: attn_weights = attn_weights * layer_head_mask - attn_output = unshape(torch.matmul(attn_weights, value_states)) # (batch_size, seq_length, dim) + attn_output = torch.matmul(attn_weights, value_states) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.view(batch_size, -1, self.inner_dim) attn_output = self.o(attn_output) - present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None - outputs = (attn_output,) + (present_key_value_state,) + (position_bias,) + outputs = (attn_output, past_key_value, position_bias) if output_attentions: outputs = outputs + (attn_weights,) @@ -1008,9 +1004,11 @@ def unshape(states): # Copied from transformers.models.t5.modeling_t5.T5LayerSelfAttention with T5->LongT5 class LongT5LayerSelfAttention(nn.Module): - def __init__(self, config, has_relative_attention_bias=False): + def __init__(self, config, has_relative_attention_bias=False, layer_idx: Optional[int] = None): super().__init__() - self.SelfAttention = LongT5Attention(config, has_relative_attention_bias=has_relative_attention_bias) + self.SelfAttention = LongT5Attention( + config, 
has_relative_attention_bias=has_relative_attention_bias, layer_idx=layer_idx + ) self.layer_norm = LongT5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) self.dropout = nn.Dropout(config.dropout_rate) @@ -1023,6 +1021,7 @@ def forward( past_key_value=None, use_cache=False, output_attentions=False, + cache_position=None, ): normed_hidden_states = self.layer_norm(hidden_states) attention_output = self.SelfAttention( @@ -1033,6 +1032,7 @@ def forward( past_key_value=past_key_value, use_cache=use_cache, output_attentions=output_attentions, + cache_position=cache_position, ) hidden_states = hidden_states + self.dropout(attention_output[0]) outputs = (hidden_states,) + attention_output[1:] # add attentions if we output them @@ -1042,7 +1042,7 @@ def forward( class LongT5LayerLocalSelfAttention(nn.Module): """Local self attention used in encoder""" - def __init__(self, config, has_relative_attention_bias=False): + def __init__(self, config, has_relative_attention_bias=False, layer_idx: Optional[int] = None): super().__init__() self.LocalSelfAttention = LongT5LocalAttention(config, has_relative_attention_bias=has_relative_attention_bias) self.layer_norm = LongT5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) @@ -1073,7 +1073,7 @@ def forward( class LongT5LayerTransientGlobalSelfAttention(nn.Module): """Transient-Global self attention used in encoder""" - def __init__(self, config, has_relative_attention_bias=False): + def __init__(self, config, has_relative_attention_bias=False, layer_idx: Optional[int] = None): super().__init__() self.TransientGlobalSelfAttention = LongT5TransientGlobalAttention( config, has_relative_attention_bias=has_relative_attention_bias @@ -1105,9 +1105,9 @@ def forward( # Copied from transformers.models.t5.modeling_t5.T5LayerCrossAttention with T5->LongT5 class LongT5LayerCrossAttention(nn.Module): - def __init__(self, config): + def __init__(self, config, layer_idx: Optional[int] = None): super().__init__() - self.EncDecAttention = LongT5Attention(config, has_relative_attention_bias=False) + self.EncDecAttention = LongT5Attention(config, has_relative_attention_bias=False, layer_idx=layer_idx) self.layer_norm = LongT5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) self.dropout = nn.Dropout(config.dropout_rate) @@ -1122,6 +1122,7 @@ def forward( use_cache=False, query_length=None, output_attentions=False, + cache_position=None, ): normed_hidden_states = self.layer_norm(hidden_states) attention_output = self.EncDecAttention( @@ -1134,6 +1135,7 @@ def forward( use_cache=use_cache, query_length=query_length, output_attentions=output_attentions, + cache_position=cache_position, ) layer_output = hidden_states + self.dropout(attention_output[0]) outputs = (layer_output,) + attention_output[1:] # add attentions if we output them @@ -1141,7 +1143,7 @@ def forward( class LongT5Block(nn.Module): - def __init__(self, config, has_relative_attention_bias=False): + def __init__(self, config, has_relative_attention_bias=False, layer_idx: Optional[int] = None): super().__init__() self.is_decoder = config.is_decoder if config.is_decoder: @@ -1156,9 +1158,11 @@ def __init__(self, config, has_relative_attention_bias=False): f"but got {config.encoder_attention_type}." 
) self.layer = nn.ModuleList() - self.layer.append(attention_layer(config, has_relative_attention_bias=has_relative_attention_bias)) + self.layer.append( + attention_layer(config, has_relative_attention_bias=has_relative_attention_bias, layer_idx=layer_idx) + ) if self.is_decoder: - self.layer.append(LongT5LayerCrossAttention(config)) + self.layer.append(LongT5LayerCrossAttention(config, layer_idx=layer_idx)) self.layer.append(LongT5LayerFF(config)) @@ -1176,34 +1180,19 @@ def forward( use_cache=False, output_attentions=False, return_dict=True, + cache_position=None, ): - if past_key_value is not None: - if not self.is_decoder: - logger.warning("`past_key_values` is passed to the encoder. Please make sure this is intended.") - expected_num_past_key_values = 2 if encoder_hidden_states is None else 4 - - if len(past_key_value) != expected_num_past_key_values: - raise ValueError( - f"There should be {expected_num_past_key_values} past states. " - f"{'2 (past / key) for cross attention. ' if expected_num_past_key_values == 4 else ''}" - f"Got {len(past_key_value)} past key / value states" - ) - - self_attn_past_key_value = past_key_value[:2] - cross_attn_past_key_value = past_key_value[2:] - else: - self_attn_past_key_value, cross_attn_past_key_value = None, None - self_attention_outputs = self.layer[0]( hidden_states, attention_mask=attention_mask, position_bias=position_bias, layer_head_mask=layer_head_mask, - past_key_value=self_attn_past_key_value, + past_key_value=past_key_value, use_cache=use_cache, output_attentions=output_attentions, + cache_position=cache_position, ) - hidden_states, present_key_value_state = self_attention_outputs[:2] + hidden_states, past_key_value = self_attention_outputs[:2] attention_outputs = self_attention_outputs[2:] # Keep self-attention outputs and relative position weights # clamp inf values to enable fp16 inference - check https://github.com/huggingface/transformers/pull/19229/ @@ -1213,35 +1202,25 @@ def forward( do_cross_attention = self.is_decoder and encoder_hidden_states is not None if do_cross_attention: - # the actual query length is unknown for cross attention - # if using past key value states. 
Need to inject it here - if present_key_value_state is not None: - query_length = present_key_value_state[0].shape[2] - else: - query_length = None - cross_attention_outputs = self.layer[1]( hidden_states, key_value_states=encoder_hidden_states, attention_mask=encoder_attention_mask, position_bias=encoder_decoder_position_bias, layer_head_mask=cross_attn_layer_head_mask, - past_key_value=cross_attn_past_key_value, - query_length=query_length, + past_key_value=past_key_value, + query_length=cache_position[-1] + 1, use_cache=use_cache, output_attentions=output_attentions, + cache_position=cache_position, ) - hidden_states = cross_attention_outputs[0] + hidden_states, past_key_value = cross_attention_outputs[:2] # clamp inf values to enable fp16 inference - check https://github.com/huggingface/transformers/pull/19229/ if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any(): clamp_value = torch.finfo(hidden_states.dtype).max - 1000 hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) - # Combine self attn and cross attn key value states - if present_key_value_state is not None: - present_key_value_state = present_key_value_state + cross_attention_outputs[1] - # Keep cross-attention outputs and relative position weights attention_outputs = attention_outputs + cross_attention_outputs[2:] @@ -1256,7 +1235,7 @@ def forward( outputs = (hidden_states,) if use_cache: - outputs = outputs + (present_key_value_state,) + attention_outputs + outputs = outputs + (past_key_value,) + attention_outputs else: outputs = outputs + attention_outputs @@ -1273,6 +1252,8 @@ class LongT5PreTrainedModel(PreTrainedModel): base_model_prefix = "transformer" supports_gradient_checkpointing = True _no_split_modules = ["LongT5Block"] + _supports_cache_class = True + _supports_static_cache = False # TODO: @raushan more involved due to local/global attn @property # Copied from transformers.models.t5.modeling_t5.T5PreTrainedModel.dummy_inputs @@ -1376,7 +1357,10 @@ def __init__(self, config, embed_tokens=None): self.block_len = self.local_radius + 1 self.block = nn.ModuleList( - [LongT5Block(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)] + [ + LongT5Block(config, has_relative_attention_bias=bool(i == 0), layer_idx=i) + for i in range(config.num_layers) + ] ) self.final_layer_norm = LongT5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) self.dropout = nn.Dropout(config.dropout_rate) @@ -1408,6 +1392,7 @@ def forward( output_attentions=None, output_hidden_states=None, return_dict=None, + cache_position=None, ): use_cache = use_cache if use_cache is not None else self.config.use_cache output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions @@ -1430,36 +1415,65 @@ def forward( err_msg_prefix = "decoder_" if self.is_decoder else "" raise ValueError(f"You have to specify either {err_msg_prefix}input_ids or {err_msg_prefix}inputs_embeds") + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
+ ) + use_cache = False + if inputs_embeds is None: assert self.embed_tokens is not None, "You have to initialize the model with valid token embeddings" inputs_embeds = self.embed_tokens(input_ids) batch_size, seq_length = input_shape - # required mask seq length can be calculated via length of past - mask_seq_length = past_key_values[0][0].shape[2] + seq_length if past_key_values is not None else seq_length - - if use_cache is True: - assert self.is_decoder, f"`use_cache` can only be set to `True` if {self} is used as a decoder" + # initialize past_key_values + return_legacy_cache = False + return_self_attention_cache = False + if self.is_decoder and (use_cache or past_key_values is not None): + if isinstance(past_key_values, Cache) and not isinstance(past_key_values, EncoderDecoderCache): + return_self_attention_cache = True + past_key_values = EncoderDecoderCache(past_key_values, DynamicCache()) + elif not isinstance(past_key_values, EncoderDecoderCache): + return_legacy_cache = True + logger.warning_once( + "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. " + "You should pass an instance of `EncoderDecoderCache` instead, e.g. " + "`past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`." + ) + past_key_values = EncoderDecoderCache.from_legacy_cache(past_key_values) + elif past_key_values is None: + past_key_values = EncoderDecoderCache(DynamicCache(), DynamicCache()) + elif not self.is_decoder: + # do not pass cache object down the line for encoder stack + # it messes indexing later in decoder-stack because cache object is modified in-place + past_key_values = None + + past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0 + if cache_position is None: + cache_position = torch.arange( + past_key_values_length, past_key_values_length + seq_length, device=inputs_embeds.device + ) - if attention_mask is None: + if attention_mask is None and not is_torchdynamo_compiling(): + # required mask seq length can be calculated via length of past + mask_seq_length = past_key_values_length + seq_length attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device) - # initialize past_key_values with `None` if past does not exist - if past_key_values is None: - past_key_values = [None] * len(self.block) - - # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] - # ourselves in which case we just need to make it broadcastable to all heads. 
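# Illustrative sketch (not part of the diff): the cache initialization above swaps the legacy
# tuple-of-tuples `past_key_values` for an `EncoderDecoderCache` wrapping two `DynamicCache`
# objects (self-attention and cross-attention) and derives `cache_position` from the cached
# length. A minimal standalone version of that flow, assuming the public
# `transformers.cache_utils` API used in the hunk above; shapes and the layer index are
# invented for the example.
import torch
from transformers.cache_utils import DynamicCache, EncoderDecoderCache

cache = EncoderDecoderCache(DynamicCache(), DynamicCache())

# prefill step: nothing is cached yet, so cache_position covers the whole decoder prompt
seq_length = 3
past_length = cache.get_seq_length()                                  # 0
cache_position = torch.arange(past_length, past_length + seq_length)  # tensor([0, 1, 2])

# one layer stores its self-attention key/value states (batch, heads, seq, head_dim)
keys = values = torch.zeros(1, 8, seq_length, 64)
cache.self_attention_cache.update(keys, values, layer_idx=0, cache_kwargs={"cache_position": cache_position})

# next decoding step: a single new token, positions continue from the cached length
past_length = cache.get_seq_length()                                  # 3
cache_position = torch.arange(past_length, past_length + 1)           # tensor([3])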
- # We use local attention in encoder self-attention, otherwise standard self & cross attentions are used if self.is_decoder: - extended_attention_mask = self.get_extended_attention_mask( - attention_mask, input_shape, inputs_embeds.device + causal_mask = self._update_causal_mask( + attention_mask, + inputs_embeds, + cache_position, + past_key_values.self_attention_cache if past_key_values is not None else None, + output_attentions, ) + # We use local attention in encoder self-attention, otherwise standard self & cross attentions are used elif self.config.encoder_attention_type == "local": - extended_attention_mask = _get_local_attention_mask(attention_mask, self.block_len, inputs_embeds.device) + causal_mask = _get_local_attention_mask(attention_mask, self.block_len, inputs_embeds.device) else: # we need to use both local attention mask and standard extended mask for transient-global attention - extended_attention_mask = attention_mask + causal_mask = attention_mask # If a 2D or 3D attention mask is provided for the cross-attention # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] @@ -1472,17 +1486,9 @@ def forward( else: encoder_extended_attention_mask = None - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning_once( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." - ) - use_cache = False - # Prepare head mask if needed head_mask = self.get_head_mask(head_mask, self.config.num_layers) cross_attn_head_mask = self.get_head_mask(cross_attn_head_mask, self.config.num_layers) - present_key_value_states = () if use_cache else None all_hidden_states = () if output_hidden_states else None all_attentions = () if output_attentions else None all_cross_attentions = () if (output_attentions and self.is_decoder) else None @@ -1491,7 +1497,7 @@ def forward( hidden_states = self.dropout(inputs_embeds) - for i, (layer_module, past_key_value) in enumerate(zip(self.block, past_key_values)): + for i, layer_module in enumerate(self.block): layer_head_mask = head_mask[i] cross_attn_layer_head_mask = cross_attn_head_mask[i] @@ -1502,7 +1508,7 @@ def forward( layer_outputs = self._gradient_checkpointing_func( layer_module.forward, hidden_states, - extended_attention_mask, + causal_mask, position_bias, encoder_hidden_states, encoder_extended_attention_mask, @@ -1512,20 +1518,24 @@ def forward( None, # past_key_value is always None with gradient checkpointing use_cache, output_attentions, + return_dict, + cache_position, ) else: layer_outputs = layer_module( hidden_states, - attention_mask=extended_attention_mask, + attention_mask=causal_mask, position_bias=position_bias, encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_extended_attention_mask, encoder_decoder_position_bias=encoder_decoder_position_bias, layer_head_mask=layer_head_mask, cross_attn_layer_head_mask=cross_attn_layer_head_mask, - past_key_value=past_key_value, + past_key_value=past_key_values, use_cache=use_cache, output_attentions=output_attentions, + return_dict=return_dict, + cache_position=cache_position, ) # layer_outputs is a tuple with: @@ -1533,7 +1543,7 @@ def forward( if use_cache is False: layer_outputs = layer_outputs[:1] + (None,) + layer_outputs[1:] - hidden_states, present_key_value_state = layer_outputs[:2] + hidden_states, next_decoder_cache = layer_outputs[:2] # We share the position biases between the layers - the first layer store them # layer_outputs = hidden-states, key-value-states 
(self-attention position bias), (self-attention weights), @@ -1541,9 +1551,6 @@ def forward( position_bias = layer_outputs[2] if self.is_decoder and encoder_hidden_states is not None: encoder_decoder_position_bias = layer_outputs[4 if output_attentions else 3] - # append next layer key value states - if use_cache: - present_key_value_states = present_key_value_states + (present_key_value_state,) if output_attentions: all_attentions = all_attentions + (layer_outputs[3],) @@ -1557,12 +1564,18 @@ def forward( if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) + next_cache = next_decoder_cache if use_cache else None + if return_self_attention_cache: + next_cache = past_key_values.self_attention_cache + if return_legacy_cache: + next_cache = past_key_values.to_legacy_cache() + if not return_dict: return tuple( v for v in [ hidden_states, - present_key_value_states, + next_cache, all_hidden_states, all_attentions, all_cross_attentions, @@ -1571,12 +1584,135 @@ def forward( ) return BaseModelOutputWithPastAndCrossAttentions( last_hidden_state=hidden_states, - past_key_values=present_key_value_states, + past_key_values=next_cache, hidden_states=all_hidden_states, attentions=all_attentions, cross_attentions=all_cross_attentions, ) + # Copied from transformers.models.llama.modeling_llama.LlamaModel._update_causal_mask + def _update_causal_mask( + self, + attention_mask: torch.Tensor, + input_tensor: torch.Tensor, + cache_position: torch.Tensor, + past_key_values: Cache, + output_attentions: bool, + ): + if self.config._attn_implementation == "flash_attention_2": + if attention_mask is not None and 0.0 in attention_mask: + return attention_mask + return None + + # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in + # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail + # to infer the attention mask. + past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + using_static_cache = isinstance(past_key_values, StaticCache) + + # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward + if self.config._attn_implementation == "sdpa" and not using_static_cache and not output_attentions: + if AttentionMaskConverter._ignore_causal_mask_sdpa( + attention_mask, + inputs_embeds=input_tensor, + past_key_values_length=past_seen_tokens, + is_training=self.training, + ): + return None + + dtype, device = input_tensor.dtype, input_tensor.device + sequence_length = input_tensor.shape[1] + if using_static_cache: + target_length = past_key_values.get_max_cache_shape() + else: + target_length = ( + attention_mask.shape[-1] + if isinstance(attention_mask, torch.Tensor) + else past_seen_tokens + sequence_length + 1 + ) + + # In case the provided `attention` mask is 2D, we generate a causal mask here (4D). + causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position( + attention_mask, + sequence_length=sequence_length, + target_length=target_length, + dtype=dtype, + device=device, + cache_position=cache_position, + batch_size=input_tensor.shape[0], + ) + + if ( + self.config._attn_implementation == "sdpa" + and attention_mask is not None + and attention_mask.device.type == "cuda" + and not output_attentions + ): + # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when + # using left padding. 
This is required by F.scaled_dot_product_attention memory-efficient attention path. + # Details: https://github.com/pytorch/pytorch/issues/110213 + min_dtype = torch.finfo(dtype).min + causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype) + + return causal_mask + + @staticmethod + # Copied from transformers.models.llama.modeling_llama.LlamaPreTrainedModel._prepare_4d_causal_attention_mask_with_cache_position + def _prepare_4d_causal_attention_mask_with_cache_position( + attention_mask: torch.Tensor, + sequence_length: int, + target_length: int, + dtype: torch.dtype, + device: torch.device, + cache_position: torch.Tensor, + batch_size: int, + **kwargs, + ): + """ + Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape + `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing. + + Args: + attention_mask (`torch.Tensor`): + A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape + `(batch_size, 1, query_length, key_value_length)`. + sequence_length (`int`): + The sequence length being processed. + target_length (`int`): + The target length: when generating with static cache, the mask should be as long as the static cache, + to account for the 0 padding, the part of the cache that is not filled yet. + dtype (`torch.dtype`): + The dtype to use for the 4D attention mask. + device (`torch.device`): + The device to plcae the 4D attention mask on. + cache_position (`torch.Tensor`): + Indices depicting the position of the input sequence tokens in the sequence. + batch_size (`torch.Tensor`): + Batch size. + """ + if attention_mask is not None and attention_mask.dim() == 4: + # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing. + causal_mask = attention_mask + else: + min_dtype = torch.finfo(dtype).min + causal_mask = torch.full( + (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device + ) + if sequence_length != 1: + causal_mask = torch.triu(causal_mask, diagonal=1) + causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1) + causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1) + if attention_mask is not None: + causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit + mask_length = attention_mask.shape[-1] + padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :] + padding_mask = padding_mask == 0 + causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill( + padding_mask, min_dtype + ) + + return causal_mask + LONGT5_START_DOCSTRING = r""" @@ -1693,6 +1829,9 @@ def forward( more detail. return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): + Indices depicting the position of the input sequence tokens in the sequence. It is used to update the + cache in the correct position and to infer the complete sequence length. 
""" LONGT5_ENCODER_INPUTS_DOCSTRING = r""" @@ -1817,6 +1956,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, ) -> Union[Tuple[torch.FloatTensor], Seq2SeqModelOutput]: r""" Returns: @@ -1883,6 +2023,7 @@ def forward( output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, + cache_position=cache_position, ) if not return_dict: @@ -1975,6 +2116,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, ) -> Union[Tuple[torch.FloatTensor], Seq2SeqLMOutput]: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): @@ -2050,6 +2192,7 @@ def forward( output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, + cache_position=cache_position, ) sequence_output = decoder_outputs[0] diff --git a/src/transformers/models/mt5/configuration_mt5.py b/src/transformers/models/mt5/configuration_mt5.py index ef629718b1b591..267179f81247e8 100644 --- a/src/transformers/models/mt5/configuration_mt5.py +++ b/src/transformers/models/mt5/configuration_mt5.py @@ -72,7 +72,12 @@ class MT5Config(PretrainedConfig): model_type = "mt5" keys_to_ignore_at_inference = ["past_key_values"] - attribute_map = {"hidden_size": "d_model", "num_attention_heads": "num_heads", "num_hidden_layers": "num_layers"} + attribute_map = { + "hidden_size": "d_model", + "num_attention_heads": "num_heads", + "num_hidden_layers": "num_layers", + "head_dim": "d_kv", + } def __init__( self, diff --git a/src/transformers/models/mt5/modeling_mt5.py b/src/transformers/models/mt5/modeling_mt5.py index 9051414d7414fa..659a84c5fe3784 100644 --- a/src/transformers/models/mt5/modeling_mt5.py +++ b/src/transformers/models/mt5/modeling_mt5.py @@ -25,7 +25,9 @@ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from ...activations import ACT2FN +from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache, StaticCache from ...generation import GenerationMixin +from ...modeling_attn_mask_utils import AttentionMaskConverter from ...modeling_outputs import ( BaseModelOutput, BaseModelOutputWithPastAndCrossAttentions, @@ -43,6 +45,7 @@ add_start_docstrings, add_start_docstrings_to_model_forward, is_torch_fx_proxy, + is_torchdynamo_compiling, logging, replace_return_docstrings, ) @@ -214,7 +217,12 @@ def forward(self, hidden_states): # Copied from transformers.models.t5.modeling_t5.T5Attention with T5->MT5 class MT5Attention(nn.Module): - def __init__(self, config: MT5Config, has_relative_attention_bias=False): + def __init__( + self, + config: MT5Config, + has_relative_attention_bias=False, + layer_idx: Optional[int] = None, + ): super().__init__() self.is_decoder = config.is_decoder self.has_relative_attention_bias = has_relative_attention_bias @@ -225,6 +233,13 @@ def __init__(self, config: MT5Config, has_relative_attention_bias=False): self.n_heads = config.num_heads self.dropout = config.dropout_rate self.inner_dim = self.n_heads * self.key_value_proj_dim + self.layer_idx = layer_idx + if layer_idx is None and self.is_decoder: + logger.warning_once( + f"Instantiating a decoder {self.__class__.__name__} without passing `layer_idx` is not recommended and " + "will to errors during the forward call, if caching is used. 
Please make sure to provide a `layer_idx` " + "when creating this class." + ) # Mesh TensorFlow initialization to avoid scaling before softmax self.q = nn.Linear(self.d_model, self.inner_dim, bias=False) @@ -301,11 +316,14 @@ def _relative_position_bucket(relative_position, bidirectional=True, num_buckets relative_buckets += torch.where(is_small, relative_position, relative_position_if_large) return relative_buckets - def compute_bias(self, query_length, key_length, device=None): + def compute_bias(self, query_length, key_length, device=None, cache_position=None): """Compute binned relative position bias""" if device is None: device = self.relative_attention_bias.weight.device - context_position = torch.arange(query_length, dtype=torch.long, device=device)[:, None] + if cache_position is None: + context_position = torch.arange(query_length, dtype=torch.long, device=device)[:, None] + else: + context_position = cache_position[:, None].to(device) memory_position = torch.arange(key_length, dtype=torch.long, device=device)[None, :] relative_position = memory_position - context_position # shape (query_length, key_length) relative_position_bucket = self._relative_position_bucket( @@ -329,94 +347,72 @@ def forward( query_length=None, use_cache=False, output_attentions=False, + cache_position=None, ): """ Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states). """ # Input is (batch_size, seq_length, dim) - # Mask is (batch_size, key_length) (non-causal) or (batch_size, key_length, key_length) - # past_key_value[0] is (batch_size, n_heads, q_len - 1, dim_per_head) + # Mask is (batch_size, 1, 1, key_length) (non-causal encoder) or (batch_size, 1, seq_length, key_length) (causal decoder) batch_size, seq_length = hidden_states.shape[:2] - real_seq_length = seq_length + # if key_value_states are provided this layer is used as a cross-attention layer for the decoder + is_cross_attention = key_value_states is not None - if past_key_value is not None: - if len(past_key_value) != 2: - raise ValueError( - f"past_key_value should have 2 past states: keys and values. 
Got { len(past_key_value)} past states" - ) - real_seq_length += past_key_value[0].shape[2] if query_length is None else query_length - - key_length = real_seq_length if key_value_states is None else key_value_states.shape[1] - - def shape(states): - """projection""" - return states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) + query_states = self.q(hidden_states) + query_states = query_states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) - def unshape(states): - """reshape""" - return states.transpose(1, 2).contiguous().view(batch_size, -1, self.inner_dim) + if past_key_value is not None: + is_updated = past_key_value.is_updated.get(self.layer_idx) + if is_cross_attention: + # after the first generated id, we can subsequently re-use all key/value_states from cache + curr_past_key_value = past_key_value.cross_attention_cache + else: + curr_past_key_value = past_key_value.self_attention_cache - def project(hidden_states, proj_layer, key_value_states, past_key_value): - """projects hidden states correctly to key/query states""" - if key_value_states is None: - # self-attn - # (batch_size, n_heads, seq_length, dim_per_head) - hidden_states = shape(proj_layer(hidden_states)) - elif past_key_value is None: - # cross-attn - # (batch_size, n_heads, seq_length, dim_per_head) - hidden_states = shape(proj_layer(key_value_states)) + current_states = key_value_states if is_cross_attention else hidden_states + if is_cross_attention and past_key_value is not None and is_updated: + # reuse k,v, cross_attentions + key_states = curr_past_key_value.key_cache[self.layer_idx] + value_states = curr_past_key_value.value_cache[self.layer_idx] + else: + key_states = self.k(current_states) + value_states = self.v(current_states) + key_states = key_states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) + value_states = value_states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) if past_key_value is not None: - if key_value_states is None: - # self-attn - # (batch_size, n_heads, key_length, dim_per_head) - hidden_states = torch.cat([past_key_value, hidden_states], dim=2) - elif past_key_value.shape[2] != key_value_states.shape[1]: - # checking that the `sequence_length` of the `past_key_value` is the same as - # the provided `key_value_states` to support prefix tuning - # cross-attn - # (batch_size, n_heads, seq_length, dim_per_head) - hidden_states = shape(proj_layer(key_value_states)) - else: - # cross-attn - hidden_states = past_key_value - return hidden_states - - # get query states - query_states = shape(self.q(hidden_states)) # (batch_size, n_heads, seq_length, dim_per_head) - - # get key/value states - key_states = project( - hidden_states, self.k, key_value_states, past_key_value[0] if past_key_value is not None else None - ) - value_states = project( - hidden_states, self.v, key_value_states, past_key_value[1] if past_key_value is not None else None - ) + # save all key/value_states to cache to be re-used for fast auto-regressive generation + cache_position = cache_position if not is_cross_attention else None + key_states, value_states = curr_past_key_value.update( + key_states, value_states, self.layer_idx, {"cache_position": cache_position} + ) + # set flag that curr layer for cross-attn is already updated so we can re-use in subsequent calls + if is_cross_attention: + past_key_value.is_updated[self.layer_idx] = True - # compute scores - scores = torch.matmul( - query_states, 
key_states.transpose(3, 2) - ) # equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states), compatible with onnx op>9 + # compute scores, equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states), compatible with onnx op>9 + scores = torch.matmul(query_states, key_states.transpose(3, 2)) if position_bias is None: + key_length = key_states.shape[-2] + # cache position is 0-indexed so we add 1 to get the real length of queries (aka with past) + real_seq_length = query_length if query_length is not None else cache_position[-1] + 1 if not self.has_relative_attention_bias: position_bias = torch.zeros( - (1, self.n_heads, real_seq_length, key_length), device=scores.device, dtype=scores.dtype + (1, self.n_heads, seq_length, key_length), device=scores.device, dtype=scores.dtype ) if self.gradient_checkpointing and self.training: position_bias.requires_grad = True else: - position_bias = self.compute_bias(real_seq_length, key_length, device=scores.device) - - # if key and values are already calculated - # we want only the last query position bias - if past_key_value is not None: - position_bias = position_bias[:, :, -hidden_states.size(1) :, :] + position_bias = self.compute_bias( + real_seq_length, key_length, device=scores.device, cache_position=cache_position + ) + position_bias = position_bias[:, :, -seq_length:, :] if mask is not None: - position_bias = position_bias + mask # (batch_size, n_heads, seq_length, key_length) + causal_mask = mask[:, :, :, : key_states.shape[-2]] + position_bias = position_bias + causal_mask if self.pruned_heads: mask = torch.ones(position_bias.shape[1]) @@ -426,22 +422,22 @@ def project(hidden_states, proj_layer, key_value_states, past_key_value): position_bias_masked = position_bias scores += position_bias_masked - attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as( - scores - ) # (batch_size, n_heads, seq_length, key_length) - attn_weights = nn.functional.dropout( - attn_weights, p=self.dropout, training=self.training - ) # (batch_size, n_heads, seq_length, key_length) + + # (batch_size, n_heads, seq_length, key_length) + attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as(scores) + attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) # Mask heads if we want to if layer_head_mask is not None: attn_weights = attn_weights * layer_head_mask - attn_output = unshape(torch.matmul(attn_weights, value_states)) # (batch_size, seq_length, dim) + attn_output = torch.matmul(attn_weights, value_states) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.view(batch_size, -1, self.inner_dim) attn_output = self.o(attn_output) - present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None - outputs = (attn_output,) + (present_key_value_state,) + (position_bias,) + outputs = (attn_output, past_key_value, position_bias) if output_attentions: outputs = outputs + (attn_weights,) @@ -450,9 +446,11 @@ def project(hidden_states, proj_layer, key_value_states, past_key_value): # Copied from transformers.models.t5.modeling_t5.T5LayerSelfAttention with T5->MT5 class MT5LayerSelfAttention(nn.Module): - def __init__(self, config, has_relative_attention_bias=False): + def __init__(self, config, has_relative_attention_bias=False, layer_idx: Optional[int] = None): super().__init__() - self.SelfAttention = MT5Attention(config, has_relative_attention_bias=has_relative_attention_bias) + self.SelfAttention = MT5Attention( + 
config, has_relative_attention_bias=has_relative_attention_bias, layer_idx=layer_idx + ) self.layer_norm = MT5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) self.dropout = nn.Dropout(config.dropout_rate) @@ -465,6 +463,7 @@ def forward( past_key_value=None, use_cache=False, output_attentions=False, + cache_position=None, ): normed_hidden_states = self.layer_norm(hidden_states) attention_output = self.SelfAttention( @@ -475,6 +474,7 @@ def forward( past_key_value=past_key_value, use_cache=use_cache, output_attentions=output_attentions, + cache_position=cache_position, ) hidden_states = hidden_states + self.dropout(attention_output[0]) outputs = (hidden_states,) + attention_output[1:] # add attentions if we output them @@ -483,9 +483,9 @@ def forward( # Copied from transformers.models.t5.modeling_t5.T5LayerCrossAttention with T5->MT5 class MT5LayerCrossAttention(nn.Module): - def __init__(self, config): + def __init__(self, config, layer_idx: Optional[int] = None): super().__init__() - self.EncDecAttention = MT5Attention(config, has_relative_attention_bias=False) + self.EncDecAttention = MT5Attention(config, has_relative_attention_bias=False, layer_idx=layer_idx) self.layer_norm = MT5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) self.dropout = nn.Dropout(config.dropout_rate) @@ -500,6 +500,7 @@ def forward( use_cache=False, query_length=None, output_attentions=False, + cache_position=None, ): normed_hidden_states = self.layer_norm(hidden_states) attention_output = self.EncDecAttention( @@ -512,6 +513,7 @@ def forward( use_cache=use_cache, query_length=query_length, output_attentions=output_attentions, + cache_position=cache_position, ) layer_output = hidden_states + self.dropout(attention_output[0]) outputs = (layer_output,) + attention_output[1:] # add attentions if we output them @@ -520,13 +522,15 @@ def forward( # Copied from transformers.models.t5.modeling_t5.T5Block with T5->MT5 class MT5Block(nn.Module): - def __init__(self, config, has_relative_attention_bias=False): + def __init__(self, config, has_relative_attention_bias=False, layer_idx: Optional[int] = None): super().__init__() self.is_decoder = config.is_decoder self.layer = nn.ModuleList() - self.layer.append(MT5LayerSelfAttention(config, has_relative_attention_bias=has_relative_attention_bias)) + self.layer.append( + MT5LayerSelfAttention(config, has_relative_attention_bias=has_relative_attention_bias, layer_idx=layer_idx) + ) if self.is_decoder: - self.layer.append(MT5LayerCrossAttention(config)) + self.layer.append(MT5LayerCrossAttention(config, layer_idx=layer_idx)) self.layer.append(MT5LayerFF(config)) @@ -544,34 +548,19 @@ def forward( use_cache=False, output_attentions=False, return_dict=True, + cache_position=None, ): - if past_key_value is not None: - if not self.is_decoder: - logger.warning("`past_key_values` is passed to the encoder. Please make sure this is intended.") - expected_num_past_key_values = 2 if encoder_hidden_states is None else 4 - - if len(past_key_value) != expected_num_past_key_values: - raise ValueError( - f"There should be {expected_num_past_key_values} past states. " - f"{'2 (key / value) for cross attention. 
' if expected_num_past_key_values == 4 else ''}" - f"Got {len(past_key_value)} past key / value states" - ) - - self_attn_past_key_value = past_key_value[:2] - cross_attn_past_key_value = past_key_value[2:] - else: - self_attn_past_key_value, cross_attn_past_key_value = None, None - self_attention_outputs = self.layer[0]( hidden_states, attention_mask=attention_mask, position_bias=position_bias, layer_head_mask=layer_head_mask, - past_key_value=self_attn_past_key_value, + past_key_value=past_key_value, use_cache=use_cache, output_attentions=output_attentions, + cache_position=cache_position, ) - hidden_states, present_key_value_state = self_attention_outputs[:2] + hidden_states, past_key_value = self_attention_outputs[:2] attention_outputs = self_attention_outputs[2:] # Keep self-attention outputs and relative position weights # clamp inf values to enable fp16 training @@ -585,25 +574,18 @@ def forward( do_cross_attention = self.is_decoder and encoder_hidden_states is not None if do_cross_attention: - # the actual query length is unknown for cross attention - # if using past key value states. Need to inject it here - if present_key_value_state is not None: - query_length = present_key_value_state[0].shape[2] - else: - query_length = None - cross_attention_outputs = self.layer[1]( hidden_states, key_value_states=encoder_hidden_states, attention_mask=encoder_attention_mask, position_bias=encoder_decoder_position_bias, layer_head_mask=cross_attn_layer_head_mask, - past_key_value=cross_attn_past_key_value, - query_length=query_length, + past_key_value=past_key_value, + query_length=cache_position[-1] + 1, use_cache=use_cache, output_attentions=output_attentions, ) - hidden_states = cross_attention_outputs[0] + hidden_states, past_key_value = cross_attention_outputs[:2] # clamp inf values to enable fp16 training if hidden_states.dtype == torch.float16: @@ -614,10 +596,6 @@ def forward( ) hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) - # Combine self attn and cross attn key value states - if present_key_value_state is not None: - present_key_value_state = present_key_value_state + cross_attention_outputs[1] - # Keep cross-attention outputs and relative position weights attention_outputs = attention_outputs + cross_attention_outputs[2:] @@ -636,11 +614,11 @@ def forward( outputs = (hidden_states,) if use_cache: - outputs = outputs + (present_key_value_state,) + attention_outputs + outputs = outputs + (past_key_value,) + attention_outputs else: outputs = outputs + attention_outputs - return outputs # hidden-states, present_key_value_states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) + return outputs # hidden-states, past_key_value, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) def load_tf_weights_in_mt5(model, config, tf_checkpoint_path): @@ -780,6 +758,9 @@ class MT5PreTrainedModel(PreTrainedModel): base_model_prefix = "transformer" is_parallelizable = True supports_gradient_checkpointing = True + _supports_quantized_cache = False # enc-dec models don't support yet + _supports_static_cache = True + _supports_cache_class = True _no_split_modules = ["MT5Block"] _keep_in_fp32_modules = ["wo"] @@ -892,7 +873,7 @@ def __init__(self, config, embed_tokens=None): self.is_decoder = config.is_decoder self.block = nn.ModuleList( - [MT5Block(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)] + 
[MT5Block(config, has_relative_attention_bias=bool(i == 0), layer_idx=i) for i in range(config.num_layers)] ) self.final_layer_norm = MT5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) self.dropout = nn.Dropout(config.dropout_rate) @@ -968,6 +949,7 @@ def forward( output_attentions=None, output_hidden_states=None, return_dict=None, + cache_position=None, ): # Model parallel if self.model_parallel: @@ -994,6 +976,13 @@ def forward( err_msg_prefix = "decoder_" if self.is_decoder else "" raise ValueError(f"You have to specify either {err_msg_prefix}input_ids or {err_msg_prefix}inputs_embeds") + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + if inputs_embeds is None: if self.embed_tokens is None: raise ValueError("You have to initialize the model with valid token embeddings") @@ -1001,23 +990,57 @@ def forward( batch_size, seq_length = input_shape - # required mask seq length can be calculated via length of past - mask_seq_length = past_key_values[0][0].shape[2] + seq_length if past_key_values is not None else seq_length - if use_cache is True: if not self.is_decoder: raise ValueError(f"`use_cache` can only be set to `True` if {self} is used as a decoder") - # initialize past_key_values with `None` if past does not exist - if past_key_values is None: - past_key_values = [None] * len(self.block) + # initialize past_key_values + return_legacy_cache = False + return_self_attention_cache = False + if self.is_decoder and (use_cache or past_key_values is not None): + if isinstance(past_key_values, Cache) and not isinstance(past_key_values, EncoderDecoderCache): + return_self_attention_cache = True + past_key_values = EncoderDecoderCache(past_key_values, DynamicCache()) + elif not isinstance(past_key_values, EncoderDecoderCache): + return_legacy_cache = True + logger.warning_once( + "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. " + "You should pass an instance of `EncoderDecoderCache` instead, e.g. " + "`past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`." + ) + past_key_values = EncoderDecoderCache.from_legacy_cache(past_key_values) + elif past_key_values is None: + past_key_values = EncoderDecoderCache(DynamicCache(), DynamicCache()) + elif not self.is_decoder: + # do not pass cache object down the line for encoder stack + # it messes indexing later in decoder-stack because cache object is modified in-place + past_key_values = None + + past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0 + if cache_position is None: + cache_position = torch.arange( + past_key_values_length, past_key_values_length + seq_length, device=inputs_embeds.device + ) - if attention_mask is None: + if attention_mask is None and not is_torchdynamo_compiling(): + # required mask seq length can be calculated via length of past cache + mask_seq_length = past_key_values_length + seq_length attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device) - # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] - # ourselves in which case we just need to make it broadcastable to all heads. 
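# Illustrative sketch (not part of the diff): the `cache_position` tensor built above is also what
# the reworked `compute_bias(..., cache_position=...)` receives as the query positions, so during
# incremental decoding the relative-position bias is computed only for the new token rather than
# re-derived from a growing `real_seq_length`. Worked numbers, purely for illustration:
import torch

cache_position = torch.tensor([3])                      # decoding the 4th token, 3 tokens cached
key_length = 4                                          # cached keys plus the new one

context_position = cache_position[:, None]              # tensor([[3]])
memory_position = torch.arange(key_length)[None, :]     # tensor([[0, 1, 2, 3]])
relative_position = memory_position - context_position  # tensor([[-3, -2, -1,  0]])
# after bucketing, the resulting bias has query length 1 and is already aligned with the single
# query row, so the `position_bias[:, :, -seq_length:, :]` slice in the attention forward is a
# no-op at this step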
- extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape) + if self.config.is_decoder: + causal_mask = self._update_causal_mask( + attention_mask, + inputs_embeds, + cache_position, + past_key_values.self_attention_cache if past_key_values is not None else None, + output_attentions, + ) + elif attention_mask is not None: + causal_mask = attention_mask[:, None, None, :] + causal_mask = causal_mask.to(dtype=inputs_embeds.dtype) + causal_mask = (1.0 - causal_mask) * torch.finfo(inputs_embeds.dtype).min + else: + causal_mask = None # If a 2D or 3D attention mask is provided for the cross-attention # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] @@ -1032,17 +1055,9 @@ def forward( else: encoder_extended_attention_mask = None - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning_once( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." - ) - use_cache = False - # Prepare head mask if needed head_mask = self.get_head_mask(head_mask, self.config.num_layers) cross_attn_head_mask = self.get_head_mask(cross_attn_head_mask, self.config.num_layers) - present_key_value_states = () if use_cache else None all_hidden_states = () if output_hidden_states else None all_attentions = () if output_attentions else None all_cross_attentions = () if (output_attentions and self.is_decoder) else None @@ -1051,15 +1066,15 @@ def forward( hidden_states = self.dropout(inputs_embeds) - for i, (layer_module, past_key_value) in enumerate(zip(self.block, past_key_values)): + for i, layer_module in enumerate(self.block): layer_head_mask = head_mask[i] cross_attn_layer_head_mask = cross_attn_head_mask[i] # Model parallel if self.model_parallel: torch.cuda.set_device(hidden_states.device) # Ensure that attention_mask is always on the same device as hidden_states - if attention_mask is not None: - attention_mask = attention_mask.to(hidden_states.device) + if causal_mask is not None: + causal_mask = causal_mask.to(hidden_states.device) if position_bias is not None: position_bias = position_bias.to(hidden_states.device) if encoder_hidden_states is not None: @@ -1079,7 +1094,7 @@ def forward( layer_outputs = self._gradient_checkpointing_func( layer_module.forward, hidden_states, - extended_attention_mask, + causal_mask, position_bias, encoder_hidden_states, encoder_extended_attention_mask, @@ -1089,20 +1104,24 @@ def forward( None, # past_key_value is always None with gradient checkpointing use_cache, output_attentions, + return_dict, + cache_position, ) else: layer_outputs = layer_module( hidden_states, - attention_mask=extended_attention_mask, + attention_mask=causal_mask, position_bias=position_bias, encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_extended_attention_mask, encoder_decoder_position_bias=encoder_decoder_position_bias, layer_head_mask=layer_head_mask, cross_attn_layer_head_mask=cross_attn_layer_head_mask, - past_key_value=past_key_value, + past_key_value=past_key_values, use_cache=use_cache, output_attentions=output_attentions, + return_dict=return_dict, + cache_position=cache_position, ) # layer_outputs is a tuple with: @@ -1110,7 +1129,7 @@ def forward( if use_cache is False: layer_outputs = layer_outputs[:1] + (None,) + layer_outputs[1:] - hidden_states, present_key_value_state = layer_outputs[:2] + hidden_states, next_decoder_cache = layer_outputs[:2] # We share the position biases between the layers - the first layer store them # layer_outputs 
= hidden-states, key-value-states (self-attention position bias), (self-attention weights), @@ -1118,9 +1137,6 @@ def forward( position_bias = layer_outputs[2] if self.is_decoder and encoder_hidden_states is not None: encoder_decoder_position_bias = layer_outputs[4 if output_attentions else 3] - # append next layer key value states - if use_cache: - present_key_value_states = present_key_value_states + (present_key_value_state,) if output_attentions: all_attentions = all_attentions + (layer_outputs[3],) @@ -1140,12 +1156,18 @@ def forward( if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) + next_cache = next_decoder_cache if use_cache else None + if return_self_attention_cache: + next_cache = past_key_values.self_attention_cache + if return_legacy_cache: + next_cache = past_key_values.to_legacy_cache() + if not return_dict: return tuple( v for v in [ hidden_states, - present_key_value_states, + next_cache, all_hidden_states, all_attentions, all_cross_attentions, @@ -1154,12 +1176,135 @@ def forward( ) return BaseModelOutputWithPastAndCrossAttentions( last_hidden_state=hidden_states, - past_key_values=present_key_value_states, + past_key_values=next_cache, hidden_states=all_hidden_states, attentions=all_attentions, cross_attentions=all_cross_attentions, ) + # Copied from transformers.models.llama.modeling_llama.LlamaModel._update_causal_mask + def _update_causal_mask( + self, + attention_mask: torch.Tensor, + input_tensor: torch.Tensor, + cache_position: torch.Tensor, + past_key_values: Cache, + output_attentions: bool, + ): + if self.config._attn_implementation == "flash_attention_2": + if attention_mask is not None and 0.0 in attention_mask: + return attention_mask + return None + + # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in + # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail + # to infer the attention mask. + past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + using_static_cache = isinstance(past_key_values, StaticCache) + + # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward + if self.config._attn_implementation == "sdpa" and not using_static_cache and not output_attentions: + if AttentionMaskConverter._ignore_causal_mask_sdpa( + attention_mask, + inputs_embeds=input_tensor, + past_key_values_length=past_seen_tokens, + is_training=self.training, + ): + return None + + dtype, device = input_tensor.dtype, input_tensor.device + sequence_length = input_tensor.shape[1] + if using_static_cache: + target_length = past_key_values.get_max_cache_shape() + else: + target_length = ( + attention_mask.shape[-1] + if isinstance(attention_mask, torch.Tensor) + else past_seen_tokens + sequence_length + 1 + ) + + # In case the provided `attention` mask is 2D, we generate a causal mask here (4D). + causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position( + attention_mask, + sequence_length=sequence_length, + target_length=target_length, + dtype=dtype, + device=device, + cache_position=cache_position, + batch_size=input_tensor.shape[0], + ) + + if ( + self.config._attn_implementation == "sdpa" + and attention_mask is not None + and attention_mask.device.type == "cuda" + and not output_attentions + ): + # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when + # using left padding. 
This is required by F.scaled_dot_product_attention memory-efficient attention path. + # Details: https://github.com/pytorch/pytorch/issues/110213 + min_dtype = torch.finfo(dtype).min + causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype) + + return causal_mask + + @staticmethod + # Copied from transformers.models.llama.modeling_llama.LlamaPreTrainedModel._prepare_4d_causal_attention_mask_with_cache_position + def _prepare_4d_causal_attention_mask_with_cache_position( + attention_mask: torch.Tensor, + sequence_length: int, + target_length: int, + dtype: torch.dtype, + device: torch.device, + cache_position: torch.Tensor, + batch_size: int, + **kwargs, + ): + """ + Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape + `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing. + + Args: + attention_mask (`torch.Tensor`): + A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape + `(batch_size, 1, query_length, key_value_length)`. + sequence_length (`int`): + The sequence length being processed. + target_length (`int`): + The target length: when generating with static cache, the mask should be as long as the static cache, + to account for the 0 padding, the part of the cache that is not filled yet. + dtype (`torch.dtype`): + The dtype to use for the 4D attention mask. + device (`torch.device`): + The device to plcae the 4D attention mask on. + cache_position (`torch.Tensor`): + Indices depicting the position of the input sequence tokens in the sequence. + batch_size (`torch.Tensor`): + Batch size. + """ + if attention_mask is not None and attention_mask.dim() == 4: + # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing. 
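# Clarifying note (added comment, not part of the diff): "inverted form" here means an additive
# float mask with 0.0 at positions that may be attended to and torch.finfo(dtype).min at blocked
# positions, i.e. the same convention the `else` branch below produces via its `min_dtype` fill,
# so a user-supplied 4D mask is passed through unchanged.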
+ causal_mask = attention_mask + else: + min_dtype = torch.finfo(dtype).min + causal_mask = torch.full( + (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device + ) + if sequence_length != 1: + causal_mask = torch.triu(causal_mask, diagonal=1) + causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1) + causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1) + if attention_mask is not None: + causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit + mask_length = attention_mask.shape[-1] + padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :] + padding_mask = padding_mask == 0 + causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill( + padding_mask, min_dtype + ) + + return causal_mask + MT5_START_DOCSTRING = r""" @@ -1454,6 +1599,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, ) -> Union[Tuple[torch.FloatTensor], Seq2SeqModelOutput]: r""" Returns: @@ -1533,6 +1679,7 @@ def forward( output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, + cache_position=cache_position, ) if not return_dict: @@ -1685,6 +1832,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, ) -> Union[Tuple[torch.FloatTensor], Seq2SeqLMOutput]: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): @@ -1779,6 +1927,7 @@ def forward( output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, + cache_position=cache_position, ) sequence_output = decoder_outputs[0] diff --git a/src/transformers/models/pix2struct/modeling_pix2struct.py b/src/transformers/models/pix2struct/modeling_pix2struct.py index 37090677a6e254..b1ac81bb1f21b6 100644 --- a/src/transformers/models/pix2struct/modeling_pix2struct.py +++ b/src/transformers/models/pix2struct/modeling_pix2struct.py @@ -22,7 +22,9 @@ from torch import nn from ...activations import ACT2FN +from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache, StaticCache from ...generation import GenerationMixin +from ...modeling_attn_mask_utils import AttentionMaskConverter from ...modeling_outputs import ( BaseModelOutput, BaseModelOutputWithPooling, @@ -38,6 +40,7 @@ add_start_docstrings, add_start_docstrings_to_model_forward, is_torch_fx_proxy, + is_torchdynamo_compiling, logging, replace_return_docstrings, ) @@ -184,14 +187,17 @@ def to_projection_shape(states): if self.gradient_checkpointing and self.training: position_bias.requires_grad = True - if attention_mask is None: - attention_mask = torch.ones((batch_size, seq_length), device=scores.device, dtype=scores.dtype) - if attention_mask.dim() == 2: position_bias = position_bias + attention_mask[:, None, None, :].to(position_bias.device) - else: + elif attention_mask is not None: # (batch_size, n_heads, seq_length, key_length) position_bias = position_bias + attention_mask.to(position_bias.device) + elif not is_torchdynamo_compiling(): + attention_mask = torch.ones( + (batch_size, seq_length), device=position_bias.device, dtype=position_bias.dtype + ) + position_bias = position_bias + attention_mask.to(position_bias.device) + position_bias = 1 - position_bias 
position_bias_masked = position_bias.masked_fill(position_bias == 1, torch.finfo(scores.dtype).min) @@ -355,6 +361,8 @@ class Pix2StructPreTrainedModel(PreTrainedModel): """ config_class = Pix2StructConfig + _supports_cache_class = True + _supports_static_cache = False @property def dummy_inputs(self): @@ -673,7 +681,9 @@ def forward(self, hidden_states): class Pix2StructTextAttention(nn.Module): - def __init__(self, config: Pix2StructTextConfig, has_relative_attention_bias=False): + def __init__( + self, config: Pix2StructTextConfig, has_relative_attention_bias=False, layer_idx: Optional[int] = None + ): super().__init__() self.has_relative_attention_bias = has_relative_attention_bias self.relative_attention_num_buckets = config.relative_attention_num_buckets @@ -683,6 +693,13 @@ def __init__(self, config: Pix2StructTextConfig, has_relative_attention_bias=Fal self.n_heads = config.num_heads self.dropout = config.dropout_rate self.inner_dim = self.n_heads * self.key_value_proj_dim + self.layer_idx = layer_idx + if layer_idx is None: + logger.warning_once( + f"Instantiating a decoder {self.__class__.__name__} without passing `layer_idx` is not recommended and " + "will lead to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` " + "when creating this class." + ) # Mesh TensorFlow initialization to avoid scaling before softmax self.query = nn.Linear(self.hidden_size, self.hidden_size, bias=False) @@ -773,75 +790,56 @@ def forward( query_length=None, use_cache=False, output_attentions=False, + cache_position=None, ): """ Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states). """ # Input is (batch_size, seq_length, dim) - # Mask is (batch_size, key_length) (non-causal) or (batch_size, key_length, key_length) - # past_key_value[0] is (batch_size, n_heads, q_len - 1, dim_per_head) + # Mask is (batch_size, 1, 1, key_length) (non-causal) or (batch_size, 1, query_length, key_length) batch_size, seq_length = hidden_states.shape[:2] - real_seq_length = seq_length - - if past_key_value is not None: - if len(past_key_value) != 2: - raise ValueError( - f"past_key_value should have 2 past states: keys and values. 
Got { len(past_key_value)} past states" - ) - real_seq_length += past_key_value[0].shape[2] if query_length is None else query_length - - key_length = real_seq_length if key_value_states is None else key_value_states.shape[1] - - def to_projection_shape(states): - """projection""" - return states.contiguous().view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) + # if key_value_states are provided this layer is used as a cross-attention layer for the decoder + is_cross_attention = key_value_states is not None - def project(hidden_states, proj_layer, key_value_states, past_key_value): - """projects hidden states correctly to key/query states""" - if key_value_states is None: - # self-attn - # (batch_size, n_heads, seq_length, dim_per_head) - hidden_states = to_projection_shape(proj_layer(hidden_states)) - elif past_key_value is None: - # cross-attn - # (batch_size, n_heads, seq_length, dim_per_head) - hidden_states = to_projection_shape(proj_layer(key_value_states)) - - if past_key_value is not None: - if key_value_states is None: - # self-attn - # (batch_size, n_heads, key_length, dim_per_head) - hidden_states = torch.cat([past_key_value, hidden_states], dim=2) - elif past_key_value.shape[2] != key_value_states.shape[1]: - # checking that the `sequence_length` of the `past_key_value` is the same as - # the provided `key_value_states` to support prefix tuning - # cross-attn - # (batch_size, n_heads, seq_length, dim_per_head) - hidden_states = to_projection_shape(proj_layer(key_value_states)) - else: - # cross-attn - hidden_states = past_key_value - return hidden_states + query_states = self.query(hidden_states).contiguous() + query_states = query_states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) - # get query states - # (batch_size, n_heads, seq_length, dim_per_head) - query_states = to_projection_shape(self.query(hidden_states)) + if past_key_value is not None: + is_updated = past_key_value.is_updated.get(self.layer_idx) + if is_cross_attention: + # after the first generated id, we can subsequently re-use all key/value_states from cache + past_key_value = past_key_value.cross_attention_cache + else: + past_key_value = past_key_value.self_attention_cache # get key/value states - key_states = project( - hidden_states, self.key, key_value_states, past_key_value[0] if past_key_value is not None else None - ) - value_states = project( - hidden_states, self.value, key_value_states, past_key_value[1] if past_key_value is not None else None - ) + current_states = key_value_states if is_cross_attention else hidden_states + if is_cross_attention and past_key_value and is_updated: + # reuse k,v, cross_attentions + key_states = past_key_value.key_cache[self.layer_idx] + value_states = past_key_value.value_cache[self.layer_idx] + else: + key_states = self.key(current_states).contiguous() + value_states = self.value(current_states).contiguous() + key_states = key_states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) + value_states = value_states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) + if past_key_value is not None: + # save all key/value_states to cache to be re-used for fast auto-regressive generation + cache_position = cache_position if not is_cross_attention else None + key_states, value_states = past_key_value.update( + key_states, value_states, self.layer_idx, {"cache_position": cache_position} + ) + # set flag that curr layer for cross-attn is already updated so we can re-use 
in subsequent calls + if is_cross_attention: + past_key_value.is_updated[self.layer_idx] = True # compute scores - scores = torch.matmul( - query_states, key_states.transpose(3, 2) - ) # equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states), compatible with onnx op>9 + scores = torch.matmul(query_states, key_states.transpose(3, 2)) if position_bias is None: + real_seq_length = cache_position[-1] + 1 if query_length is None else query_length + key_length = real_seq_length if key_value_states is None else key_value_states.shape[1] if not self.has_relative_attention_bias: position_bias = torch.zeros( (1, self.n_heads, real_seq_length, key_length), device=scores.device, dtype=scores.dtype @@ -851,11 +849,6 @@ def project(hidden_states, proj_layer, key_value_states, past_key_value): else: position_bias = self.compute_bias(real_seq_length, key_length, device=scores.device) - # if key and values are already calculated - # we want only the last query position bias - if past_key_value is not None: - position_bias = position_bias[:, :, -hidden_states.size(1) :, :] - if mask is not None: position_bias = position_bias + mask # (batch_size, n_heads, seq_length, key_length) @@ -883,19 +876,20 @@ def project(hidden_states, proj_layer, key_value_states, past_key_value): attn_output = self.output(attn_output) - present_key_value_state = (key_states, value_states) if use_cache else None - outputs = (attn_output,) + (present_key_value_state,) + (position_bias,) + outputs = (attn_output,) + (past_key_value,) + (position_bias,) if output_attentions: outputs = outputs + (attn_weights,) return outputs -# Copied from transformers.models.t5.modeling_t5.T5LayerSelfAttention with T5LayerNorm->Pix2StructLayerNorm,T5Attention->Pix2StructTextAttention,self.SelfAttention->self.attention,config.d_model->config.hidden_size +# Copied from transformers.models.t5.modeling_t5.T5LayerSelfAttention with T5LayerNorm->Pix2StructLayerNorm,T5Attention->Pix2StructTextAttention,T5LayerSelfAttention->Pix2StructTextLayerSelfAttention,self.SelfAttention->self.attention,config.d_model->config.hidden_size class Pix2StructTextLayerSelfAttention(nn.Module): - def __init__(self, config, has_relative_attention_bias=False): + def __init__(self, config, has_relative_attention_bias=False, layer_idx: Optional[int] = None): super().__init__() - self.attention = Pix2StructTextAttention(config, has_relative_attention_bias=has_relative_attention_bias) + self.attention = Pix2StructTextAttention( + config, has_relative_attention_bias=has_relative_attention_bias, layer_idx=layer_idx + ) self.layer_norm = Pix2StructLayerNorm(config.hidden_size, eps=config.layer_norm_epsilon) self.dropout = nn.Dropout(config.dropout_rate) @@ -908,6 +902,7 @@ def forward( past_key_value=None, use_cache=False, output_attentions=False, + cache_position=None, ): normed_hidden_states = self.layer_norm(hidden_states) attention_output = self.attention( @@ -918,17 +913,18 @@ def forward( past_key_value=past_key_value, use_cache=use_cache, output_attentions=output_attentions, + cache_position=cache_position, ) hidden_states = hidden_states + self.dropout(attention_output[0]) outputs = (hidden_states,) + attention_output[1:] # add attentions if we output them return outputs -# Copied from transformers.models.t5.modeling_t5.T5LayerCrossAttention with T5LayerNorm->Pix2StructLayerNorm,T5Attention->Pix2StructTextAttention,self.EncDecAttention->self.attention,config.d_model->config.hidden_size +# Copied from transformers.models.t5.modeling_t5.T5LayerCrossAttention with 
T5LayerNorm->Pix2StructLayerNorm,T5Attention->Pix2StructTextAttention,T5LayerCrossAttention->Pix2StructTextLayerCrossAttention,self.EncDecAttention->self.attention,config.d_model->config.hidden_size class Pix2StructTextLayerCrossAttention(nn.Module): - def __init__(self, config): + def __init__(self, config, layer_idx: Optional[int] = None): super().__init__() - self.attention = Pix2StructTextAttention(config, has_relative_attention_bias=False) + self.attention = Pix2StructTextAttention(config, has_relative_attention_bias=False, layer_idx=layer_idx) self.layer_norm = Pix2StructLayerNorm(config.hidden_size, eps=config.layer_norm_epsilon) self.dropout = nn.Dropout(config.dropout_rate) @@ -943,6 +939,7 @@ def forward( use_cache=False, query_length=None, output_attentions=False, + cache_position=None, ): normed_hidden_states = self.layer_norm(hidden_states) attention_output = self.attention( @@ -955,6 +952,7 @@ def forward( use_cache=use_cache, query_length=query_length, output_attentions=output_attentions, + cache_position=cache_position, ) layer_output = hidden_states + self.dropout(attention_output[0]) outputs = (layer_output,) + attention_output[1:] # add attentions if we output them @@ -962,11 +960,13 @@ def forward( class Pix2StructTextBlock(nn.Module): - def __init__(self, config, has_relative_attention_bias=False): + def __init__(self, config, has_relative_attention_bias=False, layer_idx: Optional[int] = None): super().__init__() self.self_attention = Pix2StructTextLayerSelfAttention( - config, has_relative_attention_bias=has_relative_attention_bias + config, + has_relative_attention_bias=has_relative_attention_bias, + layer_idx=layer_idx, ) self.encoder_decoder_attention = Pix2StructTextLayerCrossAttention(config) @@ -987,32 +987,19 @@ def forward( use_cache=False, output_attentions=False, return_dict=True, + cache_position=None, ): - if past_key_value is not None: - expected_num_past_key_values = 2 if encoder_hidden_states is None else 4 - - if len(past_key_value) != expected_num_past_key_values: - raise ValueError( - f"There should be {expected_num_past_key_values} past states. " - f"{'2 (past / key) for cross attention. ' if expected_num_past_key_values == 4 else ''}" - f"Got {len(past_key_value)} past key / value states" - ) - - self_attn_past_key_value = past_key_value[:2] - cross_attn_past_key_value = past_key_value[2:] - else: - self_attn_past_key_value, cross_attn_past_key_value = None, None - self_attention_outputs = self.self_attention( hidden_states, attention_mask=attention_mask, position_bias=position_bias, layer_head_mask=layer_head_mask, - past_key_value=self_attn_past_key_value, + past_key_value=past_key_value, use_cache=use_cache, output_attentions=output_attentions, + cache_position=cache_position, ) - hidden_states, present_key_value_state = self_attention_outputs[:2] + hidden_states, past_key_value = self_attention_outputs[:2] attention_outputs = self_attention_outputs[2:] # Keep self-attention outputs and relative position weights # clamp inf values to enable fp16 training @@ -1022,35 +1009,25 @@ def forward( do_cross_attention = encoder_hidden_states is not None if do_cross_attention: - # the actual query length is unknown for cross attention - # if using past key value states. 
Need to inject it here - if present_key_value_state is not None: - query_length = present_key_value_state[0].shape[2] - else: - query_length = None - cross_attention_outputs = self.encoder_decoder_attention( hidden_states, key_value_states=encoder_hidden_states, attention_mask=encoder_attention_mask, position_bias=encoder_decoder_position_bias, layer_head_mask=cross_attn_layer_head_mask, - past_key_value=cross_attn_past_key_value, - query_length=query_length, + past_key_value=past_key_value, + query_length=cache_position[-1] + 1, use_cache=use_cache, output_attentions=output_attentions, + cache_position=cache_position, ) - hidden_states = cross_attention_outputs[0] + hidden_states, past_key_value = cross_attention_outputs[:2] # clamp inf values to enable fp16 training if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any(): clamp_value = torch.finfo(hidden_states.dtype).max - 1000 hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) - # Combine self attn and cross attn key value states - if present_key_value_state is not None: - present_key_value_state = present_key_value_state + cross_attention_outputs[1] - # Keep cross-attention outputs and relative position weights attention_outputs = attention_outputs + cross_attention_outputs[2:] @@ -1065,7 +1042,7 @@ def forward( outputs = (hidden_states,) if use_cache: - outputs = outputs + (present_key_value_state,) + attention_outputs + outputs = outputs + (past_key_value,) + attention_outputs else: outputs = outputs + attention_outputs @@ -1187,6 +1164,9 @@ def forward( more detail. return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): + Indices depicting the position of the input sequence tokens in the sequence. It is used to update the + cache in the correct position and to infer the complete sequence length. 
""" PIX2STRUCT_INPUTS_DOCSTRING = r""" @@ -1293,7 +1273,10 @@ def __init__(self, config): self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size) self.layer = nn.ModuleList( - [Pix2StructTextBlock(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)] + [ + Pix2StructTextBlock(config, has_relative_attention_bias=bool(i == 0), layer_idx=i) + for i in range(config.num_layers) + ] ) self.final_layer_norm = Pix2StructLayerNorm(config.hidden_size, eps=config.layer_norm_epsilon) self.dropout = nn.Dropout(config.dropout_rate) @@ -1364,6 +1347,7 @@ def forward( output_hidden_states: Optional[bool] = None, labels: Optional[torch.LongTensor] = None, return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, **kwargs, ) -> Union[Tuple[torch.FloatTensor, ...], CausalLMOutputWithCrossAttentions]: r""" @@ -1405,24 +1389,54 @@ def forward( batch_size, seq_length = input_shape - # required mask seq length can be calculated via length of past - mask_seq_length = past_key_values[0][0].shape[2] + seq_length if past_key_values is not None else seq_length + # initialize past_key_values + return_legacy_cache = False + return_self_attention_cache = False + if use_cache or past_key_values is not None: + if isinstance(past_key_values, Cache) and not isinstance(past_key_values, EncoderDecoderCache): + return_self_attention_cache = True + past_key_values = EncoderDecoderCache(past_key_values, DynamicCache()) + elif not isinstance(past_key_values, EncoderDecoderCache): + return_legacy_cache = True + logger.warning_once( + "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. " + "You should pass an instance of `EncoderDecoderCache` instead, e.g. " + "`past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`." + ) + past_key_values = EncoderDecoderCache.from_legacy_cache(past_key_values) + elif past_key_values is None: + past_key_values = EncoderDecoderCache(DynamicCache(), DynamicCache()) + + past_key_values_length = 0 + if cache_position is not None: + past_key_values_length = cache_position[0] + elif past_key_values is not None: + past_key_values_length = past_key_values.get_seq_length() + + if cache_position is None: + cache_position = torch.arange( + past_key_values_length, past_key_values_length + seq_length, device=inputs_embeds.device + ) if attention_mask is None: - attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device) - if encoder_attention_mask is None and encoder_hidden_states is not None: - encoder_seq_length = encoder_hidden_states.shape[1] - encoder_attention_mask = torch.ones( - batch_size, encoder_seq_length, device=inputs_embeds.device, dtype=torch.long + # required mask seq length can be calculated via length of past + mask_seq_length = ( + past_key_values.get_seq_length() + seq_length if past_key_values is not None else seq_length ) + attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device) - # initialize past_key_values with `None` if past does not exist - if past_key_values is None: - past_key_values = [None] * len(self.layer) - - # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] - # ourselves in which case we just need to make it broadcastable to all heads. 
- extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape) + if self.config.is_decoder: + causal_mask = self._update_causal_mask( + attention_mask, + inputs_embeds, + cache_position, + past_key_values.self_attention_cache if past_key_values is not None else None, + output_attentions, + ) + else: + causal_mask = attention_mask[:, None, None, :] + causal_mask = causal_mask.to(dtype=inputs_embeds.dtype) + causal_mask = (1.0 - causal_mask) * torch.finfo(inputs_embeds.dtype).min # If a 2D or 3D attention mask is provided for the cross-attention # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] @@ -1438,7 +1452,6 @@ def forward( # Prepare head mask if needed head_mask = self.get_head_mask(head_mask, self.config.num_layers) cross_attn_head_mask = self.get_head_mask(cross_attn_head_mask, self.config.num_layers) - present_key_value_states = () if use_cache else None all_hidden_states = () if output_hidden_states else None all_attentions = () if output_attentions else None all_cross_attentions = () if (output_attentions) else None @@ -1447,7 +1460,7 @@ def forward( hidden_states = self.dropout(inputs_embeds) - for i, (layer_module, past_key_value) in enumerate(zip(self.layer, past_key_values)): + for i, layer_module in enumerate(self.layer): layer_head_mask = head_mask[i] cross_attn_layer_head_mask = cross_attn_head_mask[i] if output_hidden_states: @@ -1462,7 +1475,7 @@ def forward( layer_outputs = self._gradient_checkpointing_func( layer_module.forward, hidden_states, - extended_attention_mask, + causal_mask, position_bias, encoder_hidden_states, encoder_extended_attention_mask, @@ -1472,20 +1485,22 @@ def forward( None, # past_key_value is always None with gradient checkpointing use_cache, output_attentions, + cache_position, ) else: layer_outputs = layer_module( hidden_states, - attention_mask=extended_attention_mask, + attention_mask=causal_mask, position_bias=position_bias, encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_extended_attention_mask, encoder_decoder_position_bias=encoder_decoder_position_bias, layer_head_mask=layer_head_mask, cross_attn_layer_head_mask=cross_attn_layer_head_mask, - past_key_value=past_key_value, + past_key_value=past_key_values, use_cache=use_cache, output_attentions=output_attentions, + cache_position=cache_position, ) # layer_outputs is a tuple with: @@ -1493,7 +1508,7 @@ def forward( if use_cache is False: layer_outputs = layer_outputs[:1] + (None,) + layer_outputs[1:] - hidden_states, present_key_value_state = layer_outputs[:2] + hidden_states, next_decoder_cache = layer_outputs[:2] # We share the position biases between the layers - the first layer store them # layer_outputs = hidden-states, key-value-states (self-attention position bias), (self-attention weights), @@ -1501,9 +1516,6 @@ def forward( position_bias = layer_outputs[2] if encoder_hidden_states is not None: encoder_decoder_position_bias = layer_outputs[4 if output_attentions else 3] - # append next layer key value states - if use_cache: - present_key_value_states = present_key_value_states + (present_key_value_state,) if output_attentions: all_attentions = all_attentions + (layer_outputs[3],) @@ -1527,13 +1539,19 @@ def forward( loss = loss_fct(logits.contiguous().view(-1, logits.size(-1)), labels.contiguous().view(-1)) + next_cache = next_decoder_cache if use_cache else None + if return_self_attention_cache: + next_cache = past_key_values.self_attention_cache + if return_legacy_cache: + next_cache = 
past_key_values.to_legacy_cache() + if not return_dict: return tuple( v for v in [ loss, logits, - present_key_value_states, + next_cache, all_hidden_states, all_attentions, all_cross_attentions, @@ -1543,12 +1561,135 @@ def forward( return CausalLMOutputWithCrossAttentions( loss=loss, logits=logits, - past_key_values=present_key_value_states, + past_key_values=next_cache, hidden_states=all_hidden_states, attentions=all_attentions, cross_attentions=all_cross_attentions, ) + # Copied from transformers.models.llama.modeling_llama.LlamaModel._update_causal_mask + def _update_causal_mask( + self, + attention_mask: torch.Tensor, + input_tensor: torch.Tensor, + cache_position: torch.Tensor, + past_key_values: Cache, + output_attentions: bool, + ): + if self.config._attn_implementation == "flash_attention_2": + if attention_mask is not None and 0.0 in attention_mask: + return attention_mask + return None + + # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in + # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail + # to infer the attention mask. + past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + using_static_cache = isinstance(past_key_values, StaticCache) + + # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward + if self.config._attn_implementation == "sdpa" and not using_static_cache and not output_attentions: + if AttentionMaskConverter._ignore_causal_mask_sdpa( + attention_mask, + inputs_embeds=input_tensor, + past_key_values_length=past_seen_tokens, + is_training=self.training, + ): + return None + + dtype, device = input_tensor.dtype, input_tensor.device + sequence_length = input_tensor.shape[1] + if using_static_cache: + target_length = past_key_values.get_max_cache_shape() + else: + target_length = ( + attention_mask.shape[-1] + if isinstance(attention_mask, torch.Tensor) + else past_seen_tokens + sequence_length + 1 + ) + + # In case the provided `attention` mask is 2D, we generate a causal mask here (4D). + causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position( + attention_mask, + sequence_length=sequence_length, + target_length=target_length, + dtype=dtype, + device=device, + cache_position=cache_position, + batch_size=input_tensor.shape[0], + ) + + if ( + self.config._attn_implementation == "sdpa" + and attention_mask is not None + and attention_mask.device.type == "cuda" + and not output_attentions + ): + # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when + # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path. 
+ # Details: https://github.com/pytorch/pytorch/issues/110213 + min_dtype = torch.finfo(dtype).min + causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype) + + return causal_mask + + @staticmethod + # Copied from transformers.models.llama.modeling_llama.LlamaPreTrainedModel._prepare_4d_causal_attention_mask_with_cache_position + def _prepare_4d_causal_attention_mask_with_cache_position( + attention_mask: torch.Tensor, + sequence_length: int, + target_length: int, + dtype: torch.dtype, + device: torch.device, + cache_position: torch.Tensor, + batch_size: int, + **kwargs, + ): + """ + Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape + `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing. + + Args: + attention_mask (`torch.Tensor`): + A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape + `(batch_size, 1, query_length, key_value_length)`. + sequence_length (`int`): + The sequence length being processed. + target_length (`int`): + The target length: when generating with static cache, the mask should be as long as the static cache, + to account for the 0 padding, the part of the cache that is not filled yet. + dtype (`torch.dtype`): + The dtype to use for the 4D attention mask. + device (`torch.device`): + The device to place the 4D attention mask on. + cache_position (`torch.Tensor`): + Indices depicting the position of the input sequence tokens in the sequence. + batch_size (`torch.Tensor`): + Batch size. + """ + if attention_mask is not None and attention_mask.dim() == 4: + # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing. + causal_mask = attention_mask + else: + min_dtype = torch.finfo(dtype).min + causal_mask = torch.full( + (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device + ) + if sequence_length != 1: + causal_mask = torch.triu(causal_mask, diagonal=1) + causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1) + causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1) + if attention_mask is not None: + causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit + mask_length = attention_mask.shape[-1] + padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :] + padding_mask = padding_mask == 0 + causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill( + padding_mask, min_dtype + ) + + return causal_mask + + @add_start_docstrings( + "A conditional generation model with a language modeling head. 
Can be used for sequence generation tasks.", @@ -1615,6 +1756,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, ) -> Union[Tuple[torch.FloatTensor], Seq2SeqModelOutput]: r""" Returns: @@ -1723,6 +1865,7 @@ def forward( output_hidden_states=output_hidden_states, labels=labels, return_dict=return_dict, + cache_position=cache_position, ) if not return_dict: diff --git a/src/transformers/models/pop2piano/modeling_pop2piano.py b/src/transformers/models/pop2piano/modeling_pop2piano.py index d6f92e9fe03495..6a64a27e007b3e 100644 --- a/src/transformers/models/pop2piano/modeling_pop2piano.py +++ b/src/transformers/models/pop2piano/modeling_pop2piano.py @@ -25,7 +25,9 @@ from transformers.generation import GenerationConfig from ...activations import ACT2FN +from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache, StaticCache from ...generation import GenerationMixin +from ...modeling_attn_mask_utils import AttentionMaskConverter from ...modeling_outputs import ( BaseModelOutput, BaseModelOutputWithPastAndCrossAttentions, @@ -37,6 +39,7 @@ add_start_docstrings, add_start_docstrings_to_model_forward, is_torch_fx_proxy, + is_torchdynamo_compiling, logging, replace_return_docstrings, ) @@ -136,6 +139,9 @@ more detail. return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): + Indices depicting the position of the input sequence tokens in the sequence. It is used to update the + cache in the correct position and to infer the complete sequence length. """ @@ -245,7 +251,12 @@ def forward(self, hidden_states): # Copied from transformers.models.t5.modeling_t5.T5Attention with T5->Pop2Piano,t5->pop2piano class Pop2PianoAttention(nn.Module): - def __init__(self, config: Pop2PianoConfig, has_relative_attention_bias=False): + def __init__( + self, + config: Pop2PianoConfig, + has_relative_attention_bias=False, + layer_idx: Optional[int] = None, + ): super().__init__() self.is_decoder = config.is_decoder self.has_relative_attention_bias = has_relative_attention_bias @@ -256,6 +267,13 @@ def __init__(self, config: Pop2PianoConfig, has_relative_attention_bias=False): self.n_heads = config.num_heads self.dropout = config.dropout_rate self.inner_dim = self.n_heads * self.key_value_proj_dim + self.layer_idx = layer_idx + if layer_idx is None and self.is_decoder: + logger.warning_once( + f"Instantiating a decoder {self.__class__.__name__} without passing `layer_idx` is not recommended and " + "will lead to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` " + "when creating this class." 
+ ) # Mesh TensorFlow initialization to avoid scaling before softmax self.q = nn.Linear(self.d_model, self.inner_dim, bias=False) @@ -332,11 +350,14 @@ def _relative_position_bucket(relative_position, bidirectional=True, num_buckets relative_buckets += torch.where(is_small, relative_position, relative_position_if_large) return relative_buckets - def compute_bias(self, query_length, key_length, device=None): + def compute_bias(self, query_length, key_length, device=None, cache_position=None): """Compute binned relative position bias""" if device is None: device = self.relative_attention_bias.weight.device - context_position = torch.arange(query_length, dtype=torch.long, device=device)[:, None] + if cache_position is None: + context_position = torch.arange(query_length, dtype=torch.long, device=device)[:, None] + else: + context_position = cache_position[:, None].to(device) memory_position = torch.arange(key_length, dtype=torch.long, device=device)[None, :] relative_position = memory_position - context_position # shape (query_length, key_length) relative_position_bucket = self._relative_position_bucket( @@ -360,94 +381,72 @@ def forward( query_length=None, use_cache=False, output_attentions=False, + cache_position=None, ): """ Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states). """ # Input is (batch_size, seq_length, dim) - # Mask is (batch_size, key_length) (non-causal) or (batch_size, key_length, key_length) - # past_key_value[0] is (batch_size, n_heads, q_len - 1, dim_per_head) + # Mask is (batch_size, 1, 1, key_length) (non-causal encoder) or (batch_size, 1, seq_length, key_length) (causal decoder) batch_size, seq_length = hidden_states.shape[:2] - real_seq_length = seq_length + # if key_value_states are provided this layer is used as a cross-attention layer for the decoder + is_cross_attention = key_value_states is not None - if past_key_value is not None: - if len(past_key_value) != 2: - raise ValueError( - f"past_key_value should have 2 past states: keys and values. 
Got { len(past_key_value)} past states" - ) - real_seq_length += past_key_value[0].shape[2] if query_length is None else query_length - - key_length = real_seq_length if key_value_states is None else key_value_states.shape[1] - - def shape(states): - """projection""" - return states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) + query_states = self.q(hidden_states) + query_states = query_states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) - def unshape(states): - """reshape""" - return states.transpose(1, 2).contiguous().view(batch_size, -1, self.inner_dim) + if past_key_value is not None: + is_updated = past_key_value.is_updated.get(self.layer_idx) + if is_cross_attention: + # after the first generated id, we can subsequently re-use all key/value_states from cache + curr_past_key_value = past_key_value.cross_attention_cache + else: + curr_past_key_value = past_key_value.self_attention_cache - def project(hidden_states, proj_layer, key_value_states, past_key_value): - """projects hidden states correctly to key/query states""" - if key_value_states is None: - # self-attn - # (batch_size, n_heads, seq_length, dim_per_head) - hidden_states = shape(proj_layer(hidden_states)) - elif past_key_value is None: - # cross-attn - # (batch_size, n_heads, seq_length, dim_per_head) - hidden_states = shape(proj_layer(key_value_states)) + current_states = key_value_states if is_cross_attention else hidden_states + if is_cross_attention and past_key_value is not None and is_updated: + # reuse k,v, cross_attentions + key_states = curr_past_key_value.key_cache[self.layer_idx] + value_states = curr_past_key_value.value_cache[self.layer_idx] + else: + key_states = self.k(current_states) + value_states = self.v(current_states) + key_states = key_states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) + value_states = value_states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) if past_key_value is not None: - if key_value_states is None: - # self-attn - # (batch_size, n_heads, key_length, dim_per_head) - hidden_states = torch.cat([past_key_value, hidden_states], dim=2) - elif past_key_value.shape[2] != key_value_states.shape[1]: - # checking that the `sequence_length` of the `past_key_value` is the same as - # the provided `key_value_states` to support prefix tuning - # cross-attn - # (batch_size, n_heads, seq_length, dim_per_head) - hidden_states = shape(proj_layer(key_value_states)) - else: - # cross-attn - hidden_states = past_key_value - return hidden_states - - # get query states - query_states = shape(self.q(hidden_states)) # (batch_size, n_heads, seq_length, dim_per_head) - - # get key/value states - key_states = project( - hidden_states, self.k, key_value_states, past_key_value[0] if past_key_value is not None else None - ) - value_states = project( - hidden_states, self.v, key_value_states, past_key_value[1] if past_key_value is not None else None - ) + # save all key/value_states to cache to be re-used for fast auto-regressive generation + cache_position = cache_position if not is_cross_attention else None + key_states, value_states = curr_past_key_value.update( + key_states, value_states, self.layer_idx, {"cache_position": cache_position} + ) + # set flag that curr layer for cross-attn is already updated so we can re-use in subsequent calls + if is_cross_attention: + past_key_value.is_updated[self.layer_idx] = True - # compute scores - scores = torch.matmul( - query_states, 
key_states.transpose(3, 2) - ) # equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states), compatible with onnx op>9 + # compute scores, equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states), compatible with onnx op>9 + scores = torch.matmul(query_states, key_states.transpose(3, 2)) if position_bias is None: + key_length = key_states.shape[-2] + # cache position is 0-indexed so we add 1 to get the real length of queries (aka with past) + real_seq_length = query_length if query_length is not None else cache_position[-1] + 1 if not self.has_relative_attention_bias: position_bias = torch.zeros( - (1, self.n_heads, real_seq_length, key_length), device=scores.device, dtype=scores.dtype + (1, self.n_heads, seq_length, key_length), device=scores.device, dtype=scores.dtype ) if self.gradient_checkpointing and self.training: position_bias.requires_grad = True else: - position_bias = self.compute_bias(real_seq_length, key_length, device=scores.device) - - # if key and values are already calculated - # we want only the last query position bias - if past_key_value is not None: - position_bias = position_bias[:, :, -hidden_states.size(1) :, :] + position_bias = self.compute_bias( + real_seq_length, key_length, device=scores.device, cache_position=cache_position + ) + position_bias = position_bias[:, :, -seq_length:, :] if mask is not None: - position_bias = position_bias + mask # (batch_size, n_heads, seq_length, key_length) + causal_mask = mask[:, :, :, : key_states.shape[-2]] + position_bias = position_bias + causal_mask if self.pruned_heads: mask = torch.ones(position_bias.shape[1]) @@ -457,22 +456,22 @@ def project(hidden_states, proj_layer, key_value_states, past_key_value): position_bias_masked = position_bias scores += position_bias_masked - attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as( - scores - ) # (batch_size, n_heads, seq_length, key_length) - attn_weights = nn.functional.dropout( - attn_weights, p=self.dropout, training=self.training - ) # (batch_size, n_heads, seq_length, key_length) + + # (batch_size, n_heads, seq_length, key_length) + attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as(scores) + attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) # Mask heads if we want to if layer_head_mask is not None: attn_weights = attn_weights * layer_head_mask - attn_output = unshape(torch.matmul(attn_weights, value_states)) # (batch_size, seq_length, dim) + attn_output = torch.matmul(attn_weights, value_states) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.view(batch_size, -1, self.inner_dim) attn_output = self.o(attn_output) - present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None - outputs = (attn_output,) + (present_key_value_state,) + (position_bias,) + outputs = (attn_output, past_key_value, position_bias) if output_attentions: outputs = outputs + (attn_weights,) @@ -481,9 +480,11 @@ def project(hidden_states, proj_layer, key_value_states, past_key_value): # Copied from transformers.models.t5.modeling_t5.T5LayerSelfAttention with T5->Pop2Piano,t5->pop2piano class Pop2PianoLayerSelfAttention(nn.Module): - def __init__(self, config, has_relative_attention_bias=False): + def __init__(self, config, has_relative_attention_bias=False, layer_idx: Optional[int] = None): super().__init__() - self.SelfAttention = Pop2PianoAttention(config, has_relative_attention_bias=has_relative_attention_bias) + 
self.SelfAttention = Pop2PianoAttention( + config, has_relative_attention_bias=has_relative_attention_bias, layer_idx=layer_idx + ) self.layer_norm = Pop2PianoLayerNorm(config.d_model, eps=config.layer_norm_epsilon) self.dropout = nn.Dropout(config.dropout_rate) @@ -496,6 +497,7 @@ def forward( past_key_value=None, use_cache=False, output_attentions=False, + cache_position=None, ): normed_hidden_states = self.layer_norm(hidden_states) attention_output = self.SelfAttention( @@ -506,6 +508,7 @@ def forward( past_key_value=past_key_value, use_cache=use_cache, output_attentions=output_attentions, + cache_position=cache_position, ) hidden_states = hidden_states + self.dropout(attention_output[0]) outputs = (hidden_states,) + attention_output[1:] # add attentions if we output them @@ -514,9 +517,9 @@ def forward( # Copied from transformers.models.t5.modeling_t5.T5LayerCrossAttention with T5->Pop2Piano,t5->pop2piano class Pop2PianoLayerCrossAttention(nn.Module): - def __init__(self, config): + def __init__(self, config, layer_idx: Optional[int] = None): super().__init__() - self.EncDecAttention = Pop2PianoAttention(config, has_relative_attention_bias=False) + self.EncDecAttention = Pop2PianoAttention(config, has_relative_attention_bias=False, layer_idx=layer_idx) self.layer_norm = Pop2PianoLayerNorm(config.d_model, eps=config.layer_norm_epsilon) self.dropout = nn.Dropout(config.dropout_rate) @@ -531,6 +534,7 @@ def forward( use_cache=False, query_length=None, output_attentions=False, + cache_position=None, ): normed_hidden_states = self.layer_norm(hidden_states) attention_output = self.EncDecAttention( @@ -543,6 +547,7 @@ def forward( use_cache=use_cache, query_length=query_length, output_attentions=output_attentions, + cache_position=cache_position, ) layer_output = hidden_states + self.dropout(attention_output[0]) outputs = (layer_output,) + attention_output[1:] # add attentions if we output them @@ -551,13 +556,17 @@ def forward( # Copied from transformers.models.t5.modeling_t5.T5Block with T5->Pop2Piano,t5->pop2piano class Pop2PianoBlock(nn.Module): - def __init__(self, config, has_relative_attention_bias=False): + def __init__(self, config, has_relative_attention_bias=False, layer_idx: Optional[int] = None): super().__init__() self.is_decoder = config.is_decoder self.layer = nn.ModuleList() - self.layer.append(Pop2PianoLayerSelfAttention(config, has_relative_attention_bias=has_relative_attention_bias)) + self.layer.append( + Pop2PianoLayerSelfAttention( + config, has_relative_attention_bias=has_relative_attention_bias, layer_idx=layer_idx + ) + ) if self.is_decoder: - self.layer.append(Pop2PianoLayerCrossAttention(config)) + self.layer.append(Pop2PianoLayerCrossAttention(config, layer_idx=layer_idx)) self.layer.append(Pop2PianoLayerFF(config)) @@ -575,34 +584,19 @@ def forward( use_cache=False, output_attentions=False, return_dict=True, + cache_position=None, ): - if past_key_value is not None: - if not self.is_decoder: - logger.warning("`past_key_values` is passed to the encoder. Please make sure this is intended.") - expected_num_past_key_values = 2 if encoder_hidden_states is None else 4 - - if len(past_key_value) != expected_num_past_key_values: - raise ValueError( - f"There should be {expected_num_past_key_values} past states. " - f"{'2 (key / value) for cross attention. 
' if expected_num_past_key_values == 4 else ''}" - f"Got {len(past_key_value)} past key / value states" - ) - - self_attn_past_key_value = past_key_value[:2] - cross_attn_past_key_value = past_key_value[2:] - else: - self_attn_past_key_value, cross_attn_past_key_value = None, None - self_attention_outputs = self.layer[0]( hidden_states, attention_mask=attention_mask, position_bias=position_bias, layer_head_mask=layer_head_mask, - past_key_value=self_attn_past_key_value, + past_key_value=past_key_value, use_cache=use_cache, output_attentions=output_attentions, + cache_position=cache_position, ) - hidden_states, present_key_value_state = self_attention_outputs[:2] + hidden_states, past_key_value = self_attention_outputs[:2] attention_outputs = self_attention_outputs[2:] # Keep self-attention outputs and relative position weights # clamp inf values to enable fp16 training @@ -616,25 +610,18 @@ def forward( do_cross_attention = self.is_decoder and encoder_hidden_states is not None if do_cross_attention: - # the actual query length is unknown for cross attention - # if using past key value states. Need to inject it here - if present_key_value_state is not None: - query_length = present_key_value_state[0].shape[2] - else: - query_length = None - cross_attention_outputs = self.layer[1]( hidden_states, key_value_states=encoder_hidden_states, attention_mask=encoder_attention_mask, position_bias=encoder_decoder_position_bias, layer_head_mask=cross_attn_layer_head_mask, - past_key_value=cross_attn_past_key_value, - query_length=query_length, + past_key_value=past_key_value, + query_length=cache_position[-1] + 1, use_cache=use_cache, output_attentions=output_attentions, ) - hidden_states = cross_attention_outputs[0] + hidden_states, past_key_value = cross_attention_outputs[:2] # clamp inf values to enable fp16 training if hidden_states.dtype == torch.float16: @@ -645,10 +632,6 @@ def forward( ) hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) - # Combine self attn and cross attn key value states - if present_key_value_state is not None: - present_key_value_state = present_key_value_state + cross_attention_outputs[1] - # Keep cross-attention outputs and relative position weights attention_outputs = attention_outputs + cross_attention_outputs[2:] @@ -667,11 +650,11 @@ def forward( outputs = (hidden_states,) if use_cache: - outputs = outputs + (present_key_value_state,) + attention_outputs + outputs = outputs + (past_key_value,) + attention_outputs else: outputs = outputs + attention_outputs - return outputs # hidden-states, present_key_value_states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) + return outputs # hidden-states, past_key_value, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) class Pop2PianoPreTrainedModel(PreTrainedModel): @@ -684,6 +667,8 @@ class Pop2PianoPreTrainedModel(PreTrainedModel): base_model_prefix = "transformer" is_parallelizable = False supports_gradient_checkpointing = True + _supports_cache_class = True + _supports_static_cache = False _no_split_modules = ["Pop2PianoBlock"] _keep_in_fp32_modules = ["wo"] @@ -769,7 +754,10 @@ def __init__(self, config, embed_tokens=None): self.is_decoder = config.is_decoder self.block = nn.ModuleList( - [Pop2PianoBlock(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)] + [ + Pop2PianoBlock(config, has_relative_attention_bias=bool(i == 0), 
layer_idx=i) + for i in range(config.num_layers) + ] ) self.final_layer_norm = Pop2PianoLayerNorm(config.d_model, eps=config.layer_norm_epsilon) self.dropout = nn.Dropout(config.dropout_rate) @@ -803,6 +791,7 @@ def forward( output_attentions=None, output_hidden_states=None, return_dict=None, + cache_position=None, ): use_cache = use_cache if use_cache is not None else self.config.use_cache output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions @@ -825,6 +814,13 @@ def forward( err_msg_prefix = "decoder_" if self.is_decoder else "" raise ValueError(f"You have to specify either {err_msg_prefix}input_ids or {err_msg_prefix}inputs_embeds") + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + if inputs_embeds is None: if self.embed_tokens is None: raise ValueError("You have to initialize the model with valid token embeddings") @@ -832,28 +828,55 @@ def forward( batch_size, seq_length = input_shape - # required mask seq length can be calculated via length of past - mask_seq_length = past_key_values[0][0].shape[2] + seq_length if past_key_values is not None else seq_length - if use_cache is True: if not self.is_decoder: raise ValueError(f"`use_cache` can only be set to `True` if {self} is used as a decoder") - if attention_mask is None: - attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device) - if self.is_decoder and encoder_attention_mask is None and encoder_hidden_states is not None: - encoder_seq_length = encoder_hidden_states.shape[1] - encoder_attention_mask = torch.ones( - batch_size, encoder_seq_length, device=inputs_embeds.device, dtype=torch.long + # initialize past_key_values + return_legacy_cache = False + return_self_attention_cache = False + if self.is_decoder and (use_cache or past_key_values is not None): + if isinstance(past_key_values, Cache) and not isinstance(past_key_values, EncoderDecoderCache): + return_self_attention_cache = True + past_key_values = EncoderDecoderCache(past_key_values, DynamicCache()) + elif not isinstance(past_key_values, EncoderDecoderCache): + return_legacy_cache = True + logger.warning_once( + "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. " + "You should pass an instance of `EncoderDecoderCache` instead, e.g. " + "`past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`." 
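# --- Editor's note (illustrative sketch, not part of the patch): a minimal example of the cache migration the
# --- deprecation warning above asks callers to perform. It assumes only that `DynamicCache` and
# --- `EncoderDecoderCache` can be imported from `transformers.cache_utils`; `legacy_past` stands in for a
# --- hypothetical tuple-style cache returned by an older forward call.
from transformers.cache_utils import DynamicCache, EncoderDecoderCache

cache = EncoderDecoderCache(DynamicCache(), DynamicCache())  # empty self- and cross-attention caches
print(cache.get_seq_length())  # 0, nothing cached yet
# cache = EncoderDecoderCache.from_legacy_cache(legacy_past)  # or wrap an existing legacy tuple cache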
+ ) + past_key_values = EncoderDecoderCache.from_legacy_cache(past_key_values) + elif past_key_values is None: + past_key_values = EncoderDecoderCache(DynamicCache(), DynamicCache()) + elif not self.is_decoder: + # do not pass cache object down the line for encoder stack + # it messes indexing later in decoder-stack because cache object is modified in-place + past_key_values = None + + past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0 + if cache_position is None: + cache_position = torch.arange( + past_key_values_length, past_key_values_length + seq_length, device=inputs_embeds.device ) - # initialize past_key_values with `None` if past does not exist - if past_key_values is None: - past_key_values = [None] * len(self.block) + if attention_mask is None and not is_torchdynamo_compiling(): + # required mask seq length can be calculated via length of past cache + mask_seq_length = past_key_values_length + seq_length + attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device) - # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] - # ourselves in which case we just need to make it broadcastable to all heads. - extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape) + if self.config.is_decoder: + causal_mask = self._update_causal_mask( + attention_mask, + inputs_embeds, + cache_position, + past_key_values.self_attention_cache if past_key_values is not None else None, + output_attentions, + ) + else: + causal_mask = attention_mask[:, None, None, :] + causal_mask = causal_mask.to(dtype=inputs_embeds.dtype) + causal_mask = (1.0 - causal_mask) * torch.finfo(inputs_embeds.dtype).min # If a 2D or 3D attention mask is provided for the cross-attention # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] @@ -866,17 +889,9 @@ def forward( else: encoder_extended_attention_mask = None - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning_once( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
- ) - use_cache = False - # Prepare head mask if needed head_mask = self.get_head_mask(head_mask, self.config.num_layers) cross_attn_head_mask = self.get_head_mask(cross_attn_head_mask, self.config.num_layers) - present_key_value_states = () if use_cache else None all_hidden_states = () if output_hidden_states else None all_attentions = () if output_attentions else None all_cross_attentions = () if (output_attentions and self.is_decoder) else None @@ -885,7 +900,7 @@ def forward( hidden_states = self.dropout(inputs_embeds) - for i, (layer_module, past_key_value) in enumerate(zip(self.block, past_key_values)): + for i, layer_module in enumerate(self.block): layer_head_mask = head_mask[i] cross_attn_layer_head_mask = cross_attn_head_mask[i] if output_hidden_states: @@ -895,7 +910,7 @@ def forward( layer_outputs = self._gradient_checkpointing_func( layer_module.forward, hidden_states, - extended_attention_mask, + causal_mask, position_bias, encoder_hidden_states, encoder_extended_attention_mask, @@ -905,20 +920,22 @@ def forward( None, # past_key_value is always None with gradient checkpointing use_cache, output_attentions, + cache_position, ) else: layer_outputs = layer_module( hidden_states, - attention_mask=extended_attention_mask, + attention_mask=causal_mask, position_bias=position_bias, encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_extended_attention_mask, encoder_decoder_position_bias=encoder_decoder_position_bias, layer_head_mask=layer_head_mask, cross_attn_layer_head_mask=cross_attn_layer_head_mask, - past_key_value=past_key_value, + past_key_value=past_key_values, use_cache=use_cache, output_attentions=output_attentions, + cache_position=cache_position, ) # layer_outputs is a tuple with: @@ -926,7 +943,7 @@ def forward( if use_cache is False: layer_outputs = layer_outputs[:1] + (None,) + layer_outputs[1:] - hidden_states, present_key_value_state = layer_outputs[:2] + hidden_states, next_decoder_cache = layer_outputs[:2] # We share the position biases between the layers - the first layer store them # layer_outputs = hidden-states, key-value-states (self-attention position bias), (self-attention weights), @@ -934,9 +951,6 @@ def forward( position_bias = layer_outputs[2] if self.is_decoder and encoder_hidden_states is not None: encoder_decoder_position_bias = layer_outputs[4 if output_attentions else 3] - # append next layer key value states - if use_cache: - present_key_value_states = present_key_value_states + (present_key_value_state,) if output_attentions: all_attentions = all_attentions + (layer_outputs[3],) @@ -950,12 +964,18 @@ def forward( if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) + next_cache = next_decoder_cache if use_cache else None + if return_self_attention_cache: + next_cache = past_key_values.self_attention_cache + if return_legacy_cache: + next_cache = past_key_values.to_legacy_cache() + if not return_dict: return tuple( v for v in [ hidden_states, - present_key_value_states, + next_cache, all_hidden_states, all_attentions, all_cross_attentions, @@ -964,12 +984,135 @@ def forward( ) return BaseModelOutputWithPastAndCrossAttentions( last_hidden_state=hidden_states, - past_key_values=present_key_value_states, + past_key_values=next_cache, hidden_states=all_hidden_states, attentions=all_attentions, cross_attentions=all_cross_attentions, ) + # Copied from transformers.models.llama.modeling_llama.LlamaModel._update_causal_mask + def _update_causal_mask( + self, + attention_mask: torch.Tensor, + 
input_tensor: torch.Tensor, + cache_position: torch.Tensor, + past_key_values: Cache, + output_attentions: bool, + ): + if self.config._attn_implementation == "flash_attention_2": + if attention_mask is not None and 0.0 in attention_mask: + return attention_mask + return None + + # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in + # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail + # to infer the attention mask. + past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + using_static_cache = isinstance(past_key_values, StaticCache) + + # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward + if self.config._attn_implementation == "sdpa" and not using_static_cache and not output_attentions: + if AttentionMaskConverter._ignore_causal_mask_sdpa( + attention_mask, + inputs_embeds=input_tensor, + past_key_values_length=past_seen_tokens, + is_training=self.training, + ): + return None + + dtype, device = input_tensor.dtype, input_tensor.device + sequence_length = input_tensor.shape[1] + if using_static_cache: + target_length = past_key_values.get_max_cache_shape() + else: + target_length = ( + attention_mask.shape[-1] + if isinstance(attention_mask, torch.Tensor) + else past_seen_tokens + sequence_length + 1 + ) + + # In case the provided `attention` mask is 2D, we generate a causal mask here (4D). + causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position( + attention_mask, + sequence_length=sequence_length, + target_length=target_length, + dtype=dtype, + device=device, + cache_position=cache_position, + batch_size=input_tensor.shape[0], + ) + + if ( + self.config._attn_implementation == "sdpa" + and attention_mask is not None + and attention_mask.device.type == "cuda" + and not output_attentions + ): + # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when + # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path. + # Details: https://github.com/pytorch/pytorch/issues/110213 + min_dtype = torch.finfo(dtype).min + causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype) + + return causal_mask + + @staticmethod + # Copied from transformers.models.llama.modeling_llama.LlamaPreTrainedModel._prepare_4d_causal_attention_mask_with_cache_position + def _prepare_4d_causal_attention_mask_with_cache_position( + attention_mask: torch.Tensor, + sequence_length: int, + target_length: int, + dtype: torch.dtype, + device: torch.device, + cache_position: torch.Tensor, + batch_size: int, + **kwargs, + ): + """ + Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape + `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing. + + Args: + attention_mask (`torch.Tensor`): + A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape + `(batch_size, 1, query_length, key_value_length)`. + sequence_length (`int`): + The sequence length being processed. + target_length (`int`): + The target length: when generating with static cache, the mask should be as long as the static cache, + to account for the 0 padding, the part of the cache that is not filled yet. + dtype (`torch.dtype`): + The dtype to use for the 4D attention mask. 
+            device (`torch.device`):
+                The device to place the 4D attention mask on.
+            cache_position (`torch.Tensor`):
+                Indices depicting the position of the input sequence tokens in the sequence.
+            batch_size (`torch.Tensor`):
+                Batch size.
+        """
+        if attention_mask is not None and attention_mask.dim() == 4:
+            # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
+            causal_mask = attention_mask
+        else:
+            min_dtype = torch.finfo(dtype).min
+            causal_mask = torch.full(
+                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device
+            )
+            if sequence_length != 1:
+                causal_mask = torch.triu(causal_mask, diagonal=1)
+            causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
+            causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
+            if attention_mask is not None:
+                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
+                mask_length = attention_mask.shape[-1]
+                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]
+                padding_mask = padding_mask == 0
+                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
+                    padding_mask, min_dtype
+                )
+
+        return causal_mask
+

 class Pop2PianoConcatEmbeddingToMel(nn.Module):
     """Embedding Matrix for `composer` tokens."""
@@ -1122,6 +1265,7 @@ def forward(
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
     ) -> Union[Tuple[torch.FloatTensor], Seq2SeqLMOutput]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
@@ -1177,6 +1321,7 @@ def forward(
             output_attentions=output_attentions,
             output_hidden_states=output_hidden_states,
             return_dict=return_dict,
+            cache_position=cache_position,
         )

         sequence_output = decoder_outputs[0]
diff --git a/src/transformers/models/switch_transformers/modeling_switch_transformers.py b/src/transformers/models/switch_transformers/modeling_switch_transformers.py
index c39e85bacdd3d1..b150b04eea57b8 100644
--- a/src/transformers/models/switch_transformers/modeling_switch_transformers.py
+++ b/src/transformers/models/switch_transformers/modeling_switch_transformers.py
@@ -24,7 +24,9 @@
 from torch.nn import CrossEntropyLoss

 from ...activations import ACT2FN
+from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache, StaticCache
 from ...generation import GenerationMixin
+from ...modeling_attn_mask_utils import AttentionMaskConverter
 from ...modeling_outputs import (
     MoEModelOutput,
     MoEModelOutputWithPastAndCrossAttentions,
@@ -39,6 +41,7 @@
     add_start_docstrings,
     add_start_docstrings_to_model_forward,
     is_torch_fx_proxy,
+    is_torchdynamo_compiling,
     logging,
     replace_return_docstrings,
 )
@@ -355,7 +358,12 @@ def forward(self, hidden_states, output_router_logits):

 # Copied from transformers.models.t5.modeling_t5.T5Attention with T5->SwitchTransformers
 class SwitchTransformersAttention(nn.Module):
-    def __init__(self, config: SwitchTransformersConfig, has_relative_attention_bias=False):
+    def __init__(
+        self,
+        config: SwitchTransformersConfig,
+        has_relative_attention_bias=False,
+        layer_idx: Optional[int] = None,
+    ):
         super().__init__()
         self.is_decoder = config.is_decoder
         self.has_relative_attention_bias = has_relative_attention_bias
@@ -366,6 +374,13 @@ def __init__(self, config: SwitchTransformersConfig, has_relative_attention_bias
         self.n_heads = config.num_heads
         self.dropout = config.dropout_rate
         self.inner_dim = self.n_heads * self.key_value_proj_dim
+        self.layer_idx = layer_idx
+        if layer_idx is None and self.is_decoder:
+            logger.warning_once(
+                f"Instantiating a decoder {self.__class__.__name__} without passing `layer_idx` is not recommended and "
+                "will lead to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` "
+                "when creating this class."
+            )

         # Mesh TensorFlow initialization to avoid scaling before softmax
         self.q = nn.Linear(self.d_model, self.inner_dim, bias=False)
@@ -442,11 +457,14 @@ def _relative_position_bucket(relative_position, bidirectional=True, num_buckets
         relative_buckets += torch.where(is_small, relative_position, relative_position_if_large)
         return relative_buckets

-    def compute_bias(self, query_length, key_length, device=None):
+    def compute_bias(self, query_length, key_length, device=None, cache_position=None):
         """Compute binned relative position bias"""
         if device is None:
             device = self.relative_attention_bias.weight.device
-        context_position = torch.arange(query_length, dtype=torch.long, device=device)[:, None]
+        if cache_position is None:
+            context_position = torch.arange(query_length, dtype=torch.long, device=device)[:, None]
+        else:
+            context_position = cache_position[:, None].to(device)
         memory_position = torch.arange(key_length, dtype=torch.long, device=device)[None, :]
         relative_position = memory_position - context_position  # shape (query_length, key_length)
         relative_position_bucket = self._relative_position_bucket(
@@ -470,94 +488,72 @@ def forward(
         query_length=None,
         use_cache=False,
         output_attentions=False,
+        cache_position=None,
     ):
         """
         Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states).
         """
         # Input is (batch_size, seq_length, dim)
-        # Mask is (batch_size, key_length) (non-causal) or (batch_size, key_length, key_length)
-        # past_key_value[0] is (batch_size, n_heads, q_len - 1, dim_per_head)
+        # Mask is (batch_size, 1, 1, key_length) (non-causal encoder) or (batch_size, 1, seq_length, key_length) (causal decoder)
         batch_size, seq_length = hidden_states.shape[:2]

-        real_seq_length = seq_length
+        # if key_value_states are provided this layer is used as a cross-attention layer for the decoder
+        is_cross_attention = key_value_states is not None

-        if past_key_value is not None:
-            if len(past_key_value) != 2:
-                raise ValueError(
-                    f"past_key_value should have 2 past states: keys and values.
Got { len(past_key_value)} past states" - ) - real_seq_length += past_key_value[0].shape[2] if query_length is None else query_length - - key_length = real_seq_length if key_value_states is None else key_value_states.shape[1] - - def shape(states): - """projection""" - return states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) + query_states = self.q(hidden_states) + query_states = query_states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) - def unshape(states): - """reshape""" - return states.transpose(1, 2).contiguous().view(batch_size, -1, self.inner_dim) + if past_key_value is not None: + is_updated = past_key_value.is_updated.get(self.layer_idx) + if is_cross_attention: + # after the first generated id, we can subsequently re-use all key/value_states from cache + curr_past_key_value = past_key_value.cross_attention_cache + else: + curr_past_key_value = past_key_value.self_attention_cache - def project(hidden_states, proj_layer, key_value_states, past_key_value): - """projects hidden states correctly to key/query states""" - if key_value_states is None: - # self-attn - # (batch_size, n_heads, seq_length, dim_per_head) - hidden_states = shape(proj_layer(hidden_states)) - elif past_key_value is None: - # cross-attn - # (batch_size, n_heads, seq_length, dim_per_head) - hidden_states = shape(proj_layer(key_value_states)) + current_states = key_value_states if is_cross_attention else hidden_states + if is_cross_attention and past_key_value is not None and is_updated: + # reuse k,v, cross_attentions + key_states = curr_past_key_value.key_cache[self.layer_idx] + value_states = curr_past_key_value.value_cache[self.layer_idx] + else: + key_states = self.k(current_states) + value_states = self.v(current_states) + key_states = key_states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) + value_states = value_states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) if past_key_value is not None: - if key_value_states is None: - # self-attn - # (batch_size, n_heads, key_length, dim_per_head) - hidden_states = torch.cat([past_key_value, hidden_states], dim=2) - elif past_key_value.shape[2] != key_value_states.shape[1]: - # checking that the `sequence_length` of the `past_key_value` is the same as - # the provided `key_value_states` to support prefix tuning - # cross-attn - # (batch_size, n_heads, seq_length, dim_per_head) - hidden_states = shape(proj_layer(key_value_states)) - else: - # cross-attn - hidden_states = past_key_value - return hidden_states - - # get query states - query_states = shape(self.q(hidden_states)) # (batch_size, n_heads, seq_length, dim_per_head) - - # get key/value states - key_states = project( - hidden_states, self.k, key_value_states, past_key_value[0] if past_key_value is not None else None - ) - value_states = project( - hidden_states, self.v, key_value_states, past_key_value[1] if past_key_value is not None else None - ) + # save all key/value_states to cache to be re-used for fast auto-regressive generation + cache_position = cache_position if not is_cross_attention else None + key_states, value_states = curr_past_key_value.update( + key_states, value_states, self.layer_idx, {"cache_position": cache_position} + ) + # set flag that curr layer for cross-attn is already updated so we can re-use in subsequent calls + if is_cross_attention: + past_key_value.is_updated[self.layer_idx] = True - # compute scores - scores = torch.matmul( - query_states, 
key_states.transpose(3, 2) - ) # equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states), compatible with onnx op>9 + # compute scores, equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states), compatible with onnx op>9 + scores = torch.matmul(query_states, key_states.transpose(3, 2)) if position_bias is None: + key_length = key_states.shape[-2] + # cache position is 0-indexed so we add 1 to get the real length of queries (aka with past) + real_seq_length = query_length if query_length is not None else cache_position[-1] + 1 if not self.has_relative_attention_bias: position_bias = torch.zeros( - (1, self.n_heads, real_seq_length, key_length), device=scores.device, dtype=scores.dtype + (1, self.n_heads, seq_length, key_length), device=scores.device, dtype=scores.dtype ) if self.gradient_checkpointing and self.training: position_bias.requires_grad = True else: - position_bias = self.compute_bias(real_seq_length, key_length, device=scores.device) - - # if key and values are already calculated - # we want only the last query position bias - if past_key_value is not None: - position_bias = position_bias[:, :, -hidden_states.size(1) :, :] + position_bias = self.compute_bias( + real_seq_length, key_length, device=scores.device, cache_position=cache_position + ) + position_bias = position_bias[:, :, -seq_length:, :] if mask is not None: - position_bias = position_bias + mask # (batch_size, n_heads, seq_length, key_length) + causal_mask = mask[:, :, :, : key_states.shape[-2]] + position_bias = position_bias + causal_mask if self.pruned_heads: mask = torch.ones(position_bias.shape[1]) @@ -567,22 +563,22 @@ def project(hidden_states, proj_layer, key_value_states, past_key_value): position_bias_masked = position_bias scores += position_bias_masked - attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as( - scores - ) # (batch_size, n_heads, seq_length, key_length) - attn_weights = nn.functional.dropout( - attn_weights, p=self.dropout, training=self.training - ) # (batch_size, n_heads, seq_length, key_length) + + # (batch_size, n_heads, seq_length, key_length) + attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as(scores) + attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) # Mask heads if we want to if layer_head_mask is not None: attn_weights = attn_weights * layer_head_mask - attn_output = unshape(torch.matmul(attn_weights, value_states)) # (batch_size, seq_length, dim) + attn_output = torch.matmul(attn_weights, value_states) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.view(batch_size, -1, self.inner_dim) attn_output = self.o(attn_output) - present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None - outputs = (attn_output,) + (present_key_value_state,) + (position_bias,) + outputs = (attn_output, past_key_value, position_bias) if output_attentions: outputs = outputs + (attn_weights,) @@ -591,10 +587,10 @@ def project(hidden_states, proj_layer, key_value_states, past_key_value): # Copied from transformers.models.t5.modeling_t5.T5LayerSelfAttention with T5->SwitchTransformers class SwitchTransformersLayerSelfAttention(nn.Module): - def __init__(self, config, has_relative_attention_bias=False): + def __init__(self, config, has_relative_attention_bias=False, layer_idx: Optional[int] = None): super().__init__() self.SelfAttention = SwitchTransformersAttention( - config, 
has_relative_attention_bias=has_relative_attention_bias + config, has_relative_attention_bias=has_relative_attention_bias, layer_idx=layer_idx ) self.layer_norm = SwitchTransformersLayerNorm(config.d_model, eps=config.layer_norm_epsilon) self.dropout = nn.Dropout(config.dropout_rate) @@ -608,6 +604,7 @@ def forward( past_key_value=None, use_cache=False, output_attentions=False, + cache_position=None, ): normed_hidden_states = self.layer_norm(hidden_states) attention_output = self.SelfAttention( @@ -618,6 +615,7 @@ def forward( past_key_value=past_key_value, use_cache=use_cache, output_attentions=output_attentions, + cache_position=cache_position, ) hidden_states = hidden_states + self.dropout(attention_output[0]) outputs = (hidden_states,) + attention_output[1:] # add attentions if we output them @@ -626,9 +624,11 @@ def forward( # Copied from transformers.models.t5.modeling_t5.T5LayerCrossAttention with T5->SwitchTransformers class SwitchTransformersLayerCrossAttention(nn.Module): - def __init__(self, config): + def __init__(self, config, layer_idx: Optional[int] = None): super().__init__() - self.EncDecAttention = SwitchTransformersAttention(config, has_relative_attention_bias=False) + self.EncDecAttention = SwitchTransformersAttention( + config, has_relative_attention_bias=False, layer_idx=layer_idx + ) self.layer_norm = SwitchTransformersLayerNorm(config.d_model, eps=config.layer_norm_epsilon) self.dropout = nn.Dropout(config.dropout_rate) @@ -643,6 +643,7 @@ def forward( use_cache=False, query_length=None, output_attentions=False, + cache_position=None, ): normed_hidden_states = self.layer_norm(hidden_states) attention_output = self.EncDecAttention( @@ -655,6 +656,7 @@ def forward( use_cache=use_cache, query_length=query_length, output_attentions=output_attentions, + cache_position=cache_position, ) layer_output = hidden_states + self.dropout(attention_output[0]) outputs = (layer_output,) + attention_output[1:] # add attentions if we output them @@ -662,16 +664,18 @@ def forward( class SwitchTransformersBlock(nn.Module): - def __init__(self, config, has_relative_attention_bias=False, is_sparse=False): + def __init__(self, config, has_relative_attention_bias=False, is_sparse=False, layer_idx: Optional[int] = None): super().__init__() self.is_decoder = config.is_decoder self.is_sparse = is_sparse self.layer = nn.ModuleList() self.layer.append( - SwitchTransformersLayerSelfAttention(config, has_relative_attention_bias=has_relative_attention_bias) + SwitchTransformersLayerSelfAttention( + config, has_relative_attention_bias=has_relative_attention_bias, layer_idx=layer_idx + ) ) if self.is_decoder: - self.layer.append(SwitchTransformersLayerCrossAttention(config)) + self.layer.append(SwitchTransformersLayerCrossAttention(config, layer_idx=layer_idx)) self.layer.append(SwitchTransformersLayerFF(config, is_sparse=self.is_sparse)) @@ -690,34 +694,19 @@ def forward( output_attentions=False, output_router_logits=True, return_dict=True, + cache_position=None, ): - if past_key_value is not None: - if not self.is_decoder: - logger.warning("`past_key_values` is passed to the encoder. Please make sure this is intended.") - expected_num_past_key_values = 2 if encoder_hidden_states is None else 4 - - if len(past_key_value) != expected_num_past_key_values: - raise ValueError( - f"There should be {expected_num_past_key_values} past states. " - f"{'2 (past / key) for cross attention. 
' if expected_num_past_key_values == 4 else ''}" - f"Got {len(past_key_value)} past key / value states" - ) - - self_attn_past_key_value = past_key_value[:2] - cross_attn_past_key_value = past_key_value[2:] - else: - self_attn_past_key_value, cross_attn_past_key_value = None, None - self_attention_outputs = self.layer[0]( hidden_states, attention_mask=attention_mask, position_bias=position_bias, layer_head_mask=layer_head_mask, - past_key_value=self_attn_past_key_value, + past_key_value=past_key_value, use_cache=use_cache, output_attentions=output_attentions, + cache_position=cache_position, ) - hidden_states, present_key_value_state = self_attention_outputs[:2] + hidden_states, past_key_value = self_attention_outputs[:2] attention_outputs = self_attention_outputs[2:] # Keep self-attention outputs and relative position weights # clamp inf values to enable fp16 training @@ -727,35 +716,25 @@ def forward( do_cross_attention = self.is_decoder and encoder_hidden_states is not None if do_cross_attention: - # the actual query length is unknown for cross attention - # if using past key value states. Need to inject it here - if present_key_value_state is not None: - query_length = present_key_value_state[0].shape[2] - else: - query_length = None - cross_attention_outputs = self.layer[1]( hidden_states, key_value_states=encoder_hidden_states, attention_mask=encoder_attention_mask, position_bias=encoder_decoder_position_bias, layer_head_mask=cross_attn_layer_head_mask, - past_key_value=cross_attn_past_key_value, - query_length=query_length, + past_key_value=past_key_value, + query_length=cache_position[-1] + 1, use_cache=use_cache, output_attentions=output_attentions, + cache_position=cache_position, ) - hidden_states = cross_attention_outputs[0] + hidden_states, past_key_value = cross_attention_outputs[:2] # clamp inf values to enable fp16 training if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any(): clamp_value = torch.finfo(hidden_states.dtype).max - 1000 hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) - # Combine self attn and cross attn key value states - if present_key_value_state is not None: - present_key_value_state = present_key_value_state + cross_attention_outputs[1] - # Keep cross-attention outputs and relative position weights attention_outputs = attention_outputs + cross_attention_outputs[2:] @@ -775,11 +754,11 @@ def forward( outputs = (hidden_states,) if use_cache: - outputs = outputs + (present_key_value_state,) + attention_outputs + (router_tuple,) + outputs = outputs + (past_key_value,) + attention_outputs + (router_tuple,) else: outputs = outputs + attention_outputs + (router_tuple,) - return outputs # hidden-states, present_key_value_states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights), (router_tuple) + return outputs # hidden-states, past_key_value, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights), (router_tuple) class SwitchTransformersPreTrainedModel(PreTrainedModel): @@ -791,6 +770,8 @@ class SwitchTransformersPreTrainedModel(PreTrainedModel): config_class = SwitchTransformersConfig base_model_prefix = "switch_transformers" supports_gradient_checkpointing = True + _supports_cache_class = True + _supports_static_cache = False _no_split_modules = ["SwitchTransformersBlock"] @property @@ -897,7 +878,9 @@ def __init__(self, config, embed_tokens=None): is_sparse = (i % sparse_step == 1 
or sparse_step == 1) if sparse_step > 0 else False self.block.append( - SwitchTransformersBlock(config, has_relative_attention_bias=bool(i == 0), is_sparse=is_sparse) + SwitchTransformersBlock( + config, has_relative_attention_bias=bool(i == 0), is_sparse=is_sparse, layer_idx=i + ) ) self.final_layer_norm = SwitchTransformersLayerNorm(config.d_model, eps=config.layer_norm_epsilon) @@ -930,6 +913,7 @@ def forward( output_hidden_states=None, output_router_logits=True, return_dict=None, + cache_position=None, ): use_cache = use_cache if use_cache is not None else self.config.use_cache output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions @@ -952,6 +936,13 @@ def forward( err_msg_prefix = "decoder_" if self.is_decoder else "" raise ValueError(f"You have to specify either {err_msg_prefix}input_ids or {err_msg_prefix}inputs_embeds") + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + if inputs_embeds is None: if self.embed_tokens is None: raise ValueError("You have to initialize the model with valid token embeddings") @@ -959,28 +950,55 @@ def forward( batch_size, seq_length = input_shape - # required mask seq length can be calculated via length of past - mask_seq_length = past_key_values[0][0].shape[2] + seq_length if past_key_values is not None else seq_length - if use_cache is True: if not self.is_decoder: raise ValueError(f"`use_cache` can only be set to `True` if {self} is used as a decoder") - if attention_mask is None: - attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device) - if self.is_decoder and encoder_attention_mask is None and encoder_hidden_states is not None: - encoder_seq_length = encoder_hidden_states.shape[1] - encoder_attention_mask = torch.ones( - batch_size, encoder_seq_length, device=inputs_embeds.device, dtype=torch.long + # initialize past_key_values + return_legacy_cache = False + return_self_attention_cache = False + if self.is_decoder and (use_cache or past_key_values is not None): + if isinstance(past_key_values, Cache) and not isinstance(past_key_values, EncoderDecoderCache): + return_self_attention_cache = True + past_key_values = EncoderDecoderCache(past_key_values, DynamicCache()) + elif not isinstance(past_key_values, EncoderDecoderCache): + return_legacy_cache = True + logger.warning_once( + "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. " + "You should pass an instance of `EncoderDecoderCache` instead, e.g. " + "`past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`." 
+ ) + past_key_values = EncoderDecoderCache.from_legacy_cache(past_key_values) + elif past_key_values is None: + past_key_values = EncoderDecoderCache(DynamicCache(), DynamicCache()) + elif not self.is_decoder: + # do not pass cache object down the line for encoder stack + # it messes indexing later in decoder-stack because cache object is modified in-place + past_key_values = None + + past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0 + if cache_position is None: + cache_position = torch.arange( + past_key_values_length, past_key_values_length + seq_length, device=inputs_embeds.device ) - # initialize past_key_values with `None` if past does not exist - if past_key_values is None: - past_key_values = [None] * len(self.block) + if attention_mask is None and not is_torchdynamo_compiling(): + # required mask seq length can be calculated via length of past cache + mask_seq_length = past_key_values_length + seq_length + attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device) - # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] - # ourselves in which case we just need to make it broadcastable to all heads. - extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape) + if self.config.is_decoder: + causal_mask = self._update_causal_mask( + attention_mask, + inputs_embeds, + cache_position, + past_key_values.self_attention_cache if past_key_values is not None else None, + output_attentions, + ) + else: + causal_mask = attention_mask[:, None, None, :] + causal_mask = causal_mask.to(dtype=inputs_embeds.dtype) + causal_mask = (1.0 - causal_mask) * torch.finfo(inputs_embeds.dtype).min # If a 2D or 3D attention mask is provided for the cross-attention # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] @@ -993,17 +1011,9 @@ def forward( else: encoder_extended_attention_mask = None - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning_once( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
- ) - use_cache = False - # Prepare head mask if needed head_mask = self.get_head_mask(head_mask, self.config.num_layers) cross_attn_head_mask = self.get_head_mask(cross_attn_head_mask, self.config.num_layers) - present_key_value_states = () if use_cache else None all_hidden_states = () if output_hidden_states else None all_attentions = () if output_attentions else None all_router_probs = () if output_router_logits else None @@ -1013,7 +1023,7 @@ def forward( hidden_states = self.dropout(inputs_embeds) - for i, (layer_module, past_key_value) in enumerate(zip(self.block, past_key_values)): + for i, layer_module in enumerate(self.block): layer_head_mask = head_mask[i] cross_attn_layer_head_mask = cross_attn_head_mask[i] @@ -1024,7 +1034,7 @@ def forward( layer_outputs = self._gradient_checkpointing_func( layer_module.forward, hidden_states, - extended_attention_mask, + causal_mask, position_bias, encoder_hidden_states, encoder_extended_attention_mask, @@ -1034,21 +1044,26 @@ def forward( None, # past_key_value is always None with gradient checkpointing use_cache, output_attentions, + output_router_logits, + return_dict, + cache_position, ) else: layer_outputs = layer_module( hidden_states, - attention_mask=extended_attention_mask, + attention_mask=causal_mask, position_bias=position_bias, encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_extended_attention_mask, encoder_decoder_position_bias=encoder_decoder_position_bias, layer_head_mask=layer_head_mask, cross_attn_layer_head_mask=cross_attn_layer_head_mask, - past_key_value=past_key_value, + past_key_value=past_key_values, use_cache=use_cache, output_attentions=output_attentions, output_router_logits=output_router_logits, + return_dict=return_dict, + cache_position=cache_position, ) router_probs = layer_outputs[-1] @@ -1059,7 +1074,7 @@ def forward( if use_cache is False: layer_outputs = layer_outputs[:1] + (None,) + layer_outputs[1:] - hidden_states, present_key_value_state = layer_outputs[:2] + hidden_states, next_decoder_cache = layer_outputs[:2] # We share the position biases between the layers - the first layer store them # layer_outputs = hidden-states, key-value-states (self-attention position bias), (self-attention weights), @@ -1067,9 +1082,6 @@ def forward( position_bias = layer_outputs[2] if self.is_decoder and encoder_hidden_states is not None: encoder_decoder_position_bias = layer_outputs[4 if output_attentions else 3] - # append next layer key value states - if use_cache: - present_key_value_states = present_key_value_states + (present_key_value_state,) if output_attentions: all_attentions = all_attentions + (layer_outputs[3],) @@ -1086,12 +1098,18 @@ def forward( if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) + next_cache = next_decoder_cache if use_cache else None + if return_self_attention_cache: + next_cache = past_key_values.self_attention_cache + if return_legacy_cache: + next_cache = past_key_values.to_legacy_cache() + if not return_dict: return tuple( v for v in [ hidden_states, - present_key_value_states, + next_cache, all_hidden_states, all_attentions, all_cross_attentions, @@ -1101,13 +1119,136 @@ def forward( ) return MoEModelOutputWithPastAndCrossAttentions( last_hidden_state=hidden_states, - past_key_values=present_key_value_states, + past_key_values=next_cache, hidden_states=all_hidden_states, attentions=all_attentions, cross_attentions=all_cross_attentions, router_probs=all_router_probs, ) + # Copied from 
transformers.models.llama.modeling_llama.LlamaModel._update_causal_mask + def _update_causal_mask( + self, + attention_mask: torch.Tensor, + input_tensor: torch.Tensor, + cache_position: torch.Tensor, + past_key_values: Cache, + output_attentions: bool, + ): + if self.config._attn_implementation == "flash_attention_2": + if attention_mask is not None and 0.0 in attention_mask: + return attention_mask + return None + + # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in + # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail + # to infer the attention mask. + past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + using_static_cache = isinstance(past_key_values, StaticCache) + + # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward + if self.config._attn_implementation == "sdpa" and not using_static_cache and not output_attentions: + if AttentionMaskConverter._ignore_causal_mask_sdpa( + attention_mask, + inputs_embeds=input_tensor, + past_key_values_length=past_seen_tokens, + is_training=self.training, + ): + return None + + dtype, device = input_tensor.dtype, input_tensor.device + sequence_length = input_tensor.shape[1] + if using_static_cache: + target_length = past_key_values.get_max_cache_shape() + else: + target_length = ( + attention_mask.shape[-1] + if isinstance(attention_mask, torch.Tensor) + else past_seen_tokens + sequence_length + 1 + ) + + # In case the provided `attention` mask is 2D, we generate a causal mask here (4D). + causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position( + attention_mask, + sequence_length=sequence_length, + target_length=target_length, + dtype=dtype, + device=device, + cache_position=cache_position, + batch_size=input_tensor.shape[0], + ) + + if ( + self.config._attn_implementation == "sdpa" + and attention_mask is not None + and attention_mask.device.type == "cuda" + and not output_attentions + ): + # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when + # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path. + # Details: https://github.com/pytorch/pytorch/issues/110213 + min_dtype = torch.finfo(dtype).min + causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype) + + return causal_mask + + @staticmethod + # Copied from transformers.models.llama.modeling_llama.LlamaPreTrainedModel._prepare_4d_causal_attention_mask_with_cache_position + def _prepare_4d_causal_attention_mask_with_cache_position( + attention_mask: torch.Tensor, + sequence_length: int, + target_length: int, + dtype: torch.dtype, + device: torch.device, + cache_position: torch.Tensor, + batch_size: int, + **kwargs, + ): + """ + Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape + `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing. + + Args: + attention_mask (`torch.Tensor`): + A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape + `(batch_size, 1, query_length, key_value_length)`. + sequence_length (`int`): + The sequence length being processed. 
+            target_length (`int`):
+                The target length: when generating with static cache, the mask should be as long as the static cache,
+                to account for the 0 padding, the part of the cache that is not filled yet.
+            dtype (`torch.dtype`):
+                The dtype to use for the 4D attention mask.
+            device (`torch.device`):
+                The device to place the 4D attention mask on.
+            cache_position (`torch.Tensor`):
+                Indices depicting the position of the input sequence tokens in the sequence.
+            batch_size (`torch.Tensor`):
+                Batch size.
+        """
+        if attention_mask is not None and attention_mask.dim() == 4:
+            # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
+            causal_mask = attention_mask
+        else:
+            min_dtype = torch.finfo(dtype).min
+            causal_mask = torch.full(
+                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device
+            )
+            if sequence_length != 1:
+                causal_mask = torch.triu(causal_mask, diagonal=1)
+            causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
+            causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
+            if attention_mask is not None:
+                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
+                mask_length = attention_mask.shape[-1]
+                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]
+                padding_mask = padding_mask == 0
+                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
+                    padding_mask, min_dtype
+                )
+
+        return causal_mask
+

 SWITCH_TRANSFORMERS_START_DOCSTRING = r"""
@@ -1228,6 +1369,9 @@ def forward(
             should not be returned during inference.
         return_dict (`bool`, *optional*):
             Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+        cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
+            Indices depicting the position of the input sequence tokens in the sequence. It is used to update the
+            cache in the correct position and to infer the complete sequence length.
""" SWITCH_TRANSFORMERS_ENCODER_INPUTS_DOCSTRING = r""" @@ -1355,6 +1499,7 @@ def forward( output_hidden_states: Optional[bool] = None, output_router_logits: Optional[bool] = None, return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, ) -> Union[Tuple[torch.FloatTensor], Seq2SeqMoEModelOutput]: r""" Returns: @@ -1435,6 +1580,7 @@ def forward( output_hidden_states=output_hidden_states, output_router_logits=output_router_logits, return_dict=return_dict, + cache_position=cache_position, ) if not return_dict: @@ -1535,6 +1681,7 @@ def forward( output_hidden_states: Optional[bool] = None, output_router_logits: Optional[bool] = True, return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, ) -> Union[Tuple[torch.FloatTensor], Seq2SeqMoEOutput]: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): @@ -1618,6 +1765,7 @@ def forward( output_hidden_states=output_hidden_states, output_router_logits=output_router_logits, return_dict=return_dict, + cache_position=cache_position, ) sequence_output = decoder_outputs[0] diff --git a/src/transformers/models/t5/configuration_t5.py b/src/transformers/models/t5/configuration_t5.py index e5f2615611b879..be6fbe9528d10a 100644 --- a/src/transformers/models/t5/configuration_t5.py +++ b/src/transformers/models/t5/configuration_t5.py @@ -73,7 +73,12 @@ class T5Config(PretrainedConfig): model_type = "t5" keys_to_ignore_at_inference = ["past_key_values"] - attribute_map = {"hidden_size": "d_model", "num_attention_heads": "num_heads", "num_hidden_layers": "num_layers"} + attribute_map = { + "hidden_size": "d_model", + "num_attention_heads": "num_heads", + "num_hidden_layers": "num_layers", + "head_dim": "d_kv", + } def __init__( self, diff --git a/src/transformers/models/t5/modeling_t5.py b/src/transformers/models/t5/modeling_t5.py index 91596f013ab4f5..9012c8db9feb0a 100644 --- a/src/transformers/models/t5/modeling_t5.py +++ b/src/transformers/models/t5/modeling_t5.py @@ -25,7 +25,9 @@ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from ...activations import ACT2FN +from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache, StaticCache from ...generation import GenerationMixin +from ...modeling_attn_mask_utils import AttentionMaskConverter from ...modeling_outputs import ( BaseModelOutput, BaseModelOutputWithPastAndCrossAttentions, @@ -43,6 +45,7 @@ add_start_docstrings, add_start_docstrings_to_model_forward, is_torch_fx_proxy, + is_torchdynamo_compiling, logging, replace_return_docstrings, ) @@ -339,7 +342,12 @@ def forward(self, hidden_states): class T5Attention(nn.Module): - def __init__(self, config: T5Config, has_relative_attention_bias=False): + def __init__( + self, + config: T5Config, + has_relative_attention_bias=False, + layer_idx: Optional[int] = None, + ): super().__init__() self.is_decoder = config.is_decoder self.has_relative_attention_bias = has_relative_attention_bias @@ -350,6 +358,13 @@ def __init__(self, config: T5Config, has_relative_attention_bias=False): self.n_heads = config.num_heads self.dropout = config.dropout_rate self.inner_dim = self.n_heads * self.key_value_proj_dim + self.layer_idx = layer_idx + if layer_idx is None and self.is_decoder: + logger.warning_once( + f"Instantiating a decoder {self.__class__.__name__} without passing `layer_idx` is not recommended and " + "will to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` " + "when creating this class." 
+ ) # Mesh TensorFlow initialization to avoid scaling before softmax self.q = nn.Linear(self.d_model, self.inner_dim, bias=False) @@ -426,11 +441,14 @@ def _relative_position_bucket(relative_position, bidirectional=True, num_buckets relative_buckets += torch.where(is_small, relative_position, relative_position_if_large) return relative_buckets - def compute_bias(self, query_length, key_length, device=None): + def compute_bias(self, query_length, key_length, device=None, cache_position=None): """Compute binned relative position bias""" if device is None: device = self.relative_attention_bias.weight.device - context_position = torch.arange(query_length, dtype=torch.long, device=device)[:, None] + if cache_position is None: + context_position = torch.arange(query_length, dtype=torch.long, device=device)[:, None] + else: + context_position = cache_position[:, None].to(device) memory_position = torch.arange(key_length, dtype=torch.long, device=device)[None, :] relative_position = memory_position - context_position # shape (query_length, key_length) relative_position_bucket = self._relative_position_bucket( @@ -454,94 +472,72 @@ def forward( query_length=None, use_cache=False, output_attentions=False, + cache_position=None, ): """ Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states). """ # Input is (batch_size, seq_length, dim) - # Mask is (batch_size, key_length) (non-causal) or (batch_size, key_length, key_length) - # past_key_value[0] is (batch_size, n_heads, q_len - 1, dim_per_head) + # Mask is (batch_size, 1, 1, key_length) (non-causal encoder) or (batch_size, 1, seq_length, key_length) (causal decoder) batch_size, seq_length = hidden_states.shape[:2] - real_seq_length = seq_length + # if key_value_states are provided this layer is used as a cross-attention layer for the decoder + is_cross_attention = key_value_states is not None - if past_key_value is not None: - if len(past_key_value) != 2: - raise ValueError( - f"past_key_value should have 2 past states: keys and values. 
Got { len(past_key_value)} past states" - ) - real_seq_length += past_key_value[0].shape[2] if query_length is None else query_length - - key_length = real_seq_length if key_value_states is None else key_value_states.shape[1] - - def shape(states): - """projection""" - return states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) + query_states = self.q(hidden_states) + query_states = query_states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) - def unshape(states): - """reshape""" - return states.transpose(1, 2).contiguous().view(batch_size, -1, self.inner_dim) + if past_key_value is not None: + is_updated = past_key_value.is_updated.get(self.layer_idx) + if is_cross_attention: + # after the first generated id, we can subsequently re-use all key/value_states from cache + curr_past_key_value = past_key_value.cross_attention_cache + else: + curr_past_key_value = past_key_value.self_attention_cache - def project(hidden_states, proj_layer, key_value_states, past_key_value): - """projects hidden states correctly to key/query states""" - if key_value_states is None: - # self-attn - # (batch_size, n_heads, seq_length, dim_per_head) - hidden_states = shape(proj_layer(hidden_states)) - elif past_key_value is None: - # cross-attn - # (batch_size, n_heads, seq_length, dim_per_head) - hidden_states = shape(proj_layer(key_value_states)) + current_states = key_value_states if is_cross_attention else hidden_states + if is_cross_attention and past_key_value is not None and is_updated: + # reuse k,v, cross_attentions + key_states = curr_past_key_value.key_cache[self.layer_idx] + value_states = curr_past_key_value.value_cache[self.layer_idx] + else: + key_states = self.k(current_states) + value_states = self.v(current_states) + key_states = key_states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) + value_states = value_states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) if past_key_value is not None: - if key_value_states is None: - # self-attn - # (batch_size, n_heads, key_length, dim_per_head) - hidden_states = torch.cat([past_key_value, hidden_states], dim=2) - elif past_key_value.shape[2] != key_value_states.shape[1]: - # checking that the `sequence_length` of the `past_key_value` is the same as - # the provided `key_value_states` to support prefix tuning - # cross-attn - # (batch_size, n_heads, seq_length, dim_per_head) - hidden_states = shape(proj_layer(key_value_states)) - else: - # cross-attn - hidden_states = past_key_value - return hidden_states - - # get query states - query_states = shape(self.q(hidden_states)) # (batch_size, n_heads, seq_length, dim_per_head) - - # get key/value states - key_states = project( - hidden_states, self.k, key_value_states, past_key_value[0] if past_key_value is not None else None - ) - value_states = project( - hidden_states, self.v, key_value_states, past_key_value[1] if past_key_value is not None else None - ) + # save all key/value_states to cache to be re-used for fast auto-regressive generation + cache_position = cache_position if not is_cross_attention else None + key_states, value_states = curr_past_key_value.update( + key_states, value_states, self.layer_idx, {"cache_position": cache_position} + ) + # set flag that curr layer for cross-attn is already updated so we can re-use in subsequent calls + if is_cross_attention: + past_key_value.is_updated[self.layer_idx] = True - # compute scores - scores = torch.matmul( - query_states, 
key_states.transpose(3, 2) - ) # equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states), compatible with onnx op>9 + # compute scores, equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states), compatible with onnx op>9 + scores = torch.matmul(query_states, key_states.transpose(3, 2)) if position_bias is None: + key_length = key_states.shape[-2] + # cache position is 0-indexed so we add 1 to get the real length of queries (aka with past) + real_seq_length = query_length if query_length is not None else cache_position[-1] + 1 if not self.has_relative_attention_bias: position_bias = torch.zeros( - (1, self.n_heads, real_seq_length, key_length), device=scores.device, dtype=scores.dtype + (1, self.n_heads, seq_length, key_length), device=scores.device, dtype=scores.dtype ) if self.gradient_checkpointing and self.training: position_bias.requires_grad = True else: - position_bias = self.compute_bias(real_seq_length, key_length, device=scores.device) - - # if key and values are already calculated - # we want only the last query position bias - if past_key_value is not None: - position_bias = position_bias[:, :, -hidden_states.size(1) :, :] + position_bias = self.compute_bias( + real_seq_length, key_length, device=scores.device, cache_position=cache_position + ) + position_bias = position_bias[:, :, -seq_length:, :] if mask is not None: - position_bias = position_bias + mask # (batch_size, n_heads, seq_length, key_length) + causal_mask = mask[:, :, :, : key_states.shape[-2]] + position_bias = position_bias + causal_mask if self.pruned_heads: mask = torch.ones(position_bias.shape[1]) @@ -551,22 +547,22 @@ def project(hidden_states, proj_layer, key_value_states, past_key_value): position_bias_masked = position_bias scores += position_bias_masked - attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as( - scores - ) # (batch_size, n_heads, seq_length, key_length) - attn_weights = nn.functional.dropout( - attn_weights, p=self.dropout, training=self.training - ) # (batch_size, n_heads, seq_length, key_length) + + # (batch_size, n_heads, seq_length, key_length) + attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as(scores) + attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) # Mask heads if we want to if layer_head_mask is not None: attn_weights = attn_weights * layer_head_mask - attn_output = unshape(torch.matmul(attn_weights, value_states)) # (batch_size, seq_length, dim) + attn_output = torch.matmul(attn_weights, value_states) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.view(batch_size, -1, self.inner_dim) attn_output = self.o(attn_output) - present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None - outputs = (attn_output,) + (present_key_value_state,) + (position_bias,) + outputs = (attn_output, past_key_value, position_bias) if output_attentions: outputs = outputs + (attn_weights,) @@ -574,9 +570,11 @@ def project(hidden_states, proj_layer, key_value_states, past_key_value): class T5LayerSelfAttention(nn.Module): - def __init__(self, config, has_relative_attention_bias=False): + def __init__(self, config, has_relative_attention_bias=False, layer_idx: Optional[int] = None): super().__init__() - self.SelfAttention = T5Attention(config, has_relative_attention_bias=has_relative_attention_bias) + self.SelfAttention = T5Attention( + config, has_relative_attention_bias=has_relative_attention_bias, layer_idx=layer_idx + ) 
self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) self.dropout = nn.Dropout(config.dropout_rate) @@ -589,6 +587,7 @@ def forward( past_key_value=None, use_cache=False, output_attentions=False, + cache_position=None, ): normed_hidden_states = self.layer_norm(hidden_states) attention_output = self.SelfAttention( @@ -599,6 +598,7 @@ def forward( past_key_value=past_key_value, use_cache=use_cache, output_attentions=output_attentions, + cache_position=cache_position, ) hidden_states = hidden_states + self.dropout(attention_output[0]) outputs = (hidden_states,) + attention_output[1:] # add attentions if we output them @@ -606,9 +606,9 @@ def forward( class T5LayerCrossAttention(nn.Module): - def __init__(self, config): + def __init__(self, config, layer_idx: Optional[int] = None): super().__init__() - self.EncDecAttention = T5Attention(config, has_relative_attention_bias=False) + self.EncDecAttention = T5Attention(config, has_relative_attention_bias=False, layer_idx=layer_idx) self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) self.dropout = nn.Dropout(config.dropout_rate) @@ -623,6 +623,7 @@ def forward( use_cache=False, query_length=None, output_attentions=False, + cache_position=None, ): normed_hidden_states = self.layer_norm(hidden_states) attention_output = self.EncDecAttention( @@ -635,6 +636,7 @@ def forward( use_cache=use_cache, query_length=query_length, output_attentions=output_attentions, + cache_position=cache_position, ) layer_output = hidden_states + self.dropout(attention_output[0]) outputs = (layer_output,) + attention_output[1:] # add attentions if we output them @@ -642,13 +644,15 @@ def forward( class T5Block(nn.Module): - def __init__(self, config, has_relative_attention_bias=False): + def __init__(self, config, has_relative_attention_bias=False, layer_idx: Optional[int] = None): super().__init__() self.is_decoder = config.is_decoder self.layer = nn.ModuleList() - self.layer.append(T5LayerSelfAttention(config, has_relative_attention_bias=has_relative_attention_bias)) + self.layer.append( + T5LayerSelfAttention(config, has_relative_attention_bias=has_relative_attention_bias, layer_idx=layer_idx) + ) if self.is_decoder: - self.layer.append(T5LayerCrossAttention(config)) + self.layer.append(T5LayerCrossAttention(config, layer_idx=layer_idx)) self.layer.append(T5LayerFF(config)) @@ -666,34 +670,19 @@ def forward( use_cache=False, output_attentions=False, return_dict=True, + cache_position=None, ): - if past_key_value is not None: - if not self.is_decoder: - logger.warning("`past_key_values` is passed to the encoder. Please make sure this is intended.") - expected_num_past_key_values = 2 if encoder_hidden_states is None else 4 - - if len(past_key_value) != expected_num_past_key_values: - raise ValueError( - f"There should be {expected_num_past_key_values} past states. " - f"{'2 (key / value) for cross attention. 
' if expected_num_past_key_values == 4 else ''}" - f"Got {len(past_key_value)} past key / value states" - ) - - self_attn_past_key_value = past_key_value[:2] - cross_attn_past_key_value = past_key_value[2:] - else: - self_attn_past_key_value, cross_attn_past_key_value = None, None - self_attention_outputs = self.layer[0]( hidden_states, attention_mask=attention_mask, position_bias=position_bias, layer_head_mask=layer_head_mask, - past_key_value=self_attn_past_key_value, + past_key_value=past_key_value, use_cache=use_cache, output_attentions=output_attentions, + cache_position=cache_position, ) - hidden_states, present_key_value_state = self_attention_outputs[:2] + hidden_states, past_key_value = self_attention_outputs[:2] attention_outputs = self_attention_outputs[2:] # Keep self-attention outputs and relative position weights # clamp inf values to enable fp16 training @@ -707,25 +696,18 @@ def forward( do_cross_attention = self.is_decoder and encoder_hidden_states is not None if do_cross_attention: - # the actual query length is unknown for cross attention - # if using past key value states. Need to inject it here - if present_key_value_state is not None: - query_length = present_key_value_state[0].shape[2] - else: - query_length = None - cross_attention_outputs = self.layer[1]( hidden_states, key_value_states=encoder_hidden_states, attention_mask=encoder_attention_mask, position_bias=encoder_decoder_position_bias, layer_head_mask=cross_attn_layer_head_mask, - past_key_value=cross_attn_past_key_value, - query_length=query_length, + past_key_value=past_key_value, + query_length=cache_position[-1] + 1, use_cache=use_cache, output_attentions=output_attentions, ) - hidden_states = cross_attention_outputs[0] + hidden_states, past_key_value = cross_attention_outputs[:2] # clamp inf values to enable fp16 training if hidden_states.dtype == torch.float16: @@ -736,10 +718,6 @@ def forward( ) hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) - # Combine self attn and cross attn key value states - if present_key_value_state is not None: - present_key_value_state = present_key_value_state + cross_attention_outputs[1] - # Keep cross-attention outputs and relative position weights attention_outputs = attention_outputs + cross_attention_outputs[2:] @@ -758,11 +736,11 @@ def forward( outputs = (hidden_states,) if use_cache: - outputs = outputs + (present_key_value_state,) + attention_outputs + outputs = outputs + (past_key_value,) + attention_outputs else: outputs = outputs + attention_outputs - return outputs # hidden-states, present_key_value_states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) + return outputs # hidden-states, past_key_value, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) class T5ClassificationHead(nn.Module): @@ -794,6 +772,9 @@ class T5PreTrainedModel(PreTrainedModel): base_model_prefix = "transformer" is_parallelizable = True supports_gradient_checkpointing = True + _supports_quantized_cache = False # enc-dec models don't support yet + _supports_static_cache = True + _supports_cache_class = True _no_split_modules = ["T5Block"] _keep_in_fp32_modules = ["wo"] @@ -905,7 +886,7 @@ def __init__(self, config, embed_tokens=None): self.is_decoder = config.is_decoder self.block = nn.ModuleList( - [T5Block(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)] + [T5Block(config, 
has_relative_attention_bias=bool(i == 0), layer_idx=i) for i in range(config.num_layers)] ) self.final_layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) self.dropout = nn.Dropout(config.dropout_rate) @@ -981,6 +962,7 @@ def forward( output_attentions=None, output_hidden_states=None, return_dict=None, + cache_position=None, ): # Model parallel if self.model_parallel: @@ -1007,6 +989,13 @@ def forward( err_msg_prefix = "decoder_" if self.is_decoder else "" raise ValueError(f"You have to specify either {err_msg_prefix}input_ids or {err_msg_prefix}inputs_embeds") + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + if inputs_embeds is None: if self.embed_tokens is None: raise ValueError("You have to initialize the model with valid token embeddings") @@ -1014,23 +1003,57 @@ def forward( batch_size, seq_length = input_shape - # required mask seq length can be calculated via length of past - mask_seq_length = past_key_values[0][0].shape[2] + seq_length if past_key_values is not None else seq_length - if use_cache is True: if not self.is_decoder: raise ValueError(f"`use_cache` can only be set to `True` if {self} is used as a decoder") - # initialize past_key_values with `None` if past does not exist - if past_key_values is None: - past_key_values = [None] * len(self.block) + # initialize past_key_values + return_legacy_cache = False + return_self_attention_cache = False + if self.is_decoder and (use_cache or past_key_values is not None): + if isinstance(past_key_values, Cache) and not isinstance(past_key_values, EncoderDecoderCache): + return_self_attention_cache = True + past_key_values = EncoderDecoderCache(past_key_values, DynamicCache()) + elif not isinstance(past_key_values, EncoderDecoderCache): + return_legacy_cache = True + logger.warning_once( + "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. " + "You should pass an instance of `EncoderDecoderCache` instead, e.g. " + "`past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`." + ) + past_key_values = EncoderDecoderCache.from_legacy_cache(past_key_values) + elif past_key_values is None: + past_key_values = EncoderDecoderCache(DynamicCache(), DynamicCache()) + elif not self.is_decoder: + # do not pass cache object down the line for encoder stack + # it messes indexing later in decoder-stack because cache object is modified in-place + past_key_values = None + + past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0 + if cache_position is None: + cache_position = torch.arange( + past_key_values_length, past_key_values_length + seq_length, device=inputs_embeds.device + ) - if attention_mask is None: + if attention_mask is None and not is_torchdynamo_compiling(): + # required mask seq length can be calculated via length of past cache + mask_seq_length = past_key_values_length + seq_length attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device) - # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] - # ourselves in which case we just need to make it broadcastable to all heads. 
- extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape) + if self.config.is_decoder: + causal_mask = self._update_causal_mask( + attention_mask, + inputs_embeds, + cache_position, + past_key_values.self_attention_cache if past_key_values is not None else None, + output_attentions, + ) + elif attention_mask is not None: + causal_mask = attention_mask[:, None, None, :] + causal_mask = causal_mask.to(dtype=inputs_embeds.dtype) + causal_mask = (1.0 - causal_mask) * torch.finfo(inputs_embeds.dtype).min + else: + causal_mask = None # If a 2D or 3D attention mask is provided for the cross-attention # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] @@ -1045,17 +1068,9 @@ def forward( else: encoder_extended_attention_mask = None - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning_once( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." - ) - use_cache = False - # Prepare head mask if needed head_mask = self.get_head_mask(head_mask, self.config.num_layers) cross_attn_head_mask = self.get_head_mask(cross_attn_head_mask, self.config.num_layers) - present_key_value_states = () if use_cache else None all_hidden_states = () if output_hidden_states else None all_attentions = () if output_attentions else None all_cross_attentions = () if (output_attentions and self.is_decoder) else None @@ -1064,15 +1079,15 @@ def forward( hidden_states = self.dropout(inputs_embeds) - for i, (layer_module, past_key_value) in enumerate(zip(self.block, past_key_values)): + for i, layer_module in enumerate(self.block): layer_head_mask = head_mask[i] cross_attn_layer_head_mask = cross_attn_head_mask[i] # Model parallel if self.model_parallel: torch.cuda.set_device(hidden_states.device) # Ensure that attention_mask is always on the same device as hidden_states - if attention_mask is not None: - attention_mask = attention_mask.to(hidden_states.device) + if causal_mask is not None: + causal_mask = causal_mask.to(hidden_states.device) if position_bias is not None: position_bias = position_bias.to(hidden_states.device) if encoder_hidden_states is not None: @@ -1092,7 +1107,7 @@ def forward( layer_outputs = self._gradient_checkpointing_func( layer_module.forward, hidden_states, - extended_attention_mask, + causal_mask, position_bias, encoder_hidden_states, encoder_extended_attention_mask, @@ -1102,20 +1117,24 @@ def forward( None, # past_key_value is always None with gradient checkpointing use_cache, output_attentions, + return_dict, + cache_position, ) else: layer_outputs = layer_module( hidden_states, - attention_mask=extended_attention_mask, + attention_mask=causal_mask, position_bias=position_bias, encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_extended_attention_mask, encoder_decoder_position_bias=encoder_decoder_position_bias, layer_head_mask=layer_head_mask, cross_attn_layer_head_mask=cross_attn_layer_head_mask, - past_key_value=past_key_value, + past_key_value=past_key_values, use_cache=use_cache, output_attentions=output_attentions, + return_dict=return_dict, + cache_position=cache_position, ) # layer_outputs is a tuple with: @@ -1123,7 +1142,7 @@ def forward( if use_cache is False: layer_outputs = layer_outputs[:1] + (None,) + layer_outputs[1:] - hidden_states, present_key_value_state = layer_outputs[:2] + hidden_states, next_decoder_cache = layer_outputs[:2] # We share the position biases between the layers - the first layer store them # layer_outputs 
= hidden-states, key-value-states (self-attention position bias), (self-attention weights), @@ -1131,9 +1150,6 @@ def forward( position_bias = layer_outputs[2] if self.is_decoder and encoder_hidden_states is not None: encoder_decoder_position_bias = layer_outputs[4 if output_attentions else 3] - # append next layer key value states - if use_cache: - present_key_value_states = present_key_value_states + (present_key_value_state,) if output_attentions: all_attentions = all_attentions + (layer_outputs[3],) @@ -1153,12 +1169,18 @@ def forward( if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) + next_cache = next_decoder_cache if use_cache else None + if return_self_attention_cache: + next_cache = past_key_values.self_attention_cache + if return_legacy_cache: + next_cache = past_key_values.to_legacy_cache() + if not return_dict: return tuple( v for v in [ hidden_states, - present_key_value_states, + next_cache, all_hidden_states, all_attentions, all_cross_attentions, @@ -1167,12 +1189,135 @@ def forward( ) return BaseModelOutputWithPastAndCrossAttentions( last_hidden_state=hidden_states, - past_key_values=present_key_value_states, + past_key_values=next_cache, hidden_states=all_hidden_states, attentions=all_attentions, cross_attentions=all_cross_attentions, ) + # Copied from transformers.models.llama.modeling_llama.LlamaModel._update_causal_mask + def _update_causal_mask( + self, + attention_mask: torch.Tensor, + input_tensor: torch.Tensor, + cache_position: torch.Tensor, + past_key_values: Cache, + output_attentions: bool, + ): + if self.config._attn_implementation == "flash_attention_2": + if attention_mask is not None and 0.0 in attention_mask: + return attention_mask + return None + + # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in + # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail + # to infer the attention mask. + past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + using_static_cache = isinstance(past_key_values, StaticCache) + + # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward + if self.config._attn_implementation == "sdpa" and not using_static_cache and not output_attentions: + if AttentionMaskConverter._ignore_causal_mask_sdpa( + attention_mask, + inputs_embeds=input_tensor, + past_key_values_length=past_seen_tokens, + is_training=self.training, + ): + return None + + dtype, device = input_tensor.dtype, input_tensor.device + sequence_length = input_tensor.shape[1] + if using_static_cache: + target_length = past_key_values.get_max_cache_shape() + else: + target_length = ( + attention_mask.shape[-1] + if isinstance(attention_mask, torch.Tensor) + else past_seen_tokens + sequence_length + 1 + ) + + # In case the provided `attention` mask is 2D, we generate a causal mask here (4D). + causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position( + attention_mask, + sequence_length=sequence_length, + target_length=target_length, + dtype=dtype, + device=device, + cache_position=cache_position, + batch_size=input_tensor.shape[0], + ) + + if ( + self.config._attn_implementation == "sdpa" + and attention_mask is not None + and attention_mask.device.type == "cuda" + and not output_attentions + ): + # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when + # using left padding. 
This is required by F.scaled_dot_product_attention memory-efficient attention path. + # Details: https://github.com/pytorch/pytorch/issues/110213 + min_dtype = torch.finfo(dtype).min + causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype) + + return causal_mask + + @staticmethod + # Copied from transformers.models.llama.modeling_llama.LlamaPreTrainedModel._prepare_4d_causal_attention_mask_with_cache_position + def _prepare_4d_causal_attention_mask_with_cache_position( + attention_mask: torch.Tensor, + sequence_length: int, + target_length: int, + dtype: torch.dtype, + device: torch.device, + cache_position: torch.Tensor, + batch_size: int, + **kwargs, + ): + """ + Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape + `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing. + + Args: + attention_mask (`torch.Tensor`): + A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape + `(batch_size, 1, query_length, key_value_length)`. + sequence_length (`int`): + The sequence length being processed. + target_length (`int`): + The target length: when generating with static cache, the mask should be as long as the static cache, + to account for the 0 padding, the part of the cache that is not filled yet. + dtype (`torch.dtype`): + The dtype to use for the 4D attention mask. + device (`torch.device`): + The device to plcae the 4D attention mask on. + cache_position (`torch.Tensor`): + Indices depicting the position of the input sequence tokens in the sequence. + batch_size (`torch.Tensor`): + Batch size. + """ + if attention_mask is not None and attention_mask.dim() == 4: + # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing. + causal_mask = attention_mask + else: + min_dtype = torch.finfo(dtype).min + causal_mask = torch.full( + (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device + ) + if sequence_length != 1: + causal_mask = torch.triu(causal_mask, diagonal=1) + causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1) + causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1) + if attention_mask is not None: + causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit + mask_length = attention_mask.shape[-1] + padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :] + padding_mask = padding_mask == 0 + causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill( + padding_mask, min_dtype + ) + + return causal_mask + T5_START_DOCSTRING = r""" @@ -1286,6 +1431,9 @@ def forward( more detail. return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): + Indices depicting the position of the input sequence tokens in the sequence. It is used to update the + cache in the correct position and to infer the complete sequence length. 
""" T5_ENCODER_INPUTS_DOCSTRING = r""" @@ -1446,6 +1594,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, ) -> Union[Tuple[torch.FloatTensor], Seq2SeqModelOutput]: r""" Returns: @@ -1525,6 +1674,7 @@ def forward( output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, + cache_position=cache_position, ) if not return_dict: @@ -1656,6 +1806,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, ) -> Union[Tuple[torch.FloatTensor], Seq2SeqLMOutput]: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): @@ -1750,6 +1901,7 @@ def forward( output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, + cache_position=cache_position, ) sequence_output = decoder_outputs[0] diff --git a/src/transformers/models/udop/modeling_udop.py b/src/transformers/models/udop/modeling_udop.py index 6be8752d5b63b0..1928ac8a5c20c9 100644 --- a/src/transformers/models/udop/modeling_udop.py +++ b/src/transformers/models/udop/modeling_udop.py @@ -34,13 +34,16 @@ ) from ...activations import ACT2FN +from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache, StaticCache from ...generation import GenerationMixin +from ...modeling_attn_mask_utils import AttentionMaskConverter from ...modeling_utils import PreTrainedModel from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer from ...utils import ( ModelOutput, add_start_docstrings, add_start_docstrings_to_model_forward, + is_torchdynamo_compiling, replace_return_docstrings, ) @@ -154,6 +157,9 @@ more detail. return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): + Indices depicting the position of the input sequence tokens in the sequence. It is used to update the + cache in the correct position and to infer the complete sequence length. """ @@ -411,6 +417,8 @@ class UdopPreTrainedModel(PreTrainedModel): config_class = UdopConfig base_model_prefix = "transformer" supports_gradient_checkpointing = True + _supports_cache_class = True + _supports_static_cache = False _keep_in_fp32_modules = ["wo"] def _init_weights(self, module): @@ -598,7 +606,12 @@ def forward(self, hidden_states): # Copied from transformers.models.t5.modeling_t5.T5Attention with T5->Udop class UdopAttention(nn.Module): - def __init__(self, config: UdopConfig, has_relative_attention_bias=False): + def __init__( + self, + config: UdopConfig, + has_relative_attention_bias=False, + layer_idx: Optional[int] = None, + ): super().__init__() self.is_decoder = config.is_decoder self.has_relative_attention_bias = has_relative_attention_bias @@ -609,6 +622,13 @@ def __init__(self, config: UdopConfig, has_relative_attention_bias=False): self.n_heads = config.num_heads self.dropout = config.dropout_rate self.inner_dim = self.n_heads * self.key_value_proj_dim + self.layer_idx = layer_idx + if layer_idx is None and self.is_decoder: + logger.warning_once( + f"Instantiating a decoder {self.__class__.__name__} without passing `layer_idx` is not recommended and " + "will to errors during the forward call, if caching is used. 
Please make sure to provide a `layer_idx` " + "when creating this class." + ) # Mesh TensorFlow initialization to avoid scaling before softmax self.q = nn.Linear(self.d_model, self.inner_dim, bias=False) @@ -685,11 +705,14 @@ def _relative_position_bucket(relative_position, bidirectional=True, num_buckets relative_buckets += torch.where(is_small, relative_position, relative_position_if_large) return relative_buckets - def compute_bias(self, query_length, key_length, device=None): + def compute_bias(self, query_length, key_length, device=None, cache_position=None): """Compute binned relative position bias""" if device is None: device = self.relative_attention_bias.weight.device - context_position = torch.arange(query_length, dtype=torch.long, device=device)[:, None] + if cache_position is None: + context_position = torch.arange(query_length, dtype=torch.long, device=device)[:, None] + else: + context_position = cache_position[:, None].to(device) memory_position = torch.arange(key_length, dtype=torch.long, device=device)[None, :] relative_position = memory_position - context_position # shape (query_length, key_length) relative_position_bucket = self._relative_position_bucket( @@ -713,94 +736,72 @@ def forward( query_length=None, use_cache=False, output_attentions=False, + cache_position=None, ): """ Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states). """ # Input is (batch_size, seq_length, dim) - # Mask is (batch_size, key_length) (non-causal) or (batch_size, key_length, key_length) - # past_key_value[0] is (batch_size, n_heads, q_len - 1, dim_per_head) + # Mask is (batch_size, 1, 1, key_length) (non-causal encoder) or (batch_size, 1, seq_length, key_length) (causal decoder) batch_size, seq_length = hidden_states.shape[:2] - real_seq_length = seq_length - - if past_key_value is not None: - if len(past_key_value) != 2: - raise ValueError( - f"past_key_value should have 2 past states: keys and values. 
Got { len(past_key_value)} past states" - ) - real_seq_length += past_key_value[0].shape[2] if query_length is None else query_length - - key_length = real_seq_length if key_value_states is None else key_value_states.shape[1] + # if key_value_states are provided this layer is used as a cross-attention layer for the decoder + is_cross_attention = key_value_states is not None - def shape(states): - """projection""" - return states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) + query_states = self.q(hidden_states) + query_states = query_states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) - def unshape(states): - """reshape""" - return states.transpose(1, 2).contiguous().view(batch_size, -1, self.inner_dim) + if past_key_value is not None: + is_updated = past_key_value.is_updated.get(self.layer_idx) + if is_cross_attention: + # after the first generated id, we can subsequently re-use all key/value_states from cache + curr_past_key_value = past_key_value.cross_attention_cache + else: + curr_past_key_value = past_key_value.self_attention_cache - def project(hidden_states, proj_layer, key_value_states, past_key_value): - """projects hidden states correctly to key/query states""" - if key_value_states is None: - # self-attn - # (batch_size, n_heads, seq_length, dim_per_head) - hidden_states = shape(proj_layer(hidden_states)) - elif past_key_value is None: - # cross-attn - # (batch_size, n_heads, seq_length, dim_per_head) - hidden_states = shape(proj_layer(key_value_states)) + current_states = key_value_states if is_cross_attention else hidden_states + if is_cross_attention and past_key_value is not None and is_updated: + # reuse k,v, cross_attentions + key_states = curr_past_key_value.key_cache[self.layer_idx] + value_states = curr_past_key_value.value_cache[self.layer_idx] + else: + key_states = self.k(current_states) + value_states = self.v(current_states) + key_states = key_states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) + value_states = value_states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) if past_key_value is not None: - if key_value_states is None: - # self-attn - # (batch_size, n_heads, key_length, dim_per_head) - hidden_states = torch.cat([past_key_value, hidden_states], dim=2) - elif past_key_value.shape[2] != key_value_states.shape[1]: - # checking that the `sequence_length` of the `past_key_value` is the same as - # the provided `key_value_states` to support prefix tuning - # cross-attn - # (batch_size, n_heads, seq_length, dim_per_head) - hidden_states = shape(proj_layer(key_value_states)) - else: - # cross-attn - hidden_states = past_key_value - return hidden_states - - # get query states - query_states = shape(self.q(hidden_states)) # (batch_size, n_heads, seq_length, dim_per_head) - - # get key/value states - key_states = project( - hidden_states, self.k, key_value_states, past_key_value[0] if past_key_value is not None else None - ) - value_states = project( - hidden_states, self.v, key_value_states, past_key_value[1] if past_key_value is not None else None - ) + # save all key/value_states to cache to be re-used for fast auto-regressive generation + cache_position = cache_position if not is_cross_attention else None + key_states, value_states = curr_past_key_value.update( + key_states, value_states, self.layer_idx, {"cache_position": cache_position} + ) + # set flag that curr layer for cross-attn is already updated so we can re-use in subsequent calls 
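
The hunk above installs the new caching pattern in UdopAttention: the self-attention and cross-attention projections now live in the two halves of an EncoderDecoderCache, and the per-layer `is_updated` flag ensures the encoder-side key/value projections are computed only once per generation. A minimal sketch of that pattern using the public cache classes follows; the tensor shapes are invented for illustration and this is not the layer's actual code.

import torch
from transformers.cache_utils import DynamicCache, EncoderDecoderCache

cache = EncoderDecoderCache(DynamicCache(), DynamicCache())
layer_idx = 0

# Cross-attention: encoder keys/values are projected once, then reused on every step.
enc_k = torch.randn(1, 8, 16, 64)  # (batch, heads, encoder_len, head_dim) -- made-up shape
enc_v = torch.randn(1, 8, 16, 64)
if not cache.is_updated.get(layer_idx, False):
    cache.cross_attention_cache.update(enc_k, enc_v, layer_idx)
    cache.is_updated[layer_idx] = True

# Self-attention: each decoding step appends one new key/value per layer.
for step in range(3):
    new_k = torch.randn(1, 8, 1, 64)
    new_v = torch.randn(1, 8, 1, 64)
    keys, values = cache.self_attention_cache.update(new_k, new_v, layer_idx)
    print(step, keys.shape[-2])  # the cached key length grows: 1, 2, 3
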
+ if is_cross_attention: + past_key_value.is_updated[self.layer_idx] = True - # compute scores - scores = torch.matmul( - query_states, key_states.transpose(3, 2) - ) # equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states), compatible with onnx op>9 + # compute scores, equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states), compatible with onnx op>9 + scores = torch.matmul(query_states, key_states.transpose(3, 2)) if position_bias is None: + key_length = key_states.shape[-2] + # cache position is 0-indexed so we add 1 to get the real length of queries (aka with past) + real_seq_length = query_length if query_length is not None else cache_position[-1] + 1 if not self.has_relative_attention_bias: position_bias = torch.zeros( - (1, self.n_heads, real_seq_length, key_length), device=scores.device, dtype=scores.dtype + (1, self.n_heads, seq_length, key_length), device=scores.device, dtype=scores.dtype ) if self.gradient_checkpointing and self.training: position_bias.requires_grad = True else: - position_bias = self.compute_bias(real_seq_length, key_length, device=scores.device) - - # if key and values are already calculated - # we want only the last query position bias - if past_key_value is not None: - position_bias = position_bias[:, :, -hidden_states.size(1) :, :] + position_bias = self.compute_bias( + real_seq_length, key_length, device=scores.device, cache_position=cache_position + ) + position_bias = position_bias[:, :, -seq_length:, :] if mask is not None: - position_bias = position_bias + mask # (batch_size, n_heads, seq_length, key_length) + causal_mask = mask[:, :, :, : key_states.shape[-2]] + position_bias = position_bias + causal_mask if self.pruned_heads: mask = torch.ones(position_bias.shape[1]) @@ -810,22 +811,22 @@ def project(hidden_states, proj_layer, key_value_states, past_key_value): position_bias_masked = position_bias scores += position_bias_masked - attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as( - scores - ) # (batch_size, n_heads, seq_length, key_length) - attn_weights = nn.functional.dropout( - attn_weights, p=self.dropout, training=self.training - ) # (batch_size, n_heads, seq_length, key_length) + + # (batch_size, n_heads, seq_length, key_length) + attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as(scores) + attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) # Mask heads if we want to if layer_head_mask is not None: attn_weights = attn_weights * layer_head_mask - attn_output = unshape(torch.matmul(attn_weights, value_states)) # (batch_size, seq_length, dim) + attn_output = torch.matmul(attn_weights, value_states) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.view(batch_size, -1, self.inner_dim) attn_output = self.o(attn_output) - present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None - outputs = (attn_output,) + (present_key_value_state,) + (position_bias,) + outputs = (attn_output, past_key_value, position_bias) if output_attentions: outputs = outputs + (attn_weights,) @@ -834,9 +835,11 @@ def project(hidden_states, proj_layer, key_value_states, past_key_value): # Copied from transformers.models.t5.modeling_t5.T5LayerSelfAttention with T5->Udop class UdopLayerSelfAttention(nn.Module): - def __init__(self, config, has_relative_attention_bias=False): + def __init__(self, config, has_relative_attention_bias=False, layer_idx: Optional[int] = None): super().__init__() - 
self.SelfAttention = UdopAttention(config, has_relative_attention_bias=has_relative_attention_bias) + self.SelfAttention = UdopAttention( + config, has_relative_attention_bias=has_relative_attention_bias, layer_idx=layer_idx + ) self.layer_norm = UdopLayerNorm(config.d_model, eps=config.layer_norm_epsilon) self.dropout = nn.Dropout(config.dropout_rate) @@ -849,6 +852,7 @@ def forward( past_key_value=None, use_cache=False, output_attentions=False, + cache_position=None, ): normed_hidden_states = self.layer_norm(hidden_states) attention_output = self.SelfAttention( @@ -859,6 +863,7 @@ def forward( past_key_value=past_key_value, use_cache=use_cache, output_attentions=output_attentions, + cache_position=cache_position, ) hidden_states = hidden_states + self.dropout(attention_output[0]) outputs = (hidden_states,) + attention_output[1:] # add attentions if we output them @@ -867,9 +872,9 @@ def forward( # Copied from transformers.models.t5.modeling_t5.T5LayerCrossAttention with T5->Udop class UdopLayerCrossAttention(nn.Module): - def __init__(self, config): + def __init__(self, config, layer_idx: Optional[int] = None): super().__init__() - self.EncDecAttention = UdopAttention(config, has_relative_attention_bias=False) + self.EncDecAttention = UdopAttention(config, has_relative_attention_bias=False, layer_idx=layer_idx) self.layer_norm = UdopLayerNorm(config.d_model, eps=config.layer_norm_epsilon) self.dropout = nn.Dropout(config.dropout_rate) @@ -884,6 +889,7 @@ def forward( use_cache=False, query_length=None, output_attentions=False, + cache_position=None, ): normed_hidden_states = self.layer_norm(hidden_states) attention_output = self.EncDecAttention( @@ -896,6 +902,7 @@ def forward( use_cache=use_cache, query_length=query_length, output_attentions=output_attentions, + cache_position=cache_position, ) layer_output = hidden_states + self.dropout(attention_output[0]) outputs = (layer_output,) + attention_output[1:] # add attentions if we output them @@ -904,13 +911,17 @@ def forward( # Copied from transformers.models.t5.modeling_t5.T5Block with T5->Udop class UdopBlock(nn.Module): - def __init__(self, config, has_relative_attention_bias=False): + def __init__(self, config, has_relative_attention_bias=False, layer_idx: Optional[int] = None): super().__init__() self.is_decoder = config.is_decoder self.layer = nn.ModuleList() - self.layer.append(UdopLayerSelfAttention(config, has_relative_attention_bias=has_relative_attention_bias)) + self.layer.append( + UdopLayerSelfAttention( + config, has_relative_attention_bias=has_relative_attention_bias, layer_idx=layer_idx + ) + ) if self.is_decoder: - self.layer.append(UdopLayerCrossAttention(config)) + self.layer.append(UdopLayerCrossAttention(config, layer_idx=layer_idx)) self.layer.append(UdopLayerFF(config)) @@ -928,34 +939,19 @@ def forward( use_cache=False, output_attentions=False, return_dict=True, + cache_position=None, ): - if past_key_value is not None: - if not self.is_decoder: - logger.warning("`past_key_values` is passed to the encoder. Please make sure this is intended.") - expected_num_past_key_values = 2 if encoder_hidden_states is None else 4 - - if len(past_key_value) != expected_num_past_key_values: - raise ValueError( - f"There should be {expected_num_past_key_values} past states. " - f"{'2 (key / value) for cross attention. 
' if expected_num_past_key_values == 4 else ''}" - f"Got {len(past_key_value)} past key / value states" - ) - - self_attn_past_key_value = past_key_value[:2] - cross_attn_past_key_value = past_key_value[2:] - else: - self_attn_past_key_value, cross_attn_past_key_value = None, None - self_attention_outputs = self.layer[0]( hidden_states, attention_mask=attention_mask, position_bias=position_bias, layer_head_mask=layer_head_mask, - past_key_value=self_attn_past_key_value, + past_key_value=past_key_value, use_cache=use_cache, output_attentions=output_attentions, + cache_position=cache_position, ) - hidden_states, present_key_value_state = self_attention_outputs[:2] + hidden_states, past_key_value = self_attention_outputs[:2] attention_outputs = self_attention_outputs[2:] # Keep self-attention outputs and relative position weights # clamp inf values to enable fp16 training @@ -969,25 +965,18 @@ def forward( do_cross_attention = self.is_decoder and encoder_hidden_states is not None if do_cross_attention: - # the actual query length is unknown for cross attention - # if using past key value states. Need to inject it here - if present_key_value_state is not None: - query_length = present_key_value_state[0].shape[2] - else: - query_length = None - cross_attention_outputs = self.layer[1]( hidden_states, key_value_states=encoder_hidden_states, attention_mask=encoder_attention_mask, position_bias=encoder_decoder_position_bias, layer_head_mask=cross_attn_layer_head_mask, - past_key_value=cross_attn_past_key_value, - query_length=query_length, + past_key_value=past_key_value, + query_length=cache_position[-1] + 1, use_cache=use_cache, output_attentions=output_attentions, ) - hidden_states = cross_attention_outputs[0] + hidden_states, past_key_value = cross_attention_outputs[:2] # clamp inf values to enable fp16 training if hidden_states.dtype == torch.float16: @@ -998,10 +987,6 @@ def forward( ) hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) - # Combine self attn and cross attn key value states - if present_key_value_state is not None: - present_key_value_state = present_key_value_state + cross_attention_outputs[1] - # Keep cross-attention outputs and relative position weights attention_outputs = attention_outputs + cross_attention_outputs[2:] @@ -1020,11 +1005,11 @@ def forward( outputs = (hidden_states,) if use_cache: - outputs = outputs + (present_key_value_state,) + attention_outputs + outputs = outputs + (past_key_value,) + attention_outputs else: outputs = outputs + attention_outputs - return outputs # hidden-states, present_key_value_states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) + return outputs # hidden-states, past_key_value, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) class UdopCellEmbeddings(nn.Module): @@ -1286,7 +1271,7 @@ def __init__(self, config, embed_tokens=None, embed_patches=None): self.num_layers = config.num_layers self.block = nn.ModuleList( - [UdopBlock(config, has_relative_attention_bias=bool(i == 0)) for i in range(self.num_layers)] + [UdopBlock(config, has_relative_attention_bias=bool(i == 0), layer_idx=i) for i in range(self.num_layers)] ) self.final_layer_norm = UdopLayerNorm(config.d_model, eps=config.layer_norm_epsilon) @@ -1338,6 +1323,7 @@ def forward( output_attentions=None, output_hidden_states=None, return_dict=None, + cache_position=None, ): use_cache = use_cache if use_cache is not 
None else self.config.use_cache output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions @@ -1399,26 +1385,54 @@ def forward( batch_size, seq_length = input_shape - # required mask seq length can be calculated via length of past - mask_seq_length = past_key_values[0][0].shape[2] + seq_length if past_key_values is not None else seq_length - if use_cache is True: assert self.is_decoder, "`use_cache` can only be set to `True` if {} is used as a decoder".format(self) - if attention_mask is None: - attention_mask = torch.ones(batch_size, mask_seq_length).to(inputs_embeds.device) - if self.is_decoder and encoder_attention_mask is None and encoder_hidden_states is not None: - encoder_seq_length = encoder_hidden_states.shape[1] - encoder_attention_mask = torch.ones( - batch_size, encoder_seq_length, device=inputs_embeds.device, dtype=torch.long + # initialize past_key_values + return_legacy_cache = False + return_self_attention_cache = False + if self.is_decoder and (use_cache or past_key_values is not None): + if isinstance(past_key_values, Cache) and not isinstance(past_key_values, EncoderDecoderCache): + return_self_attention_cache = True + past_key_values = EncoderDecoderCache(past_key_values, DynamicCache()) + elif not isinstance(past_key_values, EncoderDecoderCache): + return_legacy_cache = True + logger.warning_once( + "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. " + "You should pass an instance of `EncoderDecoderCache` instead, e.g. " + "`past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`." + ) + past_key_values = EncoderDecoderCache.from_legacy_cache(past_key_values) + elif past_key_values is None: + past_key_values = EncoderDecoderCache(DynamicCache(), DynamicCache()) + elif not self.is_decoder: + # do not pass cache object down the line for encoder stack + # it messes indexing later in decoder-stack because cache object is modified in-place + past_key_values = None + + past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0 + if cache_position is None: + cache_position = torch.arange( + past_key_values_length, past_key_values_length + seq_length, device=inputs_embeds.device ) - # initialize past_key_values with `None` if past does not exist - if past_key_values is None: - past_key_values = [None] * len(self.block) + if attention_mask is None and not is_torchdynamo_compiling(): + # required mask seq length can be calculated via length of past cache + mask_seq_length = past_key_values_length + seq_length + attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device) - # ourselves in which case we just need to make it broadcastable to all heads. 
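
For backwards compatibility the stack converts a legacy tuple of past key/values into an EncoderDecoderCache on entry and, when `return_legacy_cache` is set, back into a tuple on exit; `cache_position` is then derived from the length already stored in the self-attention half. A hedged sketch of that round trip, with dummy tensors standing in for the per-layer projections:

import torch
from transformers.cache_utils import EncoderDecoderCache

k = v = torch.randn(1, 8, 5, 64)   # made-up (batch, heads, past_len, head_dim)
legacy = ((k, v, k, v),)           # one layer: self-attn key/value then cross-attn key/value

cache = EncoderDecoderCache.from_legacy_cache(legacy)
past_length = cache.get_seq_length()
print(past_length)                                   # 5, taken from the self-attention half
print(cache.cross_attention_cache.key_cache[0].shape)

# cache_position for the next single decoding step starts where the cache ends
cache_position = torch.arange(past_length, past_length + 1)
print(cache_position)                                # tensor([5])

legacy_again = cache.to_legacy_cache()               # back to the tuple format for old callers
print(len(legacy_again[0]))                          # 4 tensors per layer
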
- extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape) + if self.config.is_decoder: + causal_mask = self._update_causal_mask( + attention_mask, + inputs_embeds, + cache_position, + past_key_values.self_attention_cache if past_key_values is not None else None, + output_attentions, + ) + else: + causal_mask = attention_mask[:, None, None, :] + causal_mask = causal_mask.to(dtype=inputs_embeds.dtype) + causal_mask = (1.0 - causal_mask) * torch.finfo(inputs_embeds.dtype).min if self.is_decoder and encoder_attention_mask is not None: encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) @@ -1427,7 +1441,6 @@ def forward( # Prepare head mask if needed head_mask = self.get_head_mask(head_mask, self.num_layers) - present_key_value_states = () if use_cache else None all_hidden_states = () if output_hidden_states else None all_attentions = () if output_attentions else None all_cross_attentions = () if (output_attentions and self.is_decoder) else None @@ -1436,34 +1449,35 @@ def forward( position_bias = None else: position_bias = self.relative_bias(attention_mask=attention_mask, bbox=bbox) - position_bias = position_bias + extended_attention_mask + position_bias = position_bias + causal_mask encoder_decoder_position_bias = None hidden_states = inputs_embeds hidden_states = self.dropout(hidden_states) - for i, (layer_module, past_key_value) in enumerate(zip(self.block, past_key_values)): + for i, layer_module in enumerate(self.block): if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) layer_outputs = layer_module( hidden_states, - attention_mask=extended_attention_mask, + attention_mask=causal_mask, position_bias=position_bias, encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_extended_attention_mask, encoder_decoder_position_bias=encoder_decoder_position_bias, layer_head_mask=head_mask[i], - past_key_value=past_key_value, + past_key_value=past_key_values, use_cache=use_cache, output_attentions=output_attentions, + cache_position=cache_position, ) # layer_outputs is a tuple with: # hidden-states, key-value-states, (self-attention weights), (self-attention position bias), (cross-attention weights), (cross-attention position bias) if use_cache is False: # MP fixes layer_outputs = layer_outputs[:1] + (None,) + layer_outputs[1:] - hidden_states, present_key_value_state = layer_outputs[:2] + hidden_states, next_decoder_cache = layer_outputs[:2] # We share the position biases between the layers - the first layer store them # layer_outputs = hidden-states, key-value-states (self-attention weights), @@ -1472,9 +1486,6 @@ def forward( position_bias = layer_outputs[2] if self.is_decoder and encoder_hidden_states is not None: encoder_decoder_position_bias = layer_outputs[4 if output_attentions else 3] - # append next layer key value states - if use_cache: - present_key_value_states = present_key_value_states + (present_key_value_state,) if output_attentions: all_attentions = all_attentions + (layer_outputs[2],) # We keep only self-attention weights for now @@ -1488,13 +1499,19 @@ def forward( if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) + next_cache = next_decoder_cache if use_cache else None + if return_self_attention_cache: + next_cache = past_key_values.self_attention_cache + if return_legacy_cache: + next_cache = past_key_values.to_legacy_cache() + if not return_dict: return tuple( v for v in [ hidden_states, attention_mask, - present_key_value_states, 
+ next_cache, all_hidden_states, all_attentions, all_cross_attentions, @@ -1505,12 +1522,135 @@ def forward( return BaseModelOutputWithAttentionMask( last_hidden_state=hidden_states, attention_mask=attention_mask, - past_key_values=present_key_value_states, + past_key_values=next_cache, hidden_states=all_hidden_states, attentions=all_attentions, cross_attentions=all_cross_attentions, ) + # Copied from transformers.models.llama.modeling_llama.LlamaModel._update_causal_mask + def _update_causal_mask( + self, + attention_mask: torch.Tensor, + input_tensor: torch.Tensor, + cache_position: torch.Tensor, + past_key_values: Cache, + output_attentions: bool, + ): + if self.config._attn_implementation == "flash_attention_2": + if attention_mask is not None and 0.0 in attention_mask: + return attention_mask + return None + + # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in + # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail + # to infer the attention mask. + past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + using_static_cache = isinstance(past_key_values, StaticCache) + + # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward + if self.config._attn_implementation == "sdpa" and not using_static_cache and not output_attentions: + if AttentionMaskConverter._ignore_causal_mask_sdpa( + attention_mask, + inputs_embeds=input_tensor, + past_key_values_length=past_seen_tokens, + is_training=self.training, + ): + return None + + dtype, device = input_tensor.dtype, input_tensor.device + sequence_length = input_tensor.shape[1] + if using_static_cache: + target_length = past_key_values.get_max_cache_shape() + else: + target_length = ( + attention_mask.shape[-1] + if isinstance(attention_mask, torch.Tensor) + else past_seen_tokens + sequence_length + 1 + ) + + # In case the provided `attention` mask is 2D, we generate a causal mask here (4D). + causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position( + attention_mask, + sequence_length=sequence_length, + target_length=target_length, + dtype=dtype, + device=device, + cache_position=cache_position, + batch_size=input_tensor.shape[0], + ) + + if ( + self.config._attn_implementation == "sdpa" + and attention_mask is not None + and attention_mask.device.type == "cuda" + and not output_attentions + ): + # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when + # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path. + # Details: https://github.com/pytorch/pytorch/issues/110213 + min_dtype = torch.finfo(dtype).min + causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype) + + return causal_mask + + @staticmethod + # Copied from transformers.models.llama.modeling_llama.LlamaPreTrainedModel._prepare_4d_causal_attention_mask_with_cache_position + def _prepare_4d_causal_attention_mask_with_cache_position( + attention_mask: torch.Tensor, + sequence_length: int, + target_length: int, + dtype: torch.dtype, + device: torch.device, + cache_position: torch.Tensor, + batch_size: int, + **kwargs, + ): + """ + Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape + `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing. 
+ + Args: + attention_mask (`torch.Tensor`): + A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape + `(batch_size, 1, query_length, key_value_length)`. + sequence_length (`int`): + The sequence length being processed. + target_length (`int`): + The target length: when generating with static cache, the mask should be as long as the static cache, + to account for the 0 padding, the part of the cache that is not filled yet. + dtype (`torch.dtype`): + The dtype to use for the 4D attention mask. + device (`torch.device`): + The device to plcae the 4D attention mask on. + cache_position (`torch.Tensor`): + Indices depicting the position of the input sequence tokens in the sequence. + batch_size (`torch.Tensor`): + Batch size. + """ + if attention_mask is not None and attention_mask.dim() == 4: + # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing. + causal_mask = attention_mask + else: + min_dtype = torch.finfo(dtype).min + causal_mask = torch.full( + (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device + ) + if sequence_length != 1: + causal_mask = torch.triu(causal_mask, diagonal=1) + causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1) + causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1) + if attention_mask is not None: + causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit + mask_length = attention_mask.shape[-1] + padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :] + padding_mask = padding_mask == 0 + causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill( + padding_mask, min_dtype + ) + + return causal_mask + @add_start_docstrings( "The bare UDOP encoder-decoder Transformer outputting raw hidden-states without any specific head on top.", @@ -1584,6 +1724,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, ) -> Tuple[Tensor, ...]: r""" Returns: @@ -1653,6 +1794,7 @@ def forward( output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, + cache_position=cache_position, ) if not return_dict: @@ -1759,6 +1901,7 @@ def forward( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, labels: Optional[Tensor] = None, + cache_position: Optional[torch.LongTensor] = None, ) -> Tuple[Tensor, ...]: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): @@ -1837,6 +1980,7 @@ def forward( output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, + cache_position=cache_position, ) sequence_output = decoder_outputs[0] diff --git a/src/transformers/models/umt5/configuration_umt5.py b/src/transformers/models/umt5/configuration_umt5.py index d7323d759fd086..ba8ea0460ba071 100644 --- a/src/transformers/models/umt5/configuration_umt5.py +++ b/src/transformers/models/umt5/configuration_umt5.py @@ -72,7 +72,12 @@ class UMT5Config(PretrainedConfig): model_type = "umt5" keys_to_ignore_at_inference = ["past_key_values"] - attribute_map = {"hidden_size": "d_model", "num_attention_heads": "num_heads", "num_hidden_layers": "num_layers"} + attribute_map = { + "hidden_size": "d_model", + "num_attention_heads": "num_heads", + "num_hidden_layers": "num_layers", + "head_dim": 
"d_kv", + } def __init__( self, diff --git a/src/transformers/models/umt5/modeling_umt5.py b/src/transformers/models/umt5/modeling_umt5.py index bd621fc2fb3ac2..985dc5e4426dff 100644 --- a/src/transformers/models/umt5/modeling_umt5.py +++ b/src/transformers/models/umt5/modeling_umt5.py @@ -23,7 +23,9 @@ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from ...activations import ACT2FN +from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache, StaticCache from ...generation import GenerationMixin +from ...modeling_attn_mask_utils import AttentionMaskConverter from ...modeling_outputs import ( BaseModelOutput, BaseModelOutputWithPastAndCrossAttentions, @@ -40,6 +42,7 @@ add_start_docstrings, add_start_docstrings_to_model_forward, is_torch_fx_proxy, + is_torchdynamo_compiling, logging, replace_return_docstrings, ) @@ -155,7 +158,7 @@ class UMT5Attention(nn.Module): T5's attention using relative_attention_bias. """ - def __init__(self, config, has_relative_attention_bias=False): + def __init__(self, config, has_relative_attention_bias=False, layer_idx: Optional[int] = None): super().__init__() self.is_decoder = config.is_decoder self.has_relative_attention_bias = has_relative_attention_bias @@ -166,6 +169,13 @@ def __init__(self, config, has_relative_attention_bias=False): self.n_heads = config.num_heads self.dropout = config.dropout_rate self.inner_dim = self.n_heads * self.key_value_proj_dim + self.layer_idx = layer_idx + if layer_idx is None and self.is_decoder: + logger.warning_once( + f"Instantiating a decoder {self.__class__.__name__} without passing `layer_idx` is not recommended and " + "will to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` " + "when creating this class." 
+ ) # Mesh TensorFlow initialization to avoid scaling before softmax self.q = nn.Linear(self.d_model, self.inner_dim, bias=False) @@ -230,11 +240,14 @@ def _relative_position_bucket(self, relative_position): relative_buckets += torch.where(is_small, relative_position, relative_position_if_large) return relative_buckets - def compute_bias(self, query_length, key_length, device=None): + def compute_bias(self, query_length, key_length, device=None, cache_position=None): """Compute binned relative position bias""" if device is None: device = self.relative_attention_bias.weight.device - context_position = torch.arange(query_length, dtype=torch.long, device=device)[:, None] + if cache_position is None: + context_position = torch.arange(query_length, dtype=torch.long, device=device)[:, None] + else: + context_position = cache_position[:, None] memory_position = torch.arange(key_length, dtype=torch.long, device=device)[None, :] relative_position = memory_position - context_position # shape (query_length, key_length) relative_position_bucket = self._relative_position_bucket(relative_position) @@ -249,78 +262,95 @@ def forward( past_key_value: Optional[Tuple[torch.Tensor]] = None, attention_mask: Optional[torch.Tensor] = None, layer_head_mask: Optional[torch.Tensor] = None, + cache_position: Optional[torch.Tensor] = None, ): - is_cross_attention = encoder_hidden_states is not None batch_size, seq_length = hidden_states.shape[:2] - # use encoder_hidden_states if cross attention - current_states = encoder_hidden_states if encoder_hidden_states is not None else hidden_states - # checking that the `sequence_length` of the `past_key_value` is the same as the he provided - # `encoder_hidden_states` to support prefix tuning - if is_cross_attention and past_key_value and past_key_value[0].shape[2] == current_states.shape[1]: + # if encoder_hidden_states are provided this layer is used as a cross-attention layer for the decoder + is_cross_attention = encoder_hidden_states is not None + + query_states = self.q(hidden_states) + query_states = query_states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) + + if past_key_value is not None: + is_updated = past_key_value.is_updated.get(self.layer_idx) + if is_cross_attention: + # after the first generated id, we can subsequently re-use all key/value_states from cache + curr_past_key_value = past_key_value.cross_attention_cache + else: + curr_past_key_value = past_key_value.self_attention_cache + + current_states = encoder_hidden_states if is_cross_attention else hidden_states + if is_cross_attention and past_key_value is not None and is_updated: # reuse k,v, cross_attentions - key_states = past_key_value[0] - value_states = past_key_value[1] + key_states = curr_past_key_value.key_cache[self.layer_idx] + value_states = curr_past_key_value.value_cache[self.layer_idx] else: - key_states = self._shape(self.k(current_states)) - value_states = self._shape(self.v(current_states)) - if past_key_value is not None and not is_cross_attention: - # reuse k, v, self_attention - key_states = torch.cat([past_key_value[0], key_states], dim=2) - value_states = torch.cat([past_key_value[1], value_states], dim=2) - - query_states = self._shape(self.q(hidden_states)) - attention_scores = torch.matmul(query_states, key_states.transpose(-1, -2)) + key_states = self.k(current_states) + value_states = self.v(current_states) + key_states = key_states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) + value_states = 
value_states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) - # compute positional bias - if self.has_relative_attention_bias: - query_length = seq_length if past_key_value is not None: - query_length += past_key_value[0].shape[2] - position_bias = self.compute_bias(query_length, key_states.size(2), device=attention_scores.device) - else: + # save all key/value_states to cache to be re-used for fast auto-regressive generation + cache_position = cache_position if not is_cross_attention else None + key_states, value_states = curr_past_key_value.update( + key_states, value_states, self.layer_idx, {"cache_position": cache_position} + ) + # set flag that curr layer for cross-attn is already updated so we can re-use in subsequent calls + if is_cross_attention: + past_key_value.is_updated[self.layer_idx] = True + + # compute scores, equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states), compatible with onnx op>9 + scores = torch.matmul(query_states, key_states.transpose(3, 2)) + + # cache position is 0-indexed so we add 1 to get the real length of queries (aka with past) + real_seq_length = seq_length + past_key_value.get_seq_length() if past_key_value is not None else seq_length + key_length = key_states.shape[-2] + if not self.has_relative_attention_bias: position_bias = torch.zeros( - (1, self.n_heads, seq_length, key_states.size(2)), - device=attention_scores.device, - dtype=attention_scores.dtype, - requires_grad=self.training, + (1, self.n_heads, seq_length, key_length), device=scores.device, dtype=scores.dtype ) - if past_key_value is not None: - position_bias = position_bias[:, :, -hidden_states.size(1) :, :] + else: + position_bias = self.compute_bias( + real_seq_length, key_length, device=scores.device, cache_position=cache_position + ) + position_bias = position_bias[:, :, -seq_length:, :] + if attention_mask is not None: - position_bias = position_bias + attention_mask # (batch_size, n_heads, seq_length, key_length) + causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] + position_bias = position_bias + causal_mask + + if self.pruned_heads: + mask = torch.ones(position_bias.shape[1]) + mask[list(self.pruned_heads)] = 0 + position_bias_masked = position_bias[:, mask.bool()] + else: + position_bias_masked = position_bias + + scores += position_bias_masked - if self.is_decoder: - # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. - # Further calls to cross_attention layer can then reuse all cross-attention - # key/value_states (first "if" case) - # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of - # all previous decoder key/value_states. Further calls to uni-directional self-attention - # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) - # if encoder bi-directional self-attention `past_key_value` is always `None` - past_key_value = (key_states, value_states) - - attention_scores += position_bias # (batch_size, n_heads, seq_length, key_length) - attn_weights = nn.functional.softmax(attention_scores.float(), dim=-1).type_as(attention_scores) + attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as(scores) attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) # Mask heads if we want to if layer_head_mask is not None: attn_weights = attn_weights * layer_head_mask - # attn_output = torch.bmm(attn_probs, value_states) ? 
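
The relative-position bias handling above is what makes `cache_position` necessary for T5-style attention: with a key/value cache, only the rows of the (query x key) bias table that correspond to the current query positions are needed, and `cache_position` supplies those absolute positions. A small sketch of the index arithmetic only; the bucketing and learned embedding lookup are unchanged and omitted here.

import torch

# Assume 6 tokens are already cached and one new token is being decoded.
key_length = 7                        # cached keys plus the new token
cache_position = torch.tensor([6])    # absolute position of the single current query

context_position = cache_position[:, None]            # (num_queries, 1)
memory_position = torch.arange(key_length)[None, :]   # (1, key_length)
relative_position = memory_position - context_position
print(relative_position)  # tensor([[-6, -5, -4, -3, -2, -1,  0]])
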
- context_states = torch.matmul(attn_weights, value_states) - # attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) ? - context_states = context_states.permute(0, 2, 1, 3).contiguous().view(batch_size, seq_length, -1) - attn_output = self.o(context_states) + attn_output = torch.matmul(attn_weights, value_states) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.view(batch_size, seq_length, -1) + + attn_output = self.o(attn_output) return attn_output, attn_weights, past_key_value class UMT5LayerSelfAttention(nn.Module): - def __init__(self, config): + def __init__(self, config, layer_idx: Optional[int] = None): super().__init__() - self.SelfAttention = UMT5Attention(config, has_relative_attention_bias=True) + self.SelfAttention = UMT5Attention(config, has_relative_attention_bias=True, layer_idx=layer_idx) self.layer_norm = UMT5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) self.dropout = nn.Dropout(config.dropout_rate) @@ -330,6 +360,7 @@ def forward( attention_mask=None, layer_head_mask=None, past_key_value=None, + cache_position=None, ): normed_hidden_states = self.layer_norm(hidden_states) attention_output = self.SelfAttention( @@ -337,6 +368,7 @@ def forward( attention_mask=attention_mask, layer_head_mask=layer_head_mask, past_key_value=past_key_value, + cache_position=cache_position, ) hidden_states = hidden_states + self.dropout(attention_output[0]) outputs = (hidden_states,) + attention_output[1:] # add attentions if we output them @@ -344,9 +376,9 @@ def forward( class UMT5LayerCrossAttention(nn.Module): - def __init__(self, config): + def __init__(self, config, layer_idx: Optional[int] = None): super().__init__() - self.EncDecAttention = UMT5Attention(config, has_relative_attention_bias=False) + self.EncDecAttention = UMT5Attention(config, has_relative_attention_bias=False, layer_idx=layer_idx) self.layer_norm = UMT5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) self.dropout = nn.Dropout(config.dropout_rate) @@ -357,6 +389,7 @@ def forward( attention_mask=None, layer_head_mask=None, past_key_value=None, + cache_position=None, ): normed_hidden_states = self.layer_norm(hidden_states) attention_output = self.EncDecAttention( @@ -365,6 +398,7 @@ def forward( attention_mask=attention_mask, layer_head_mask=layer_head_mask, past_key_value=past_key_value, + cache_position=cache_position, ) layer_output = hidden_states + self.dropout(attention_output[0]) outputs = (layer_output,) + attention_output[1:] # add attentions if we output them @@ -372,13 +406,13 @@ def forward( class UMT5Block(nn.Module): - def __init__(self, config): + def __init__(self, config, layer_idx: Optional[int] = None): super().__init__() self.is_decoder = config.is_decoder self.layer = nn.ModuleList() - self.layer.append(UMT5LayerSelfAttention(config)) + self.layer.append(UMT5LayerSelfAttention(config, layer_idx=layer_idx)) if self.is_decoder: - self.layer.append(UMT5LayerCrossAttention(config)) + self.layer.append(UMT5LayerCrossAttention(config, layer_idx=layer_idx)) self.layer.append(UMT5LayerFF(config)) @@ -393,16 +427,14 @@ def forward( past_key_value=None, use_cache=False, output_attentions=False, + cache_position=None, ): - # Self Attention - # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 - self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None - - hidden_states, self_attn_weights, present_key_value = self.layer[0]( + hidden_states, self_attn_weights, past_key_value 
= self.layer[0]( hidden_states, attention_mask=attention_mask, layer_head_mask=layer_head_mask, - past_key_value=self_attn_past_key_value, + past_key_value=past_key_value, + cache_position=cache_position, ) # clamp inf values to enable fp16 training @@ -412,18 +444,16 @@ def forward( hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) # Cross-Attention Block - cross_attn_present_key_value = None cross_attn_weights = None do_cross_attention = self.is_decoder and encoder_hidden_states is not None if do_cross_attention: - # cross_attn cached key/values tuple is at positions 3,4 of present_key_value tuple - cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None - hidden_states, cross_attn_weights, cross_attn_present_key_value = self.layer[1]( + hidden_states, cross_attn_weights, past_key_value = self.layer[1]( hidden_states, encoder_hidden_states=encoder_hidden_states, attention_mask=encoder_attention_mask, layer_head_mask=cross_attn_layer_head_mask, - past_key_value=cross_attn_past_key_value, + past_key_value=past_key_value, + cache_position=cache_position, ) # clamp inf values to enable fp16 training if hidden_states.dtype == torch.float16: @@ -431,8 +461,6 @@ def forward( clamp_value = torch.where(torch.isinf(hidden_states).any(), max_dtype - 1000, max_dtype) hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) - present_key_value += cross_attn_present_key_value - # Apply Feed Forward layer hidden_states = self.layer[-1](hidden_states) @@ -444,7 +472,7 @@ def forward( outputs = ( hidden_states, - present_key_value, + past_key_value, ) if output_attentions: @@ -481,6 +509,8 @@ class UMT5PreTrainedModel(PreTrainedModel): config_class = UMT5Config base_model_prefix = "transformer" supports_gradient_checkpointing = True + _supports_cache_class = True + _supports_static_cache = True _no_split_modules = ["UMT5Block"] _keep_in_fp32_modules = ["wo"] @@ -594,7 +624,7 @@ def __init__(self, config, embed_tokens=None): super().__init__(config) self.embed_tokens = embed_tokens self.is_decoder = config.is_decoder - self.block = nn.ModuleList([UMT5Block(config) for i in range(config.num_layers)]) + self.block = nn.ModuleList([UMT5Block(config, layer_idx=i) for i in range(config.num_layers)]) self.final_layer_norm = UMT5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) self.dropout = nn.Dropout(config.dropout_rate) @@ -622,6 +652,7 @@ def forward( output_attentions=None, output_hidden_states=None, return_dict=None, + cache_position=None, ): use_cache = use_cache if use_cache is not None else self.config.use_cache output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions @@ -644,6 +675,13 @@ def forward( err_msg_prefix = "decoder_" if self.is_decoder else "" raise ValueError(f"You have to specify either {err_msg_prefix}input_ids or {err_msg_prefix}inputs_embeds") + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
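
With `_supports_cache_class` and `_supports_static_cache` switched on and the stack returning the new cache object, a plain forward pass with `use_cache=True` should now hand back an EncoderDecoderCache rather than a tuple of tensors. A hedged end-to-end sketch; the checkpoint name is only an example, and the expected output type assumes the code path added in this patch:

import torch
from transformers import AutoTokenizer, UMT5ForConditionalGeneration
from transformers.cache_utils import EncoderDecoderCache

tokenizer = AutoTokenizer.from_pretrained("google/umt5-small")
model = UMT5ForConditionalGeneration.from_pretrained("google/umt5-small")

inputs = tokenizer("Studies have shown that owning a dog is good for you.", return_tensors="pt")
decoder_input_ids = torch.tensor([[model.config.decoder_start_token_id]])

with torch.no_grad():
    out = model(**inputs, decoder_input_ids=decoder_input_ids, use_cache=True)

print(type(out.past_key_values))                       # expected: EncoderDecoderCache
print(isinstance(out.past_key_values, EncoderDecoderCache))
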
+ ) + use_cache = False + if inputs_embeds is None: if self.embed_tokens is None: raise ValueError("You have to initialize the model with valid token embeddings") @@ -651,28 +689,57 @@ def forward( batch_size, seq_length = input_shape - # required mask seq length can be calculated via length of past - mask_seq_length = past_key_values[0][0].shape[2] + seq_length if past_key_values is not None else seq_length - if use_cache is True: if not self.is_decoder: raise ValueError(f"`use_cache` can only be set to `True` if {self} is used as a decoder") - if attention_mask is None: - attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device) - if self.is_decoder and encoder_attention_mask is None and encoder_hidden_states is not None: - encoder_seq_length = encoder_hidden_states.shape[1] - encoder_attention_mask = torch.ones( - batch_size, encoder_seq_length, device=inputs_embeds.device, dtype=torch.long + # initialize past_key_values + return_legacy_cache = False + return_self_attention_cache = False + if self.is_decoder and (use_cache or past_key_values is not None): + if isinstance(past_key_values, Cache) and not isinstance(past_key_values, EncoderDecoderCache): + return_self_attention_cache = True + past_key_values = EncoderDecoderCache(past_key_values, DynamicCache()) + elif not isinstance(past_key_values, EncoderDecoderCache): + return_legacy_cache = True + logger.warning_once( + "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. " + "You should pass an instance of `EncoderDecoderCache` instead, e.g. " + "`past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`." + ) + past_key_values = EncoderDecoderCache.from_legacy_cache(past_key_values) + elif past_key_values is None: + past_key_values = EncoderDecoderCache(DynamicCache(), DynamicCache()) + elif not self.is_decoder: + # do not pass cache object down the line for encoder stack + # it messes indexing later in decoder-stack because cache object is modified in-place + past_key_values = None + + past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0 + if cache_position is None: + cache_position = torch.arange( + past_key_values_length, past_key_values_length + seq_length, device=inputs_embeds.device ) - # initialize past_key_values with `None` if past does not exist - if past_key_values is None: - past_key_values = [None] * len(self.block) + if attention_mask is None and not is_torchdynamo_compiling(): + # required mask seq length can be calculated via length of past cache + mask_seq_length = past_key_values_length + seq_length + attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device) - # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] - # ourselves in which case we just need to make it broadcastable to all heads. 
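
The removed `get_extended_attention_mask` call is replaced in the lines that follow by an explicit split: the decoder builds a 4D causal mask via `_update_causal_mask`, while the non-causal case only broadcasts and inverts the 2D padding mask into an additive mask. A sketch of that non-causal branch with an invented padding mask:

import torch

attention_mask = torch.tensor([[1, 1, 1, 0]])  # invented batch of 1, last position is padding
dtype = torch.float32

causal_mask = attention_mask[:, None, None, :].to(dtype)     # (batch, 1, 1, key_length)
causal_mask = (1.0 - causal_mask) * torch.finfo(dtype).min   # 0 where visible, large negative where padded
print(causal_mask.shape)  # torch.Size([1, 1, 1, 4])
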
- extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape) + if self.is_decoder: + causal_mask = self._update_causal_mask( + attention_mask, + inputs_embeds, + cache_position, + past_key_values.self_attention_cache if past_key_values is not None else None, + output_attentions, + ) + elif attention_mask is not None: + causal_mask = attention_mask[:, None, None, :] + causal_mask = causal_mask.to(dtype=inputs_embeds.dtype) + causal_mask = (1.0 - causal_mask) * torch.finfo(inputs_embeds.dtype).min + else: + causal_mask = None # If a 2D or 3D attention mask is provided for the cross-attention # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] @@ -685,24 +752,16 @@ def forward( else: encoder_extended_attention_mask = None - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning_once( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." - ) - use_cache = False - # Prepare head mask if needed head_mask = self.get_head_mask(head_mask, self.config.num_layers) cross_attn_head_mask = self.get_head_mask(cross_attn_head_mask, self.config.num_layers) - present_key_value_states = () if use_cache else None all_hidden_states = () if output_hidden_states else None all_attentions = () if output_attentions else None all_cross_attentions = () if output_attentions and self.is_decoder else None hidden_states = self.dropout(inputs_embeds) - for i, (layer_module, past_key_value) in enumerate(zip(self.block, past_key_values)): + for i, layer_module in enumerate(self.block): layer_head_mask = head_mask[i] cross_attn_layer_head_mask = cross_attn_head_mask[i] @@ -713,7 +772,7 @@ def forward( layer_outputs = self._gradient_checkpointing_func( layer_module.forward, hidden_states, - extended_attention_mask, + causal_mask, encoder_hidden_states, encoder_extended_attention_mask, layer_head_mask, @@ -721,24 +780,26 @@ def forward( None, # past_key_value is always None with gradient checkpointing use_cache, output_attentions, + cache_position, ) else: layer_outputs = layer_module( hidden_states, - attention_mask=extended_attention_mask, + attention_mask=causal_mask, encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_extended_attention_mask, layer_head_mask=layer_head_mask, cross_attn_layer_head_mask=cross_attn_layer_head_mask, - past_key_value=past_key_value, + past_key_value=past_key_values, use_cache=use_cache, output_attentions=output_attentions, + cache_position=cache_position, ) hidden_states = layer_outputs[0] if use_cache: - present_key_value_states += (layer_outputs[1],) + next_decoder_cache = layer_outputs[1] if output_attentions: all_attentions += (layer_outputs[2],) @@ -752,12 +813,18 @@ def forward( if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) + next_cache = next_decoder_cache if use_cache else None + if return_self_attention_cache: + next_cache = past_key_values.self_attention_cache + if return_legacy_cache: + next_cache = past_key_values.to_legacy_cache() + if not return_dict: return tuple( v for v in [ hidden_states, - present_key_value_states, + next_cache, all_hidden_states, all_attentions, all_cross_attentions, @@ -766,12 +833,135 @@ def forward( ) return BaseModelOutputWithPastAndCrossAttentions( last_hidden_state=hidden_states, - past_key_values=present_key_value_states, + past_key_values=next_cache, hidden_states=all_hidden_states, attentions=all_attentions, cross_attentions=all_cross_attentions, ) + # Copied from 
transformers.models.llama.modeling_llama.LlamaModel._update_causal_mask + def _update_causal_mask( + self, + attention_mask: torch.Tensor, + input_tensor: torch.Tensor, + cache_position: torch.Tensor, + past_key_values: Cache, + output_attentions: bool, + ): + if self.config._attn_implementation == "flash_attention_2": + if attention_mask is not None and 0.0 in attention_mask: + return attention_mask + return None + + # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in + # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail + # to infer the attention mask. + past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + using_static_cache = isinstance(past_key_values, StaticCache) + + # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward + if self.config._attn_implementation == "sdpa" and not using_static_cache and not output_attentions: + if AttentionMaskConverter._ignore_causal_mask_sdpa( + attention_mask, + inputs_embeds=input_tensor, + past_key_values_length=past_seen_tokens, + is_training=self.training, + ): + return None + + dtype, device = input_tensor.dtype, input_tensor.device + sequence_length = input_tensor.shape[1] + if using_static_cache: + target_length = past_key_values.get_max_cache_shape() + else: + target_length = ( + attention_mask.shape[-1] + if isinstance(attention_mask, torch.Tensor) + else past_seen_tokens + sequence_length + 1 + ) + + # In case the provided `attention` mask is 2D, we generate a causal mask here (4D). + causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position( + attention_mask, + sequence_length=sequence_length, + target_length=target_length, + dtype=dtype, + device=device, + cache_position=cache_position, + batch_size=input_tensor.shape[0], + ) + + if ( + self.config._attn_implementation == "sdpa" + and attention_mask is not None + and attention_mask.device.type == "cuda" + and not output_attentions + ): + # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when + # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path. + # Details: https://github.com/pytorch/pytorch/issues/110213 + min_dtype = torch.finfo(dtype).min + causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype) + + return causal_mask + + @staticmethod + # Copied from transformers.models.llama.modeling_llama.LlamaPreTrainedModel._prepare_4d_causal_attention_mask_with_cache_position + def _prepare_4d_causal_attention_mask_with_cache_position( + attention_mask: torch.Tensor, + sequence_length: int, + target_length: int, + dtype: torch.dtype, + device: torch.device, + cache_position: torch.Tensor, + batch_size: int, + **kwargs, + ): + """ + Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape + `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing. + + Args: + attention_mask (`torch.Tensor`): + A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape + `(batch_size, 1, query_length, key_value_length)`. + sequence_length (`int`): + The sequence length being processed. 
+ target_length (`int`): + The target length: when generating with static cache, the mask should be as long as the static cache, + to account for the 0 padding, the part of the cache that is not filled yet. + dtype (`torch.dtype`): + The dtype to use for the 4D attention mask. + device (`torch.device`): + The device to plcae the 4D attention mask on. + cache_position (`torch.Tensor`): + Indices depicting the position of the input sequence tokens in the sequence. + batch_size (`torch.Tensor`): + Batch size. + """ + if attention_mask is not None and attention_mask.dim() == 4: + # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing. + causal_mask = attention_mask + else: + min_dtype = torch.finfo(dtype).min + causal_mask = torch.full( + (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device + ) + if sequence_length != 1: + causal_mask = torch.triu(causal_mask, diagonal=1) + causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1) + causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1) + if attention_mask is not None: + causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit + mask_length = attention_mask.shape[-1] + padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :] + padding_mask = padding_mask == 0 + causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill( + padding_mask, min_dtype + ) + + return causal_mask + UMT5_START_DOCSTRING = r""" @@ -885,6 +1075,9 @@ def forward( more detail. return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): + Indices depicting the position of the input sequence tokens in the sequence. It is used to update the + cache in the correct position and to infer the complete sequence length. 
""" UMT5_ENCODER_INPUTS_DOCSTRING = r""" @@ -1022,6 +1215,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, ) -> Union[Tuple[torch.FloatTensor], Seq2SeqModelOutput]: r""" Returns: @@ -1084,6 +1278,7 @@ def forward( output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, + cache_position=cache_position, ) if not return_dict: @@ -1197,6 +1392,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, ) -> Union[Tuple[torch.FloatTensor], Seq2SeqLMOutput]: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): @@ -1268,6 +1464,7 @@ def forward( output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, + cache_position=cache_position, ) sequence_output = decoder_outputs[0] diff --git a/tests/models/longt5/test_modeling_longt5.py b/tests/models/longt5/test_modeling_longt5.py index c0cf21b2369d0a..a9d3e7479e9578 100644 --- a/tests/models/longt5/test_modeling_longt5.py +++ b/tests/models/longt5/test_modeling_longt5.py @@ -31,6 +31,7 @@ if is_torch_available(): import torch + import torch.nn.functional as F from transformers import ( MODEL_FOR_QUESTION_ANSWERING_MAPPING, @@ -574,6 +575,41 @@ def test_decoder_model_past_with_3d_attn_mask(self): lm_labels, ) + # overwrite because T5 doesn't accept position ids as input and expects `decoder_input_ids` + def test_custom_4d_attention_mask(self): + for model_class in self.all_generative_model_classes: + config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() + model = model_class(config).to(device=torch_device, dtype=torch.float32) + + ( + input_ids, + _, + input_ids_shared_prefix, + mask_shared_prefix, + _, + ) = self._get_custom_4d_mask_test_data() + + logits = model.forward( + decoder_input_ids=input_ids, + input_ids=input_dict["input_ids"][:3], + ).logits + # logits.shape == torch.Size([3, 4, ...]) + + logits_shared_prefix = model( + input_ids=input_dict["input_ids"][:1], + decoder_input_ids=input_ids_shared_prefix, + decoder_attention_mask=mask_shared_prefix, + )[0] + # logits_shared_prefix.shape == torch.Size([1, 6, ...]) + + out_last_tokens = logits[:, -1, :] # last tokens in each batch line + out_shared_prefix_last_tokens = logits_shared_prefix[0, -3:, :] # last three tokens + + # comparing softmax-normalized logits: + normalized_0 = F.softmax(out_last_tokens) + normalized_1 = F.softmax(out_shared_prefix_last_tokens) + torch.testing.assert_close(normalized_0, normalized_1, rtol=1e-3, atol=1e-4) + def test_decoder_model_past_with_large_inputs(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) @@ -602,7 +638,7 @@ def test_export_to_onnx(self): (config_and_inputs[1], config_and_inputs[3], config_and_inputs[2]), f"{tmpdirname}/longt5_test.onnx", export_params=True, - opset_version=13, + opset_version=14, input_names=["input_ids", "decoder_input_ids"], ) diff --git a/tests/models/mt5/test_modeling_mt5.py b/tests/models/mt5/test_modeling_mt5.py index 6e912ec3607d40..20412da2e1db06 100644 --- a/tests/models/mt5/test_modeling_mt5.py +++ b/tests/models/mt5/test_modeling_mt5.py @@ -40,6 +40,7 @@ if is_torch_available(): import torch + import 
torch.nn.functional as F from transformers import ( AutoModelForSeq2SeqLM, @@ -575,6 +576,9 @@ class MT5ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, # The small MT5 model needs higher percentages for CPU/MP tests model_split_percents = [0.5, 0.8, 0.9] + # used in `test_torch_compile` + _torch_compile_test_ckpt = "google/mt5-small" + def setUp(self): self.model_tester = MT5ModelTester(self) self.config_tester = ConfigTester(self, config_class=MT5Config, d_model=37) @@ -627,12 +631,9 @@ def _create_and_check_torch_fx_tracing(self, config, inputs_dict, output_loss=Fa ] if labels is not None: input_names.append("labels") - filtered_inputs = {k: v for (k, v) in inputs.items() if k in input_names} input_names = list(filtered_inputs.keys()) - model_output = model(**filtered_inputs) - traced_model = symbolic_trace(model, input_names) traced_output = traced_model(**filtered_inputs) else: @@ -647,7 +648,6 @@ def _create_and_check_torch_fx_tracing(self, config, inputs_dict, output_loss=Fa "visual_feats", "visual_pos", ] - labels = inputs.get("labels", None) start_positions = inputs.get("start_positions", None) end_positions = inputs.get("end_positions", None) @@ -657,15 +657,12 @@ def _create_and_check_torch_fx_tracing(self, config, inputs_dict, output_loss=Fa input_names.append("start_positions") if end_positions is not None: input_names.append("end_positions") - filtered_inputs = {k: v for (k, v) in inputs.items() if k in input_names} input_names = list(filtered_inputs.keys()) - if model.__class__.__name__ in set(MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES.values()) and ( not hasattr(model.config, "problem_type") or model.config.problem_type is None ): model.config.problem_type = "single_label_classification" - traced_model = symbolic_trace(model, input_names) traced_output = traced_model(**filtered_inputs) model_output = model(**filtered_inputs) @@ -718,6 +715,41 @@ def flatten_output(output): # (Even with this call, there are still memory leak by ~0.04MB) self.clear_torch_jit_class_registry() + # overwrite because MT5 doesn't accept position ids as input and expects `decoder_input_ids` + def test_custom_4d_attention_mask(self): + for model_class in self.all_generative_model_classes: + config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() + model = model_class(config).to(device=torch_device, dtype=torch.float32) + + ( + input_ids, + _, + input_ids_shared_prefix, + mask_shared_prefix, + _, + ) = self._get_custom_4d_mask_test_data() + + logits = model.forward( + decoder_input_ids=input_ids, + input_ids=input_dict["input_ids"][:3], + ).logits + # logits.shape == torch.Size([3, 4, ...]) + + logits_shared_prefix = model( + input_ids=input_dict["input_ids"][:1], + decoder_input_ids=input_ids_shared_prefix, + decoder_attention_mask=mask_shared_prefix, + )[0] + # logits_shared_prefix.shape == torch.Size([1, 6, ...]) + + out_last_tokens = logits[:, -1, :] # last tokens in each batch line + out_shared_prefix_last_tokens = logits_shared_prefix[0, -3:, :] # last three tokens + + # comparing softmax-normalized logits: + normalized_0 = F.softmax(out_last_tokens) + normalized_1 = F.softmax(out_shared_prefix_last_tokens) + torch.testing.assert_close(normalized_0, normalized_1, rtol=1e-3, atol=1e-4) + def test_config(self): self.config_tester.run_common_tests() diff --git a/tests/models/pop2piano/test_modeling_pop2piano.py b/tests/models/pop2piano/test_modeling_pop2piano.py index 3a33b5a98128e2..39ff67f08ce5a9 100644 --- 
a/tests/models/pop2piano/test_modeling_pop2piano.py +++ b/tests/models/pop2piano/test_modeling_pop2piano.py @@ -620,7 +620,7 @@ def test_export_to_onnx(self): (config_and_inputs[1], config_and_inputs[3], config_and_inputs[2]), f"{tmpdirname}/Pop2Piano_test.onnx", export_params=True, - opset_version=9, + opset_version=14, input_names=["input_ids", "decoder_input_ids"], ) diff --git a/tests/models/switch_transformers/test_modeling_switch_transformers.py b/tests/models/switch_transformers/test_modeling_switch_transformers.py index 13215b2826fe0c..7adb1f40c6e696 100644 --- a/tests/models/switch_transformers/test_modeling_switch_transformers.py +++ b/tests/models/switch_transformers/test_modeling_switch_transformers.py @@ -36,6 +36,7 @@ if is_torch_available(): import torch + import torch.nn.functional as F from transformers import ( AutoTokenizer, @@ -645,6 +646,41 @@ def test_decoder_model_past_with_3d_attn_mask(self): lm_labels, ) + # overwrite because T5 doesn't accept position ids as input and expects `decoder_input_ids` + def test_custom_4d_attention_mask(self): + for model_class in self.all_generative_model_classes: + config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() + model = model_class(config).to(device=torch_device, dtype=torch.float32) + + ( + input_ids, + _, + input_ids_shared_prefix, + mask_shared_prefix, + _, + ) = self._get_custom_4d_mask_test_data() + + logits = model.forward( + decoder_input_ids=input_ids, + input_ids=input_dict["input_ids"][:3], + ).logits + # logits.shape == torch.Size([3, 4, ...]) + + logits_shared_prefix = model( + input_ids=input_dict["input_ids"][:1], + decoder_input_ids=input_ids_shared_prefix, + decoder_attention_mask=mask_shared_prefix, + )[0] + # logits_shared_prefix.shape == torch.Size([1, 6, ...]) + + out_last_tokens = logits[:, -1, :] # last tokens in each batch line + out_shared_prefix_last_tokens = logits_shared_prefix[0, -3:, :] # last three tokens + + # comparing softmax-normalized logits: + normalized_0 = F.softmax(out_last_tokens) + normalized_1 = F.softmax(out_shared_prefix_last_tokens) + torch.testing.assert_close(normalized_0, normalized_1, rtol=1e-3, atol=1e-4) + def test_decoder_model_past_with_large_inputs(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) diff --git a/tests/models/t5/test_modeling_t5.py b/tests/models/t5/test_modeling_t5.py index fe9b40a54abef7..68dd5a52b3d69b 100644 --- a/tests/models/t5/test_modeling_t5.py +++ b/tests/models/t5/test_modeling_t5.py @@ -27,6 +27,7 @@ require_sentencepiece, require_tokenizers, require_torch, + require_torch_gpu, slow, torch_device, ) @@ -44,6 +45,7 @@ if is_torch_available(): import torch + import torch.nn.functional as F from transformers import ( AutoTokenizer, @@ -578,6 +580,9 @@ class T5ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, # The small T5 model needs higher percentages for CPU/MP tests model_split_percents = [0.5, 0.8, 0.9] + # used in `test_torch_compile` + _torch_compile_test_ckpt = "google-t5/t5-small" + def setUp(self): self.model_tester = T5ModelTester(self) self.config_tester = ConfigTester(self, config_class=T5Config, d_model=37) @@ -630,12 +635,9 @@ def _create_and_check_torch_fx_tracing(self, config, inputs_dict, output_loss=Fa ] if labels is not None: input_names.append("labels") - filtered_inputs = {k: v for (k, v) in inputs.items() if k in input_names} input_names = list(filtered_inputs.keys()) - 
model_output = model(**filtered_inputs) - traced_model = symbolic_trace(model, input_names) traced_output = traced_model(**filtered_inputs) else: @@ -650,7 +652,6 @@ def _create_and_check_torch_fx_tracing(self, config, inputs_dict, output_loss=Fa "visual_feats", "visual_pos", ] - labels = inputs.get("labels", None) start_positions = inputs.get("start_positions", None) end_positions = inputs.get("end_positions", None) @@ -660,15 +661,12 @@ def _create_and_check_torch_fx_tracing(self, config, inputs_dict, output_loss=Fa input_names.append("start_positions") if end_positions is not None: input_names.append("end_positions") - filtered_inputs = {k: v for (k, v) in inputs.items() if k in input_names} input_names = list(filtered_inputs.keys()) - if model.__class__.__name__ in set(MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES.values()) and ( not hasattr(model.config, "problem_type") or model.config.problem_type is None ): model.config.problem_type = "single_label_classification" - traced_model = symbolic_trace(model, input_names) traced_output = traced_model(**filtered_inputs) model_output = model(**filtered_inputs) @@ -721,6 +719,41 @@ def flatten_output(output): # (Even with this call, there are still memory leak by ~0.04MB) self.clear_torch_jit_class_registry() + # overwrite because T5 doesn't accept position ids as input and expects `decoder_input_ids` + def test_custom_4d_attention_mask(self): + for model_class in self.all_generative_model_classes: + config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() + model = model_class(config).to(device=torch_device, dtype=torch.float32) + + ( + input_ids, + _, + input_ids_shared_prefix, + mask_shared_prefix, + _, + ) = self._get_custom_4d_mask_test_data() + + logits = model.forward( + decoder_input_ids=input_ids, + input_ids=input_dict["input_ids"][:3], + ).logits + # logits.shape == torch.Size([3, 4, ...]) + + logits_shared_prefix = model( + input_ids=input_dict["input_ids"][:1], + decoder_input_ids=input_ids_shared_prefix, + decoder_attention_mask=mask_shared_prefix, + )[0] + # logits_shared_prefix.shape == torch.Size([1, 6, ...]) + + out_last_tokens = logits[:, -1, :] # last tokens in each batch line + out_shared_prefix_last_tokens = logits_shared_prefix[0, -3:, :] # last three tokens + + # comparing softmax-normalized logits: + normalized_0 = F.softmax(out_last_tokens) + normalized_1 = F.softmax(out_shared_prefix_last_tokens) + torch.testing.assert_close(normalized_0, normalized_1, rtol=1e-3, atol=1e-4) + def test_config(self): self.config_tester.run_common_tests() @@ -1482,6 +1515,7 @@ def test_summarization(self): [model.config.prefix + x for x in [FRANCE_ARTICLE, SHORTER_ARTICLE, IRAN_ARTICLE, ARTICLE_SUBWAY]], padding="max_length", truncation=True, + max_length=512, return_tensors="pt", ).to(torch_device) self.assertEqual(512, dct["input_ids"].shape[1]) @@ -1604,14 +1638,76 @@ def test_contrastive_search_t5(self): outputs = t5_model.generate(input_ids, penalty_alpha=0.5, top_k=5, max_length=64) generated_text = t5_tokenizer.batch_decode(outputs, skip_special_tokens=True) + # TODO: @arthur? + # PR #31938 caused regression on this test which was fixed by PR #34089 self.assertListEqual( generated_text, [ - "Liana Barrientos has been married 10 times, nine of them in the Bronx. Her husbands filed for " - "permanent residence after the marriages, prosecutors say." + "Liana Barrientos has been married 10 times, nine of them in the Bronx . Her husbands filed for " + "permanent residence after the marriages, prosecutors say ." 
], ) + @slow + @require_torch_gpu + def test_compile_static_cache(self): + NUM_TOKENS_TO_GENERATE = 40 + EXPECTED_TEXT_COMPLETION = [ + "theory of relativity states that 1) the speed of light is constant in all inertial reference frames. the laws of physics are the same for all inertial reference frames.", + "ketchup is my favorite condiment.", + ] + + prompts = [ + "summarize: Simply put, the theory of relativity states that 1) the speed of light is constant in all inertial " + "reference frames, and 2) the laws of physics are the same for all inertial reference frames.\nThe " + "theory of relativity is not hard to grasp.", + "summarize: My favorite all time favorite condiment is ketchup. I love it on everything. I love it on my eggs, " + "my fries, my chicken, my burgers, my hot dogs, my sandwiches, my salads, my pizza.", + ] + model = T5ForConditionalGeneration.from_pretrained("google-t5/t5-small").to(torch_device) + tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small") + inputs = tokenizer(prompts, return_tensors="pt", padding=True).to(model.device) + + # Dynamic Cache + generated_ids = model.generate(**inputs, max_new_tokens=NUM_TOKENS_TO_GENERATE, do_sample=False) + dynamic_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) + self.assertEqual(EXPECTED_TEXT_COMPLETION, dynamic_text) + + # Static Cache + generated_ids = model.generate( + **inputs, max_new_tokens=NUM_TOKENS_TO_GENERATE, do_sample=False, cache_implementation="static" + ) + static_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) + self.assertEqual(EXPECTED_TEXT_COMPLETION, static_text) + + # Static Cache + compile + model.forward = torch.compile(model.forward, mode="reduce-overhead", fullgraph=True) + generated_ids = model.generate( + **inputs, max_new_tokens=NUM_TOKENS_TO_GENERATE, do_sample=False, cache_implementation="static" + ) + static_compiled_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) + self.assertEqual(EXPECTED_TEXT_COMPLETION, static_compiled_text) + + @slow + @require_torch_gpu + def test_compile_static_cache_encoder(self): + prompts = [ + "summarize: Simply put, the theory of relativity states that 1) the speed of light is constant in all inertial " + "reference frames, and 2) the laws of physics are the same for all inertial reference frames.\nThe " + "theory of relativity is not hard to grasp.", + "summarize: My favorite all time favorite condiment is ketchup. I love it on everything. 
I love it on my eggs, " + "my fries, my chicken, my burgers, my hot dogs, my sandwiches, my salads, my pizza.", + ] + model = T5EncoderModel.from_pretrained("google-t5/t5-small").to(torch_device) + tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small") + inputs = tokenizer(prompts, return_tensors="pt", padding=True).to(model.device) + + logits = model(**inputs) + + model.forward = torch.compile(model.forward, mode="reduce-overhead", fullgraph=True) + logits_compiled = model(**inputs) + self.assertTrue(torch.allclose(logits[0][:, -3:, -3], logits_compiled[0][:, -3:, -3], atol=1e-5)) + @require_torch class TestAsymmetricT5(unittest.TestCase): diff --git a/tests/models/udop/test_modeling_udop.py b/tests/models/udop/test_modeling_udop.py index a3ae498606a379..9d82173b1aed6c 100644 --- a/tests/models/udop/test_modeling_udop.py +++ b/tests/models/udop/test_modeling_udop.py @@ -37,6 +37,7 @@ if is_torch_available(): import torch + import torch.nn.functional as F from transformers import UdopEncoderModel, UdopForConditionalGeneration, UdopModel, UdopProcessor @@ -348,6 +349,7 @@ def test_forward_signature(self): expected_arg_names = [ "attention_mask", "bbox", + "cache_position", "cross_attn_head_mask", "decoder_attention_mask", "decoder_head_mask", @@ -365,6 +367,43 @@ def test_forward_signature(self): expected_arg_names = sorted(expected_arg_names) self.assertListEqual(sorted(arg_names[: len(expected_arg_names)]), expected_arg_names) + # overwrite because T5 doesn't accept position ids as input and expects `decoder_input_ids` + def test_custom_4d_attention_mask(self): + for model_class in self.all_generative_model_classes: + config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() + model = model_class(config).to(device=torch_device, dtype=torch.float32) + + ( + input_ids, + _, + input_ids_shared_prefix, + mask_shared_prefix, + _, + ) = self._get_custom_4d_mask_test_data() + + logits = model.forward( + decoder_input_ids=input_ids, + input_ids=input_dict["input_ids"][:3], + bbox=input_dict["bbox"][:3], + ).logits + # logits.shape == torch.Size([3, 4, ...]) + + logits_shared_prefix = model( + input_ids=input_dict["input_ids"][:1], + bbox=input_dict["bbox"][:1], + decoder_input_ids=input_ids_shared_prefix, + decoder_attention_mask=mask_shared_prefix, + )[0] + # logits_shared_prefix.shape == torch.Size([1, 6, ...]) + + out_last_tokens = logits[:, -1, :] # last tokens in each batch line + out_shared_prefix_last_tokens = logits_shared_prefix[0, -3:, :] # last three tokens + + # comparing softmax-normalized logits: + normalized_0 = F.softmax(out_last_tokens) + normalized_1 = F.softmax(out_shared_prefix_last_tokens) + torch.testing.assert_close(normalized_0, normalized_1, rtol=1e-3, atol=1e-4) + @unittest.skip( "Not currently compatible. Fails with - NotImplementedError: Cannot copy out of meta tensor; no data!" 
) @@ -534,6 +573,41 @@ def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) + # overwrite because T5 doesn't accept position ids as input and expects `decoder_input_ids` + def test_custom_4d_attention_mask(self): + for model_class in self.all_generative_model_classes: + config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() + model = model_class(config).to(device=torch_device, dtype=torch.float32) + + ( + input_ids, + _, + input_ids_shared_prefix, + mask_shared_prefix, + _, + ) = self._get_custom_4d_mask_test_data() + + logits = model.forward( + decoder_input_ids=input_ids, + input_ids=input_dict["input_ids"][:3], + ).logits + # logits.shape == torch.Size([3, 4, ...]) + + logits_shared_prefix = model( + input_ids=input_dict["input_ids"][:1], + decoder_input_ids=input_ids_shared_prefix, + decoder_attention_mask=mask_shared_prefix, + )[0] + # logits_shared_prefix.shape == torch.Size([1, 6, ...]) + + out_last_tokens = logits[:, -1, :] # last tokens in each batch line + out_shared_prefix_last_tokens = logits_shared_prefix[0, -3:, :] # last three tokens + + # comparing softmax-normalized logits: + normalized_0 = F.softmax(out_last_tokens) + normalized_1 = F.softmax(out_shared_prefix_last_tokens) + torch.testing.assert_close(normalized_0, normalized_1, rtol=1e-3, atol=1e-4) + @unittest.skip( "Not currently compatible. Fails with - NotImplementedError: Cannot copy out of meta tensor; no data!" ) diff --git a/tests/models/umt5/test_modeling_umt5.py b/tests/models/umt5/test_modeling_umt5.py index 1bd01da8e6caec..ec4c1d019b6d17 100644 --- a/tests/models/umt5/test_modeling_umt5.py +++ b/tests/models/umt5/test_modeling_umt5.py @@ -41,6 +41,7 @@ if is_torch_available(): import torch + import torch.nn.functional as F from transformers import ( AutoTokenizer, @@ -316,6 +317,9 @@ class UMT5ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin # The small UMT5 model needs higher percentages for CPU/MP tests model_split_percents = [0.5, 0.8, 0.9] + # used in `test_torch_compile` + _torch_compile_test_ckpt = "google/umt5-small" + def setUp(self): self.model_tester = UMT5ModelTester(self) @@ -486,6 +490,41 @@ def test_inputs_embeds(self): with torch.no_grad(): model(**inputs)[0] + # overwrite because T5 doesn't accept position ids as input and expects `decoder_input_ids` + def test_custom_4d_attention_mask(self): + for model_class in self.all_generative_model_classes: + config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() + model = model_class(config).to(device=torch_device, dtype=torch.float32) + + ( + input_ids, + _, + input_ids_shared_prefix, + mask_shared_prefix, + _, + ) = self._get_custom_4d_mask_test_data() + + logits = model.forward( + decoder_input_ids=input_ids, + input_ids=input_dict["input_ids"][:3], + ).logits + # logits.shape == torch.Size([3, 4, ...]) + + logits_shared_prefix = model( + input_ids=input_dict["input_ids"][:1], + decoder_input_ids=input_ids_shared_prefix, + decoder_attention_mask=mask_shared_prefix, + )[0] + # logits_shared_prefix.shape == torch.Size([1, 6, ...]) + + out_last_tokens = logits[:, -1, :] # last tokens in each batch line + out_shared_prefix_last_tokens = logits_shared_prefix[0, -3:, :] # last three tokens + + # comparing softmax-normalized logits: + normalized_0 = F.softmax(out_last_tokens) + normalized_1 = F.softmax(out_shared_prefix_last_tokens) + torch.testing.assert_close(normalized_0, normalized_1, 
rtol=1e-3, atol=1e-4) + def test_with_sequence_classification_head(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_with_sequence_classification_head(*config_and_inputs) diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index dec1482f562a33..964b7b912b4e0f 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -37,6 +37,7 @@ from transformers import ( AutoModel, AutoModelForCausalLM, + AutoModelForSeq2SeqLM, AutoModelForSequenceClassification, AutoTokenizer, GenerationConfig, @@ -5109,10 +5110,15 @@ def test_torch_compile(self): batch_size = 1 n_iter = 3 - tokenizer = AutoTokenizer.from_pretrained(ckpt, revision=revision) - model = AutoModelForCausalLM.from_pretrained(ckpt, torch_dtype=torch.float16, revision=revision).to( - torch_device - ) + tokenizer = AutoTokenizer.from_pretrained(ckpt) + if self.is_encoder_decoder: + model = AutoModelForSeq2SeqLM.from_pretrained(ckpt, torch_dtype=torch.float16, revision=revision).to( + torch_device + ) + else: + model = AutoModelForCausalLM.from_pretrained(ckpt, torch_dtype=torch.float16, revision=revision).to( + torch_device + ) model.generation_config.max_new_tokens = 4 @@ -5184,10 +5190,15 @@ def test_compile_cuda_graph_time(self): os.environ["TOKENIZERS_PARALLELISM"] = "false" - tokenizer = AutoTokenizer.from_pretrained(ckpt, revision=revision) - model = AutoModelForCausalLM.from_pretrained(ckpt, torch_dtype=torch.float16, revision=revision).to( - torch_device - ) + tokenizer = AutoTokenizer.from_pretrained(ckpt) + if self.is_encoder_decoder: + model = AutoModelForSeq2SeqLM.from_pretrained(ckpt, torch_dtype=torch.float16, revision=revision).to( + torch_device + ) + else: + model = AutoModelForCausalLM.from_pretrained(ckpt, torch_dtype=torch.float16, revision=revision).to( + torch_device + ) cache_implementation = "static" if model.config.model_type == "gemma2": From b644178ed4275e7d44cd0bc29a5a3319cb51bc0d Mon Sep 17 00:00:00 2001 From: Steven Liu <59462357+stevhliu@users.noreply.github.com> Date: Tue, 22 Oct 2024 02:03:25 -0700 Subject: [PATCH 062/385] [docs] Fix GenerationConfig params (#34299) fix generationconfigs --- src/transformers/generation/configuration_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/generation/configuration_utils.py b/src/transformers/generation/configuration_utils.py index 1acd40641132b3..37d57248c46a17 100644 --- a/src/transformers/generation/configuration_utils.py +++ b/src/transformers/generation/configuration_utils.py @@ -93,7 +93,7 @@ class GenerationMode(ExplicitEnum): class GenerationConfig(PushToHubMixin): # no-format - rf""" + """ Class that holds a configuration for a generation task. 
A `generate` call supports the following generation methods for text-decoder, text-to-text, speech-to-text, and vision-to-text models: From 93352e81f5019abaa52f7bdc2e3284779e864367 Mon Sep 17 00:00:00 2001 From: regisss <15324346+regisss@users.noreply.github.com> Date: Tue, 22 Oct 2024 11:05:56 +0200 Subject: [PATCH 063/385] Fix Korean doc _toctree.yml (#34293) Fix korean doc _toctree.yml --- docs/source/ko/_toctree.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/ko/_toctree.yml b/docs/source/ko/_toctree.yml index 883db54c7a3cd7..351f89c7891d59 100644 --- a/docs/source/ko/_toctree.yml +++ b/docs/source/ko/_toctree.yml @@ -673,7 +673,7 @@ - local: in_translation title: (번역중) XLSR-Wav2Vec2 title: (번역중) 오디오 모델 - - isExpanded: false + - isExpanded: false sections: - local: model_doc/vivit title: ViViT From 681fc43713efb34dabf113abf8de51a32e19f539 Mon Sep 17 00:00:00 2001 From: Matt Date: Tue, 22 Oct 2024 13:33:49 +0100 Subject: [PATCH 064/385] Sync video classification pipeline with huggingface_hub spec (#34288) * Sync video classification pipeline * Add disclaimer --- .../pipelines/video_classification.py | 54 ++++++++++++++++--- .../test_pipelines_video_classification.py | 11 +++- tests/test_pipeline_mixin.py | 3 ++ 3 files changed, 61 insertions(+), 7 deletions(-) diff --git a/src/transformers/pipelines/video_classification.py b/src/transformers/pipelines/video_classification.py index 68ea928bce5672..057910098da20a 100644 --- a/src/transformers/pipelines/video_classification.py +++ b/src/transformers/pipelines/video_classification.py @@ -1,3 +1,17 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import warnings from io import BytesIO from typing import List, Union @@ -42,7 +56,7 @@ def __init__(self, *args, **kwargs): requires_backends(self, "av") self.check_model_type(MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING_NAMES) - def _sanitize_parameters(self, top_k=None, num_frames=None, frame_sampling_rate=None): + def _sanitize_parameters(self, top_k=None, num_frames=None, frame_sampling_rate=None, function_to_apply=None): preprocess_params = {} if frame_sampling_rate is not None: preprocess_params["frame_sampling_rate"] = frame_sampling_rate @@ -52,14 +66,23 @@ def _sanitize_parameters(self, top_k=None, num_frames=None, frame_sampling_rate= postprocess_params = {} if top_k is not None: postprocess_params["top_k"] = top_k + if function_to_apply is not None: + if function_to_apply not in ["softmax", "sigmoid", "none"]: + raise ValueError( + f"Invalid value for `function_to_apply`: {function_to_apply}. " + "Valid options are ['softmax', 'sigmoid', 'none']" + ) + postprocess_params["function_to_apply"] = function_to_apply + else: + postprocess_params["function_to_apply"] = "softmax" return preprocess_params, {}, postprocess_params - def __call__(self, videos: Union[str, List[str]], **kwargs): + def __call__(self, inputs: Union[str, List[str]] = None, **kwargs): """ Assign labels to the video(s) passed as inputs. 
Args: - videos (`str`, `List[str]`): + inputs (`str`, `List[str]`): The pipeline handles three types of videos: - A string containing a http link pointing to a video @@ -76,6 +99,11 @@ def __call__(self, videos: Union[str, List[str]], **kwargs): frame_sampling_rate (`int`, *optional*, defaults to 1): The sampling rate used to select frames from the video. If not provided, will default to 1, i.e. every frame will be used. + function_to_apply(`str`, *optional*, defaults to "softmax"): + The function to apply to the model output. By default, the pipeline will apply the softmax function to + the output of the model. Valid options: ["softmax", "sigmoid", "none"]. Note that passing Python's + built-in `None` will default to "softmax", so you need to pass the string "none" to disable any + post-processing. Return: A dictionary or a list of dictionaries containing result. If the input is a single video, will return a @@ -87,7 +115,16 @@ def __call__(self, videos: Union[str, List[str]], **kwargs): - **label** (`str`) -- The label identified by the model. - **score** (`int`) -- The score attributed by the model for that label. """ - return super().__call__(videos, **kwargs) + # After deprecation of this is completed, remove the default `None` value for `images` + if "videos" in kwargs: + warnings.warn( + "The `videos` argument has been renamed to `inputs`. In version 5 of Transformers, `videos` will no longer be accepted", + FutureWarning, + ) + inputs = kwargs.pop("videos") + if inputs is None: + raise ValueError("Cannot call the video-classification pipeline without an inputs argument!") + return super().__call__(inputs, **kwargs) def preprocess(self, video, num_frames=None, frame_sampling_rate=1): if num_frames is None: @@ -114,12 +151,17 @@ def _forward(self, model_inputs): model_outputs = self.model(**model_inputs) return model_outputs - def postprocess(self, model_outputs, top_k=5): + def postprocess(self, model_outputs, top_k=5, function_to_apply="softmax"): if top_k > self.model.config.num_labels: top_k = self.model.config.num_labels if self.framework == "pt": - probs = model_outputs.logits.softmax(-1)[0] + if function_to_apply == "softmax": + probs = model_outputs.logits[0].softmax(-1) + elif function_to_apply == "sigmoid": + probs = model_outputs.logits[0].sigmoid() + else: + probs = model_outputs.logits[0] scores, ids = probs.topk(top_k) else: raise ValueError(f"Unsupported framework: {self.framework}") diff --git a/tests/pipelines/test_pipelines_video_classification.py b/tests/pipelines/test_pipelines_video_classification.py index 8b910e94af3b47..f1ed97ac13df1a 100644 --- a/tests/pipelines/test_pipelines_video_classification.py +++ b/tests/pipelines/test_pipelines_video_classification.py @@ -14,11 +14,12 @@ import unittest -from huggingface_hub import hf_hub_download +from huggingface_hub import VideoClassificationOutputElement, hf_hub_download from transformers import MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING, VideoMAEFeatureExtractor from transformers.pipelines import VideoClassificationPipeline, pipeline from transformers.testing_utils import ( + compare_pipeline_output_to_hub_spec, is_pipeline_test, nested_simplify, require_av, @@ -76,6 +77,8 @@ def run_pipeline_test(self, video_classifier, examples): {"score": ANY(float), "label": ANY(str)}, ], ) + for element in outputs: + compare_pipeline_output_to_hub_spec(element, VideoClassificationOutputElement) @require_torch def test_small_model_pt(self): @@ -93,6 +96,9 @@ def test_small_model_pt(self): nested_simplify(outputs, decimals=4), 
[{"score": 0.5199, "label": "LABEL_0"}, {"score": 0.4801, "label": "LABEL_1"}], ) + for output in outputs: + for element in output: + compare_pipeline_output_to_hub_spec(element, VideoClassificationOutputElement) outputs = video_classifier( [ @@ -108,6 +114,9 @@ def test_small_model_pt(self): [{"score": 0.5199, "label": "LABEL_0"}, {"score": 0.4801, "label": "LABEL_1"}], ], ) + for output in outputs: + for element in output: + compare_pipeline_output_to_hub_spec(element, VideoClassificationOutputElement) @require_tf @unittest.skip diff --git a/tests/test_pipeline_mixin.py b/tests/test_pipeline_mixin.py index 74bc1b8669a702..fe8a197237291a 100644 --- a/tests/test_pipeline_mixin.py +++ b/tests/test_pipeline_mixin.py @@ -34,6 +34,7 @@ ImageToTextInput, ObjectDetectionInput, QuestionAnsweringInput, + VideoClassificationInput, ZeroShotImageClassificationInput, ) @@ -47,6 +48,7 @@ ImageToTextPipeline, ObjectDetectionPipeline, QuestionAnsweringPipeline, + VideoClassificationPipeline, ZeroShotImageClassificationPipeline, ) from transformers.testing_utils import ( @@ -132,6 +134,7 @@ "image-to-text": (ImageToTextPipeline, ImageToTextInput), "object-detection": (ObjectDetectionPipeline, ObjectDetectionInput), "question-answering": (QuestionAnsweringPipeline, QuestionAnsweringInput), + "video-classification": (VideoClassificationPipeline, VideoClassificationInput), "zero-shot-image-classification": (ZeroShotImageClassificationPipeline, ZeroShotImageClassificationInput), } From 84b17e03f1e248cdb73ddb2360929e460cb84797 Mon Sep 17 00:00:00 2001 From: Marc Sun <57196510+SunMarc@users.noreply.github.com> Date: Tue, 22 Oct 2024 15:11:54 +0200 Subject: [PATCH 065/385] Update PR templates (#34065) update PR template --- .github/ISSUE_TEMPLATE/bug-report.yml | 2 +- .github/PULL_REQUEST_TEMPLATE.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/bug-report.yml b/.github/ISSUE_TEMPLATE/bug-report.yml index ea7d6a02252cf5..ecc795ae63253e 100644 --- a/.github/ISSUE_TEMPLATE/bug-report.yml +++ b/.github/ISSUE_TEMPLATE/bug-report.yml @@ -55,7 +55,7 @@ body: - deepspeed: HF Trainer/Accelerate: @muellerzr - ray/raytune: @richardliaw, @amogkam - Big Model Inference: @SunMarc - - quantization (bitsandbytes, autogpt): @SunMarc + - quantization (bitsandbytes, autogpt): @SunMarc @MekkCyber Documentation: @stevhliu diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 417f5a2e45b58c..ee7a7eaae1139b 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -59,7 +59,7 @@ Integrations: - deepspeed: HF Trainer/Accelerate: @muellerzr - ray/raytune: @richardliaw, @amogkam - Big Model Inference: @SunMarc -- quantization (bitsandbytes, autogpt): @SunMarc +- quantization (bitsandbytes, autogpt): @SunMarc @MekkCyber Documentation: @stevhliu From eb6a734995aa2586528e29fd9d0a13a55721444b Mon Sep 17 00:00:00 2001 From: HALLOUARD <57447861+YHallouard@users.noreply.github.com> Date: Tue, 22 Oct 2024 15:14:07 +0200 Subject: [PATCH 066/385] [RT-DETR] Fix onnx inference bug for Optype (Where) (#33877) * feat: [RT-DETR] Add onnx runtime config and fix onnx inference bug Optype (Where) * fix lint * use dtype istead of torch.float32 * add doc * remove onnx config * use dtype info * use tensor to fix lint --- src/transformers/models/rt_detr/modeling_rt_detr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/rt_detr/modeling_rt_detr.py b/src/transformers/models/rt_detr/modeling_rt_detr.py index 
21644c4a869a0a..1c09025a34b140 100644 --- a/src/transformers/models/rt_detr/modeling_rt_detr.py +++ b/src/transformers/models/rt_detr/modeling_rt_detr.py @@ -1752,7 +1752,7 @@ def generate_anchors(self, spatial_shapes=None, grid_size=0.05, device="cpu", dt anchors = torch.concat(anchors, 1) valid_mask = ((anchors > eps) * (anchors < 1 - eps)).all(-1, keepdim=True) anchors = torch.log(anchors / (1 - anchors)) - anchors = torch.where(valid_mask, anchors, torch.inf) + anchors = torch.where(valid_mask, anchors, torch.tensor(torch.finfo(dtype).max, dtype=dtype, device=device)) return anchors, valid_mask From 51e395d13e46a8ecdda2b47381519bdfca87ba4a Mon Sep 17 00:00:00 2001 From: Cyril Vallez Date: Tue, 22 Oct 2024 15:37:21 +0200 Subject: [PATCH 067/385] Fix FA2 attention for models supporting sliding window (#34093) Fix FA2 --- .../models/jamba/modeling_jamba.py | 27 ------------------- .../models/mistral/modeling_mistral.py | 25 ----------------- .../models/mixtral/modeling_mixtral.py | 25 ----------------- src/transformers/models/phi3/modeling_phi3.py | 25 ----------------- .../models/phimoe/modeling_phimoe.py | 25 ----------------- .../models/qwen2/modeling_qwen2.py | 26 ------------------ .../models/qwen2_moe/modeling_qwen2_moe.py | 26 ------------------ .../models/qwen2_vl/modeling_qwen2_vl.py | 25 ----------------- .../models/starcoder2/modeling_starcoder2.py | 26 ------------------ 9 files changed, 230 deletions(-) diff --git a/src/transformers/models/jamba/modeling_jamba.py b/src/transformers/models/jamba/modeling_jamba.py index 818c6acb3f7961..737be17cfc1694 100755 --- a/src/transformers/models/jamba/modeling_jamba.py +++ b/src/transformers/models/jamba/modeling_jamba.py @@ -417,34 +417,7 @@ def forward( key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - kv_seq_len = cache_position[-1] - if past_key_value is not None: - # Activate slicing cache only if the config has a value `sliding_windows` attribute - cache_has_contents = cache_position[0] > 0 - if ( - getattr(self.config, "sliding_window", None) is not None - and kv_seq_len > self.config.sliding_window - and cache_has_contents - ): - slicing_tokens = 1 - self.config.sliding_window - - past_key = past_key_value[self.layer_idx][0] - past_value = past_key_value[self.layer_idx][1] - - past_key = past_key[:, :, slicing_tokens:, :].contiguous() - past_value = past_value[:, :, slicing_tokens:, :].contiguous() - - if past_key.shape[-2] != self.config.sliding_window - 1: - raise ValueError( - f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got" - f" {past_key.shape}" - ) - - if attention_mask is not None: - attention_mask = attention_mask[:, slicing_tokens:] - attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1) - key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx) # repeat k/v heads if n_kv_heads < n_heads diff --git a/src/transformers/models/mistral/modeling_mistral.py b/src/transformers/models/mistral/modeling_mistral.py index 82d087b23cdd2a..ef225e15988237 100644 --- a/src/transformers/models/mistral/modeling_mistral.py +++ b/src/transformers/models/mistral/modeling_mistral.py @@ -320,31 +320,6 @@ def forward( query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) if past_key_value is not None: - # Activate slicing cache only if 
the config has a value `sliding_windows` attribute - cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0 - if ( - getattr(self.config, "sliding_window", None) is not None - and kv_seq_len > self.config.sliding_window - and cache_has_contents - ): - slicing_tokens = 1 - self.config.sliding_window - - past_key = past_key_value[self.layer_idx][0] - past_value = past_key_value[self.layer_idx][1] - - past_key = past_key[:, :, slicing_tokens:, :].contiguous() - past_value = past_value[:, :, slicing_tokens:, :].contiguous() - - if past_key.shape[-2] != self.config.sliding_window - 1: - raise ValueError( - f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got" - f" {past_key.shape}" - ) - - if attention_mask is not None: - attention_mask = attention_mask[:, slicing_tokens:] - attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1) - cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) diff --git a/src/transformers/models/mixtral/modeling_mixtral.py b/src/transformers/models/mixtral/modeling_mixtral.py index 7bf7e3ccd7ca28..3ff851b45ea161 100644 --- a/src/transformers/models/mixtral/modeling_mixtral.py +++ b/src/transformers/models/mixtral/modeling_mixtral.py @@ -431,31 +431,6 @@ def forward( query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) if past_key_value is not None: - # Activate slicing cache only if the config has a value `sliding_windows` attribute - cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0 - if ( - getattr(self.config, "sliding_window", None) is not None - and kv_seq_len > self.config.sliding_window - and cache_has_contents - ): - slicing_tokens = 1 - self.config.sliding_window - - past_key = past_key_value[self.layer_idx][0] - past_value = past_key_value[self.layer_idx][1] - - past_key = past_key[:, :, slicing_tokens:, :].contiguous() - past_value = past_value[:, :, slicing_tokens:, :].contiguous() - - if past_key.shape[-2] != self.config.sliding_window - 1: - raise ValueError( - f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got" - f" {past_key.shape}" - ) - - if attention_mask is not None: - attention_mask = attention_mask[:, slicing_tokens:] - attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1) - cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} # Specific to RoPE models key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) diff --git a/src/transformers/models/phi3/modeling_phi3.py b/src/transformers/models/phi3/modeling_phi3.py index d0865065db1882..16601e1f9957d5 100644 --- a/src/transformers/models/phi3/modeling_phi3.py +++ b/src/transformers/models/phi3/modeling_phi3.py @@ -492,31 +492,6 @@ def forward( query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) if past_key_value is not None: - # Activate slicing cache only if the config has a value `sliding_windows` attribute - cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0 - if ( - getattr(self.config, "sliding_window", None) is not None - and kv_seq_len > self.config.sliding_window - and cache_has_contents - ): - slicing_tokens = 1 - self.config.sliding_window - - past_key = 
past_key_value[self.layer_idx][0] - past_value = past_key_value[self.layer_idx][1] - - past_key = past_key[:, :, slicing_tokens:, :].contiguous() - past_value = past_value[:, :, slicing_tokens:, :].contiguous() - - if past_key.shape[-2] != self.config.sliding_window - 1: - raise ValueError( - f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got" - f" {past_key.shape}" - ) - - if attention_mask is not None: - attention_mask = attention_mask[:, slicing_tokens:] - attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1) - cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} # Specific to RoPE models key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) diff --git a/src/transformers/models/phimoe/modeling_phimoe.py b/src/transformers/models/phimoe/modeling_phimoe.py index e96eae799cda88..559daeca694dbe 100644 --- a/src/transformers/models/phimoe/modeling_phimoe.py +++ b/src/transformers/models/phimoe/modeling_phimoe.py @@ -369,31 +369,6 @@ def forward( query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) if past_key_value is not None: - # Activate slicing cache only if the config has a value `sliding_windows` attribute - cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0 - if ( - getattr(self.config, "sliding_window", None) is not None - and kv_seq_len > self.config.sliding_window - and cache_has_contents - ): - slicing_tokens = 1 - self.config.sliding_window - - past_key = past_key_value[self.layer_idx][0] - past_value = past_key_value[self.layer_idx][1] - - past_key = past_key[:, :, slicing_tokens:, :].contiguous() - past_value = past_value[:, :, slicing_tokens:, :].contiguous() - - if past_key.shape[-2] != self.config.sliding_window - 1: - raise ValueError( - f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got" - f" {past_key.shape}" - ) - - if attention_mask is not None: - attention_mask = attention_mask[:, slicing_tokens:] - attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1) - cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} # Specific to RoPE models key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) diff --git a/src/transformers/models/qwen2/modeling_qwen2.py b/src/transformers/models/qwen2/modeling_qwen2.py index 1941bca17add08..8bd552e66ecbe1 100644 --- a/src/transformers/models/qwen2/modeling_qwen2.py +++ b/src/transformers/models/qwen2/modeling_qwen2.py @@ -394,32 +394,6 @@ def forward( query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) if past_key_value is not None: - # Activate slicing cache only if the config has a value `sliding_windows` attribute - cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0 - kv_seq_len = key_states.shape[-2] + cache_position[0] - if ( - getattr(self.config, "sliding_window", None) is not None - and kv_seq_len > self.config.sliding_window - and cache_has_contents - ): - slicing_tokens = 1 - self.config.sliding_window - - past_key = past_key_value[self.layer_idx][0] - past_value = past_key_value[self.layer_idx][1] - - past_key = past_key[:, :, slicing_tokens:, :].contiguous() - past_value = past_value[:, :, slicing_tokens:, :].contiguous() - - if past_key.shape[-2] != self.config.sliding_window - 1: - raise 
ValueError( - f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got" - f" {past_key.shape}" - ) - - if attention_mask is not None: - attention_mask = attention_mask[:, slicing_tokens:] - attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1) - cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} # Specific to RoPE models key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) diff --git a/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py b/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py index efeb13f90287ba..60cd5e4722857e 100644 --- a/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py +++ b/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py @@ -481,32 +481,6 @@ def forward( query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) if past_key_value is not None: - # Activate slicing cache only if the config has a value `sliding_windows` attribute - cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0 - kv_seq_len = key_states.shape[-2] + cache_position[0] - if ( - getattr(self.config, "sliding_window", None) is not None - and kv_seq_len > self.config.sliding_window - and cache_has_contents - ): - slicing_tokens = 1 - self.config.sliding_window - - past_key = past_key_value[self.layer_idx][0] - past_value = past_key_value[self.layer_idx][1] - - past_key = past_key[:, :, slicing_tokens:, :].contiguous() - past_value = past_value[:, :, slicing_tokens:, :].contiguous() - - if past_key.shape[-2] != self.config.sliding_window - 1: - raise ValueError( - f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got" - f" {past_key.shape}" - ) - - if attention_mask is not None: - attention_mask = attention_mask[:, slicing_tokens:] - attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1) - cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} # Specific to RoPE models key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) diff --git a/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py b/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py index 07531248f63b1d..4e9401c77e4d7d 100644 --- a/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py +++ b/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py @@ -673,31 +673,6 @@ def forward( ) if past_key_value is not None: - # Activate slicing cache only if the config has a value `sliding_windows` attribute - cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0 - if ( - getattr(self.config, "sliding_window", None) is not None - and kv_seq_len > self.config.sliding_window - and cache_has_contents - ): - slicing_tokens = 1 - self.config.sliding_window - - past_key = past_key_value[self.layer_idx][0] - past_value = past_key_value[self.layer_idx][1] - - past_key = past_key[:, :, slicing_tokens:, :].contiguous() - past_value = past_value[:, :, slicing_tokens:, :].contiguous() - - if past_key.shape[-2] != self.config.sliding_window - 1: - raise ValueError( - f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got" - f" {past_key.shape}" - ) - - if attention_mask is not None: - attention_mask = attention_mask[:, slicing_tokens:] - attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1) - 
cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} # Specific to RoPE models key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) diff --git a/src/transformers/models/starcoder2/modeling_starcoder2.py b/src/transformers/models/starcoder2/modeling_starcoder2.py index 1779337b1a0093..b81dac38c7ea55 100644 --- a/src/transformers/models/starcoder2/modeling_starcoder2.py +++ b/src/transformers/models/starcoder2/modeling_starcoder2.py @@ -373,32 +373,6 @@ def forward( query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) if past_key_value is not None: - # Activate slicing cache only if the config has a value `sliding_windows` attribute - cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0 - kv_seq_len = key_states.shape[-2] + cache_position[0] - if ( - getattr(self.config, "sliding_window", None) is not None - and kv_seq_len > self.config.sliding_window - and cache_has_contents - ): - slicing_tokens = 1 - self.config.sliding_window - - past_key = past_key_value[self.layer_idx][0] - past_value = past_key_value[self.layer_idx][1] - - past_key = past_key[:, :, slicing_tokens:, :].contiguous() - past_value = past_value[:, :, slicing_tokens:, :].contiguous() - - if past_key.shape[-2] != self.config.sliding_window - 1: - raise ValueError( - f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got" - f" {past_key.shape}" - ) - - if attention_mask is not None: - attention_mask = attention_mask[:, slicing_tokens:] - attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1) - cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} # Specific to RoPE models key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) From 104599d7a84ae54d954b4fcbed0eaa7d3bc43c86 Mon Sep 17 00:00:00 2001 From: pbelcak <107618858+pbelcak@users.noreply.github.com> Date: Tue, 22 Oct 2024 06:49:21 -0700 Subject: [PATCH 068/385] Fix: tensor of examples of the same length triggers invalid stacking (#34166) * Fix issue where tensor of examples of the same length triggers invalid stacking * Update data_collator.py --- src/transformers/data/data_collator.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/transformers/data/data_collator.py b/src/transformers/data/data_collator.py index 696cedf47d98a0..cc80f6a19bfb26 100644 --- a/src/transformers/data/data_collator.py +++ b/src/transformers/data/data_collator.py @@ -439,7 +439,8 @@ def _torch_collate_batch(examples, tokenizer, pad_to_multiple_of: Optional[int] are_tensors_same_length = all(x.size(0) == length_of_first for x in examples) if are_tensors_same_length and (pad_to_multiple_of is None or length_of_first % pad_to_multiple_of == 0): - return torch.stack(examples, dim=0) + if not isinstance(examples, torch.Tensor): + return torch.stack(examples, dim=0) # If yes, check if we have a `pad_token`. 
if tokenizer._pad_token is None: From c31a6ff474edfb59800024d9b54495f6e398c875 Mon Sep 17 00:00:00 2001 From: Alexandros Benetatos <34627055+alex-bene@users.noreply.github.com> Date: Tue, 22 Oct 2024 16:50:54 +0300 Subject: [PATCH 069/385] Add post_process_depth_estimation to image processors and support ZoeDepth's inference intricacies (#32550) * add colorize_depth and matplotlib availability check * add post_process_depth_estimation for zoedepth + tests * add post_process_depth_estimation for DPT + tests * add post_process_depth_estimation in DepthEstimationPipeline & special case for zoedepth * run `make fixup` * fix import related error on tests * fix more import related errors on test * forgot some `torch` calls in declerations * remove `torch` call in zoedepth tests that caused error * updated docs for depth estimation * small fix for `colorize` input/output types * remove `colorize_depth`, fix various names, remove matplotlib dependency * fix formatting * run fixup * different images for test * update examples in `forward` functions * fixed broken links * fix output types for docs * possible format fix inside `` * Readability related updates Co-authored-by: Pavel Iakubovskii * Readability related update * cleanup after merge * refactor `post_process_depth_estimation` to return dict; simplify ZoeDepth's `post_process_depth_estimation` * rewrite dict merging to support python 3.8 --------- Co-authored-by: Pavel Iakubovskii --- docs/source/en/model_doc/depth_anything.md | 23 ++- docs/source/en/model_doc/depth_anything_v2.md | 23 ++- docs/source/en/model_doc/zoedepth.md | 90 ++++++----- .../en/tasks/monocular_depth_estimation.md | 111 +++----------- .../depth_anything/modeling_depth_anything.py | 16 +- .../models/dpt/image_processing_dpt.py | 55 ++++++- src/transformers/models/dpt/modeling_dpt.py | 16 +- .../zoedepth/image_processing_zoedepth.py | 144 ++++++++++++++++-- .../models/zoedepth/modeling_zoedepth.py | 16 +- .../pipelines/depth_estimation.py | 37 +++-- tests/models/dpt/test_modeling_dpt.py | 26 ++++ .../models/zoedepth/test_modeling_zoedepth.py | 99 ++++++++++++ .../test_pipelines_depth_estimation.py | 2 +- 13 files changed, 446 insertions(+), 212 deletions(-) diff --git a/docs/source/en/model_doc/depth_anything.md b/docs/source/en/model_doc/depth_anything.md index e08e4bfc9904b7..7cdf72de5c8474 100644 --- a/docs/source/en/model_doc/depth_anything.md +++ b/docs/source/en/model_doc/depth_anything.md @@ -84,27 +84,24 @@ If you want to do the pre- and postprocessing yourself, here's how to do that: >>> with torch.no_grad(): ... outputs = model(**inputs) -... predicted_depth = outputs.predicted_depth - ->>> # interpolate to original size ->>> prediction = torch.nn.functional.interpolate( -... predicted_depth.unsqueeze(1), -... size=image.size[::-1], -... mode="bicubic", -... align_corners=False, + +>>> # interpolate to original size and visualize the prediction +>>> post_processed_output = image_processor.post_process_depth_estimation( +... outputs, +... target_sizes=[(image.height, image.width)], ... 
) ->>> # visualize the prediction ->>> output = prediction.squeeze().cpu().numpy() ->>> formatted = (output * 255 / np.max(output)).astype("uint8") ->>> depth = Image.fromarray(formatted) +>>> predicted_depth = post_processed_output[0]["predicted_depth"] +>>> depth = (predicted_depth - predicted_depth.min()) / (predicted_depth.max() - predicted_depth.min()) +>>> depth = depth.detach().cpu().numpy() * 255 +>>> depth = Image.fromarray(depth.astype("uint8")) ``` ## Resources A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with Depth Anything. -- [Monocular depth estimation task guide](../tasks/depth_estimation) +- [Monocular depth estimation task guide](../tasks/monocular_depth_estimation) - A notebook showcasing inference with [`DepthAnythingForDepthEstimation`] can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/Depth%20Anything/Predicting_depth_in_an_image_with_Depth_Anything.ipynb). 🌎 If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource. diff --git a/docs/source/en/model_doc/depth_anything_v2.md b/docs/source/en/model_doc/depth_anything_v2.md index 49f655238efca6..c98017d2bbc510 100644 --- a/docs/source/en/model_doc/depth_anything_v2.md +++ b/docs/source/en/model_doc/depth_anything_v2.md @@ -78,27 +78,24 @@ If you want to do the pre- and post-processing yourself, here's how to do that: >>> with torch.no_grad(): ... outputs = model(**inputs) -... predicted_depth = outputs.predicted_depth - ->>> # interpolate to original size ->>> prediction = torch.nn.functional.interpolate( -... predicted_depth.unsqueeze(1), -... size=image.size[::-1], -... mode="bicubic", -... align_corners=False, + +>>> # interpolate to original size and visualize the prediction +>>> post_processed_output = image_processor.post_process_depth_estimation( +... outputs, +... target_sizes=[(image.height, image.width)], ... ) ->>> # visualize the prediction ->>> output = prediction.squeeze().cpu().numpy() ->>> formatted = (output * 255 / np.max(output)).astype("uint8") ->>> depth = Image.fromarray(formatted) +>>> predicted_depth = post_processed_output[0]["predicted_depth"] +>>> depth = (predicted_depth - predicted_depth.min()) / (predicted_depth.max() - predicted_depth.min()) +>>> depth = depth.detach().cpu().numpy() * 255 +>>> depth = Image.fromarray(depth.astype("uint8")) ``` ## Resources A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with Depth Anything. -- [Monocular depth estimation task guide](../tasks/depth_estimation) +- [Monocular depth estimation task guide](../tasks/monocular_depth_estimation) - [Depth Anything V2 demo](https://huggingface.co/spaces/depth-anything/Depth-Anything-V2). - A notebook showcasing inference with [`DepthAnythingForDepthEstimation`] can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/Depth%20Anything/Predicting_depth_in_an_image_with_Depth_Anything.ipynb). 🌎 - [Core ML conversion of the `small` variant for use on Apple Silicon](https://huggingface.co/apple/coreml-depth-anything-v2-small). 
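Taken together, the two Depth Anything doc updates above walk through the same flow: preprocess, forward pass, then `post_process_depth_estimation`. A minimal, self-contained sketch of that flow outside the docs could look like the following; the checkpoint name and the `image_a`/`image_b` variables are assumptions for illustration, not part of this patch:

```python
import torch
from transformers import AutoImageProcessor, AutoModelForDepthEstimation

# Assumed checkpoint; any Depth Anything / DPT depth-estimation checkpoint follows the same API.
checkpoint = "depth-anything/Depth-Anything-V2-Small-hf"
image_processor = AutoImageProcessor.from_pretrained(checkpoint)
model = AutoModelForDepthEstimation.from_pretrained(checkpoint)

images = [image_a, image_b]  # two PIL images, assumed to be loaded elsewhere
inputs = image_processor(images=images, return_tensors="pt")

with torch.no_grad():
    outputs = model(**inputs)

# One dict per input image, each holding a "predicted_depth" tensor at the requested size.
results = image_processor.post_process_depth_estimation(
    outputs, target_sizes=[(img.height, img.width) for img in images]
)
for result in results:
    print(result["predicted_depth"].shape)
```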
diff --git a/docs/source/en/model_doc/zoedepth.md b/docs/source/en/model_doc/zoedepth.md index d16da59ea98245..74e25f3c3f6ea5 100644 --- a/docs/source/en/model_doc/zoedepth.md +++ b/docs/source/en/model_doc/zoedepth.md @@ -39,54 +39,66 @@ The original code can be found [here](https://github.com/isl-org/ZoeDepth). The easiest to perform inference with ZoeDepth is by leveraging the [pipeline API](../main_classes/pipelines.md): ```python -from transformers import pipeline -from PIL import Image -import requests +>>> from transformers import pipeline +>>> from PIL import Image +>>> import requests -url = "http://images.cocodataset.org/val2017/000000039769.jpg" -image = Image.open(requests.get(url, stream=True).raw) +>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" +>>> image = Image.open(requests.get(url, stream=True).raw) -pipe = pipeline(task="depth-estimation", model="Intel/zoedepth-nyu-kitti") -result = pipe(image) -depth = result["depth"] +>>> pipe = pipeline(task="depth-estimation", model="Intel/zoedepth-nyu-kitti") +>>> result = pipe(image) +>>> depth = result["depth"] ``` Alternatively, one can also perform inference using the classes: ```python -from transformers import AutoImageProcessor, ZoeDepthForDepthEstimation -import torch -import numpy as np -from PIL import Image -import requests - -url = "http://images.cocodataset.org/val2017/000000039769.jpg" -image = Image.open(requests.get(url, stream=True).raw) - -image_processor = AutoImageProcessor.from_pretrained("Intel/zoedepth-nyu-kitti") -model = ZoeDepthForDepthEstimation.from_pretrained("Intel/zoedepth-nyu-kitti") - -# prepare image for the model -inputs = image_processor(images=image, return_tensors="pt") - -with torch.no_grad(): - outputs = model(**inputs) - predicted_depth = outputs.predicted_depth - -# interpolate to original size -prediction = torch.nn.functional.interpolate( - predicted_depth.unsqueeze(1), - size=image.size[::-1], - mode="bicubic", - align_corners=False, -) - -# visualize the prediction -output = prediction.squeeze().cpu().numpy() -formatted = (output * 255 / np.max(output)).astype("uint8") -depth = Image.fromarray(formatted) +>>> from transformers import AutoImageProcessor, ZoeDepthForDepthEstimation +>>> import torch +>>> import numpy as np +>>> from PIL import Image +>>> import requests + +>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" +>>> image = Image.open(requests.get(url, stream=True).raw) + +>>> image_processor = AutoImageProcessor.from_pretrained("Intel/zoedepth-nyu-kitti") +>>> model = ZoeDepthForDepthEstimation.from_pretrained("Intel/zoedepth-nyu-kitti") + +>>> # prepare image for the model +>>> inputs = image_processor(images=image, return_tensors="pt") + +>>> with torch.no_grad(): +... outputs = model(pixel_values) + +>>> # interpolate to original size and visualize the prediction +>>> ## ZoeDepth dynamically pads the input image. Thus we pass the original image size as argument +>>> ## to `post_process_depth_estimation` to remove the padding and resize to original dimensions. +>>> post_processed_output = image_processor.post_process_depth_estimation( +... outputs, +... source_sizes=[(image.height, image.width)], +... ) + +>>> predicted_depth = post_processed_output[0]["predicted_depth"] +>>> depth = (predicted_depth - predicted_depth.min()) / (predicted_depth.max() - predicted_depth.min()) +>>> depth = depth.detach().cpu().numpy() * 255 +>>> depth = Image.fromarray(depth.astype("uint8")) ``` + +

+<Tip>
+<p>In the original implementation, the ZoeDepth model performs inference on both the original and flipped images and averages out the results. The <code>post_process_depth_estimation</code> function can handle this for us by passing the flipped outputs to the optional <code>outputs_flipped</code> argument:</p>
+<pre><code class="language-Python">&gt;&gt;&gt; with torch.no_grad():
+...     outputs = model(pixel_values)
+...     outputs_flipped = model(pixel_values=torch.flip(inputs.pixel_values, dims=[3]))
+&gt;&gt;&gt; post_processed_output = image_processor.post_process_depth_estimation(
+...     outputs,
+...     source_sizes=[(image.height, image.width)],
+...     outputs_flipped=outputs_flipped,
+... )
+</code></pre>
+</Tip>
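Note that the snippets on this page pass `pixel_values` to the model while only `inputs` is created by the image processor; assuming the processor output is used directly, the missing wiring is simply:

```python
pixel_values = inputs.pixel_values  # tensor produced by the image processor above

with torch.no_grad():
    outputs = model(pixel_values)
```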
+ + ## Resources A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with ZoeDepth. diff --git a/docs/source/en/tasks/monocular_depth_estimation.md b/docs/source/en/tasks/monocular_depth_estimation.md index e28bc86bc5d95a..3ded3179154aae 100644 --- a/docs/source/en/tasks/monocular_depth_estimation.md +++ b/docs/source/en/tasks/monocular_depth_estimation.md @@ -126,97 +126,34 @@ Pass the prepared inputs through the model: ... outputs = model(pixel_values) ``` -Let's post-process and visualize the results. - -We need to pad and then resize the outputs so that predicted depth map has the same dimension as the original image. After resizing we will remove the padded regions from the depth. +Let's post-process the results to remove any padding and resize the depth map to match the original image size. The `post_process_depth_estimation` outputs a list of dicts containing the `"predicted_depth"`. ```py ->>> import numpy as np ->>> import torch.nn.functional as F - ->>> predicted_depth = outputs.predicted_depth.unsqueeze(dim=1) ->>> height, width = pixel_values.shape[2:] - ->>> height_padding_factor = width_padding_factor = 3 ->>> pad_h = int(np.sqrt(height/2) * height_padding_factor) ->>> pad_w = int(np.sqrt(width/2) * width_padding_factor) - ->>> if predicted_depth.shape[-2:] != pixel_values.shape[-2:]: ->>> predicted_depth = F.interpolate(predicted_depth, size= (height, width), mode='bicubic', align_corners=False) - ->>> if pad_h > 0: - predicted_depth = predicted_depth[:, :, pad_h:-pad_h,:] ->>> if pad_w > 0: - predicted_depth = predicted_depth[:, :, :, pad_w:-pad_w] +>>> # ZoeDepth dynamically pads the input image. Thus we pass the original image size as argument +>>> # to `post_process_depth_estimation` to remove the padding and resize to original dimensions. +>>> post_processed_output = image_processor.post_process_depth_estimation( +... outputs, +... source_sizes=[(image.height, image.width)], +... ) + +>>> predicted_depth = post_processed_output[0]["predicted_depth"] +>>> depth = (predicted_depth - predicted_depth.min()) / (predicted_depth.max() - predicted_depth.min()) +>>> depth = depth.detach().cpu().numpy() * 255 +>>> depth = Image.fromarray(depth.astype("uint8")) ``` -We can now visualize the results (the function below is taken from the [GaussianObject](https://github.com/GaussianObject/GaussianObject/blob/ad6629efadb57902d5f8bc0fa562258029a4bdf1/pred_monodepth.py#L11) framework). - -```py -import matplotlib - -def colorize(value, vmin=None, vmax=None, cmap='gray_r', invalid_val=-99, invalid_mask=None, background_color=(128, 128, 128, 255), gamma_corrected=False, value_transform=None): - """Converts a depth map to a color image. - - Args: - value (torch.Tensor, numpy.ndarray): Input depth map. Shape: (H, W) or (1, H, W) or (1, 1, H, W). All singular dimensions are squeezed - vmin (float, optional): vmin-valued entries are mapped to start color of cmap. If None, value.min() is used. Defaults to None. - vmax (float, optional): vmax-valued entries are mapped to end color of cmap. If None, value.max() is used. Defaults to None. - cmap (str, optional): matplotlib colormap to use. Defaults to 'magma_r'. - invalid_val (int, optional): Specifies value of invalid pixels that should be colored as 'background_color'. Defaults to -99. - invalid_mask (numpy.ndarray, optional): Boolean mask for invalid regions. Defaults to None. - background_color (tuple[int], optional): 4-tuple RGB color to give to invalid pixels. Defaults to (128, 128, 128, 255). 
- gamma_corrected (bool, optional): Apply gamma correction to colored image. Defaults to False. - value_transform (Callable, optional): Apply transform function to valid pixels before coloring. Defaults to None. - - Returns: - numpy.ndarray, dtype - uint8: Colored depth map. Shape: (H, W, 4) - """ - if isinstance(value, torch.Tensor): - value = value.detach().cpu().numpy() - - value = value.squeeze() - if invalid_mask is None: - invalid_mask = value == invalid_val - mask = np.logical_not(invalid_mask) - - # normalize - vmin = np.percentile(value[mask],2) if vmin is None else vmin - vmax = np.percentile(value[mask],85) if vmax is None else vmax - if vmin != vmax: - value = (value - vmin) / (vmax - vmin) # vmin..vmax - else: - # Avoid 0-division - value = value * 0. - - # squeeze last dim if it exists - # grey out the invalid values - - value[invalid_mask] = np.nan - cmapper = matplotlib.colormaps.get_cmap(cmap) - if value_transform: - value = value_transform(value) - # value = value / value.max() - value = cmapper(value, bytes=True) # (nxmx4) - - # img = value[:, :, :] - img = value[...] - img[invalid_mask] = background_color - - # return img.transpose((2, 0, 1)) - if gamma_corrected: - # gamma correction - img = img / 255 - img = np.power(img, 2.2) - img = img * 255 - img = img.astype(np.uint8) - return img - ->>> result = colorize(predicted_depth.cpu().squeeze().numpy()) ->>> Image.fromarray(result) -``` - - + +

+<Tip>
+<p>In the original implementation, the ZoeDepth model performs inference on both the original and flipped images and averages out the results. The <code>post_process_depth_estimation</code> function can handle this for us by passing the flipped outputs to the optional <code>outputs_flipped</code> argument:</p>
+<pre><code class="language-Python">&gt;&gt;&gt; with torch.no_grad():
+...     outputs = model(pixel_values)
+...     outputs_flipped = model(pixel_values=torch.flip(inputs.pixel_values, dims=[3]))
+&gt;&gt;&gt; post_processed_output = image_processor.post_process_depth_estimation(
+...     outputs,
+...     source_sizes=[(image.height, image.width)],
+...     outputs_flipped=outputs_flipped,
+... )
+</code></pre>
+</Tip>
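Besides flip averaging, the `post_process_depth_estimation` method added in this patch also accepts `target_sizes`, so the depth map can be resized for display in the same call. A small sketch reusing `model`, `image_processor`, `pixel_values`, and `image` from the guide above (the half-resolution target is just an example):

```python
with torch.no_grad():
    outputs = model(pixel_values)

# `source_sizes` lets ZoeDepth strip its internal padding; `target_sizes` resizes the result.
post_processed_output = image_processor.post_process_depth_estimation(
    outputs,
    source_sizes=[(image.height, image.width)],
    target_sizes=[(image.height // 2, image.width // 2)],
)
preview_depth = post_processed_output[0]["predicted_depth"]  # roughly (image.height // 2, image.width // 2)
```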
Depth estimation visualization diff --git a/src/transformers/models/depth_anything/modeling_depth_anything.py b/src/transformers/models/depth_anything/modeling_depth_anything.py index e24b38be646665..59c628786328e6 100644 --- a/src/transformers/models/depth_anything/modeling_depth_anything.py +++ b/src/transformers/models/depth_anything/modeling_depth_anything.py @@ -413,20 +413,18 @@ def forward( >>> with torch.no_grad(): ... outputs = model(**inputs) - ... predicted_depth = outputs.predicted_depth >>> # interpolate to original size - >>> prediction = torch.nn.functional.interpolate( - ... predicted_depth.unsqueeze(1), - ... size=image.size[::-1], - ... mode="bicubic", - ... align_corners=False, + >>> post_processed_output = image_processor.post_process_depth_estimation( + ... outputs, + ... target_sizes=[(image.height, image.width)], ... ) >>> # visualize the prediction - >>> output = prediction.squeeze().cpu().numpy() - >>> formatted = (output * 255 / np.max(output)).astype("uint8") - >>> depth = Image.fromarray(formatted) + >>> predicted_depth = post_processed_output[0]["predicted_depth"] + >>> depth = predicted_depth * 255 / predicted_depth.max() + >>> depth = depth.detach().cpu().numpy() + >>> depth = Image.fromarray(depth.astype("uint8")) ```""" loss = None if labels is not None: diff --git a/src/transformers/models/dpt/image_processing_dpt.py b/src/transformers/models/dpt/image_processing_dpt.py index a263d8a51f424d..20024e5fefa198 100644 --- a/src/transformers/models/dpt/image_processing_dpt.py +++ b/src/transformers/models/dpt/image_processing_dpt.py @@ -15,7 +15,11 @@ """Image processor class for DPT.""" import math -from typing import Dict, Iterable, List, Optional, Tuple, Union +from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple, Union + + +if TYPE_CHECKING: + from ...modeling_outputs import DepthEstimatorOutput import numpy as np @@ -37,7 +41,13 @@ valid_images, validate_preprocess_arguments, ) -from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging +from ...utils import ( + TensorType, + filter_out_non_signature_kwargs, + is_vision_available, + logging, + requires_backends, +) if is_torch_available(): @@ -461,3 +471,44 @@ def post_process_semantic_segmentation(self, outputs, target_sizes: List[Tuple] semantic_segmentation = [semantic_segmentation[i] for i in range(semantic_segmentation.shape[0])] return semantic_segmentation + + def post_process_depth_estimation( + self, + outputs: "DepthEstimatorOutput", + target_sizes: Optional[Union[TensorType, List[Tuple[int, int]], None]] = None, + ) -> List[Dict[str, TensorType]]: + """ + Converts the raw output of [`DepthEstimatorOutput`] into final depth predictions and depth PIL images. + Only supports PyTorch. + + Args: + outputs ([`DepthEstimatorOutput`]): + Raw outputs of the model. + target_sizes (`TensorType` or `List[Tuple[int, int]]`, *optional*): + Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size + (height, width) of each image in the batch. If left to None, predictions will not be resized. + + Returns: + `List[Dict[str, TensorType]]`: A list of dictionaries of tensors representing the processed depth + predictions. 
+ """ + requires_backends(self, "torch") + + predicted_depth = outputs.predicted_depth + + if (target_sizes is not None) and (len(predicted_depth) != len(target_sizes)): + raise ValueError( + "Make sure that you pass in as many target sizes as the batch dimension of the predicted depth" + ) + + results = [] + target_sizes = [None] * len(predicted_depth) if target_sizes is None else target_sizes + for depth, target_size in zip(predicted_depth, target_sizes): + if target_size is not None: + depth = torch.nn.functional.interpolate( + depth.unsqueeze(0).unsqueeze(1), size=target_size, mode="bicubic", align_corners=False + ).squeeze() + + results.append({"predicted_depth": depth}) + + return results diff --git a/src/transformers/models/dpt/modeling_dpt.py b/src/transformers/models/dpt/modeling_dpt.py index 1587493643e99d..2d4654a234c2c6 100755 --- a/src/transformers/models/dpt/modeling_dpt.py +++ b/src/transformers/models/dpt/modeling_dpt.py @@ -1121,20 +1121,18 @@ def forward( >>> with torch.no_grad(): ... outputs = model(**inputs) - ... predicted_depth = outputs.predicted_depth >>> # interpolate to original size - >>> prediction = torch.nn.functional.interpolate( - ... predicted_depth.unsqueeze(1), - ... size=image.size[::-1], - ... mode="bicubic", - ... align_corners=False, + >>> post_processed_output = image_processor.post_process_depth_estimation( + ... outputs, + ... target_sizes=[(image.height, image.width)], ... ) >>> # visualize the prediction - >>> output = prediction.squeeze().cpu().numpy() - >>> formatted = (output * 255 / np.max(output)).astype("uint8") - >>> depth = Image.fromarray(formatted) + >>> predicted_depth = post_processed_output[0]["predicted_depth"] + >>> depth = predicted_depth * 255 / predicted_depth.max() + >>> depth = depth.detach().cpu().numpy() + >>> depth = Image.fromarray(depth.astype("uint8")) ```""" loss = None if labels is not None: diff --git a/src/transformers/models/zoedepth/image_processing_zoedepth.py b/src/transformers/models/zoedepth/image_processing_zoedepth.py index c4314250151852..2211ab07c09d4c 100644 --- a/src/transformers/models/zoedepth/image_processing_zoedepth.py +++ b/src/transformers/models/zoedepth/image_processing_zoedepth.py @@ -15,10 +15,14 @@ """Image processor class for ZoeDepth.""" import math -from typing import Dict, Iterable, List, Optional, Tuple, Union +from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple, Union import numpy as np + +if TYPE_CHECKING: + from .modeling_zoedepth import ZoeDepthDepthEstimatorOutput + from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict from ...image_transforms import PaddingMode, pad, to_channel_dimension_format from ...image_utils import ( @@ -126,10 +130,10 @@ class ZoeDepthImageProcessor(BaseImageProcessor): resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`): Defines the resampling filter to use if resizing the image. Can be overidden by `resample` in `preprocess`. keep_aspect_ratio (`bool`, *optional*, defaults to `True`): - If `True`, the image is resized by choosing the smaller of the height and width scaling factors and using it for - both dimensions. This ensures that the image is scaled down as little as possible while still fitting within the - desired output size. In case `ensure_multiple_of` is also set, the image is further resized to a size that is a - multiple of this value by flooring the height and width to the nearest multiple of this value. 
+ If `True`, the image is resized by choosing the smaller of the height and width scaling factors and using it + for both dimensions. This ensures that the image is scaled down as little as possible while still fitting + within the desired output size. In case `ensure_multiple_of` is also set, the image is further resized to a + size that is a multiple of this value by flooring the height and width to the nearest multiple of this value. Can be overidden by `keep_aspect_ratio` in `preprocess`. ensure_multiple_of (`int`, *optional*, defaults to 32): If `do_resize` is `True`, the image is resized to a size that is a multiple of this value. Works by flooring @@ -331,19 +335,21 @@ def preprocess( do_resize (`bool`, *optional*, defaults to `self.do_resize`): Whether to resize the image. size (`Dict[str, int]`, *optional*, defaults to `self.size`): - Size of the image after resizing. If `keep_aspect_ratio` is `True`, he image is resized by choosing the smaller of - the height and width scaling factors and using it for both dimensions. If `ensure_multiple_of` is also set, - the image is further resized to a size that is a multiple of this value. + Size of the image after resizing. If `keep_aspect_ratio` is `True`, he image is resized by choosing the + smaller of the height and width scaling factors and using it for both dimensions. If `ensure_multiple_of` + is also set, the image is further resized to a size that is a multiple of this value. keep_aspect_ratio (`bool`, *optional*, defaults to `self.keep_aspect_ratio`): - If `True` and `do_resize=True`, the image is resized by choosing the smaller of the height and width scaling factors and using it for - both dimensions. This ensures that the image is scaled down as little as possible while still fitting within the - desired output size. In case `ensure_multiple_of` is also set, the image is further resized to a size that is a - multiple of this value by flooring the height and width to the nearest multiple of this value. + If `True` and `do_resize=True`, the image is resized by choosing the smaller of the height and width + scaling factors and using it for both dimensions. This ensures that the image is scaled down as little + as possible while still fitting within the desired output size. In case `ensure_multiple_of` is also + set, the image is further resized to a size that is a multiple of this value by flooring the height and + width to the nearest multiple of this value. ensure_multiple_of (`int`, *optional*, defaults to `self.ensure_multiple_of`): - If `do_resize` is `True`, the image is resized to a size that is a multiple of this value. Works by flooring - the height and width to the nearest multiple of this value. + If `do_resize` is `True`, the image is resized to a size that is a multiple of this value. Works by + flooring the height and width to the nearest multiple of this value. - Works both with and without `keep_aspect_ratio` being set to `True`. Can be overidden by `ensure_multiple_of` in `preprocess`. + Works both with and without `keep_aspect_ratio` being set to `True`. Can be overidden by + `ensure_multiple_of` in `preprocess`. resample (`int`, *optional*, defaults to `self.resample`): Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`, Only has an effect if `do_resize` is set to `True`. 
@@ -442,3 +448,111 @@ def preprocess( data = {"pixel_values": images} return BatchFeature(data=data, tensor_type=return_tensors) + + def post_process_depth_estimation( + self, + outputs: "ZoeDepthDepthEstimatorOutput", + source_sizes: Optional[Union[TensorType, List[Tuple[int, int]], None]] = None, + target_sizes: Optional[Union[TensorType, List[Tuple[int, int]], None]] = None, + outputs_flipped: Optional[Union["ZoeDepthDepthEstimatorOutput", None]] = None, + do_remove_padding: Optional[Union[bool, None]] = None, + ) -> List[Dict[str, TensorType]]: + """ + Converts the raw output of [`ZoeDepthDepthEstimatorOutput`] into final depth predictions and depth PIL images. + Only supports PyTorch. + + Args: + outputs ([`ZoeDepthDepthEstimatorOutput`]): + Raw outputs of the model. + source_sizes (`TensorType` or `List[Tuple[int, int]]`, *optional*): + Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the source size + (height, width) of each image in the batch before preprocessing. This argument should be dealt as + "required" unless the user passes `do_remove_padding=False` as input to this function. + target_sizes (`TensorType` or `List[Tuple[int, int]]`, *optional*): + Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size + (height, width) of each image in the batch. If left to None, predictions will not be resized. + outputs_flipped ([`ZoeDepthDepthEstimatorOutput`], *optional*): + Raw outputs of the model from flipped input (averaged out in the end). + do_remove_padding (`bool`, *optional*): + By default ZoeDepth addes padding equal to `int(√(height / 2) * 3)` (and similarly for width) to fix the + boundary artifacts in the output depth map, so we need remove this padding during post_processing. The + parameter exists here in case the user changed the image preprocessing to not include padding. + + Returns: + `List[Dict[str, TensorType]]`: A list of dictionaries of tensors representing the processed depth + predictions. 
+ """ + requires_backends(self, "torch") + + predicted_depth = outputs.predicted_depth + + if (outputs_flipped is not None) and (predicted_depth.shape != outputs_flipped.predicted_depth.shape): + raise ValueError("Make sure that `outputs` and `outputs_flipped` have the same shape") + + if (target_sizes is not None) and (len(predicted_depth) != len(target_sizes)): + raise ValueError( + "Make sure that you pass in as many target sizes as the batch dimension of the predicted depth" + ) + + if do_remove_padding is None: + do_remove_padding = self.do_pad + + if source_sizes is None and do_remove_padding: + raise ValueError( + "Either `source_sizes` should be passed in, or `do_remove_padding` should be set to False" + ) + + if (source_sizes is not None) and (len(predicted_depth) != len(source_sizes)): + raise ValueError( + "Make sure that you pass in as many source image sizes as the batch dimension of the logits" + ) + + if outputs_flipped is not None: + predicted_depth = (predicted_depth + torch.flip(outputs_flipped.predicted_depth, dims=[-1])) / 2 + + predicted_depth = predicted_depth.unsqueeze(1) + + # Zoe Depth model adds padding around the images to fix the boundary artifacts in the output depth map + # The padding length is `int(np.sqrt(img_h/2) * fh)` for the height and similar for the width + # fh (and fw respectively) are equal to '3' by default + # Check [here](https://github.com/isl-org/ZoeDepth/blob/edb6daf45458569e24f50250ef1ed08c015f17a7/zoedepth/models/depth_model.py#L57) + # for the original implementation. + # In this section, we remove this padding to get the final depth image and depth prediction + padding_factor_h = padding_factor_w = 3 + + results = [] + target_sizes = [None] * len(predicted_depth) if target_sizes is None else target_sizes + source_sizes = [None] * len(predicted_depth) if source_sizes is None else source_sizes + for depth, target_size, source_size in zip(predicted_depth, target_sizes, source_sizes): + # depth.shape = [1, H, W] + if source_size is not None: + pad_h = pad_w = 0 + + if do_remove_padding: + pad_h = int(np.sqrt(source_size[0] / 2) * padding_factor_h) + pad_w = int(np.sqrt(source_size[1] / 2) * padding_factor_w) + + depth = nn.functional.interpolate( + depth.unsqueeze(1), + size=[source_size[0] + 2 * pad_h, source_size[1] + 2 * pad_w], + mode="bicubic", + align_corners=False, + ) + + if pad_h > 0: + depth = depth[:, :, pad_h:-pad_h, :] + if pad_w > 0: + depth = depth[:, :, :, pad_w:-pad_w] + + depth = depth.squeeze(1) + # depth.shape = [1, H, W] + if target_size is not None: + target_size = [target_size[0], target_size[1]] + depth = nn.functional.interpolate( + depth.unsqueeze(1), size=target_size, mode="bicubic", align_corners=False + ) + depth = depth.squeeze() + # depth.shape = [H, W] + results.append({"predicted_depth": depth}) + + return results diff --git a/src/transformers/models/zoedepth/modeling_zoedepth.py b/src/transformers/models/zoedepth/modeling_zoedepth.py index 2a00487c1b4b90..979b78aba678a5 100644 --- a/src/transformers/models/zoedepth/modeling_zoedepth.py +++ b/src/transformers/models/zoedepth/modeling_zoedepth.py @@ -1338,20 +1338,18 @@ def forward( >>> with torch.no_grad(): ... outputs = model(**inputs) - ... predicted_depth = outputs.predicted_depth >>> # interpolate to original size - >>> prediction = torch.nn.functional.interpolate( - ... predicted_depth.unsqueeze(1), - ... size=image.size[::-1], - ... mode="bicubic", - ... 
align_corners=False, + >>> post_processed_output = image_processor.post_process_depth_estimation( + ... outputs, + ... source_sizes=[(image.height, image.width)], ... ) >>> # visualize the prediction - >>> output = prediction.squeeze().cpu().numpy() - >>> formatted = (output * 255 / np.max(output)).astype("uint8") - >>> depth = Image.fromarray(formatted) + >>> predicted_depth = post_processed_output[0]["predicted_depth"] + >>> depth = predicted_depth * 255 / predicted_depth.max() + >>> depth = depth.detach().cpu().numpy() + >>> depth = Image.fromarray(depth.astype("uint8")) ```""" loss = None if labels is not None: diff --git a/src/transformers/pipelines/depth_estimation.py b/src/transformers/pipelines/depth_estimation.py index f70f8d85c15db8..ae86c552a720af 100644 --- a/src/transformers/pipelines/depth_estimation.py +++ b/src/transformers/pipelines/depth_estimation.py @@ -1,9 +1,13 @@ import warnings from typing import List, Union -import numpy as np - -from ..utils import add_end_docstrings, is_torch_available, is_vision_available, logging, requires_backends +from ..utils import ( + add_end_docstrings, + is_torch_available, + is_vision_available, + logging, + requires_backends, +) from .base import Pipeline, build_pipeline_init_args @@ -13,8 +17,6 @@ from ..image_utils import load_image if is_torch_available(): - import torch - from ..models.auto.modeling_auto import MODEL_FOR_DEPTH_ESTIMATION_MAPPING_NAMES logger = logging.get_logger(__name__) @@ -114,14 +116,19 @@ def _forward(self, model_inputs): return model_outputs def postprocess(self, model_outputs): - predicted_depth = model_outputs.predicted_depth - prediction = torch.nn.functional.interpolate( - predicted_depth.unsqueeze(1), size=model_outputs["target_size"], mode="bicubic", align_corners=False + outputs = self.image_processor.post_process_depth_estimation( + model_outputs, + # this acts as `source_sizes` for ZoeDepth and as `target_sizes` for the rest of the models so do *not* + # replace with `target_sizes = [model_outputs["target_size"]]` + [model_outputs["target_size"]], ) - output = prediction.squeeze().cpu().numpy() - formatted = (output * 255 / np.max(output)).astype("uint8") - depth = Image.fromarray(formatted) - output_dict = {} - output_dict["predicted_depth"] = predicted_depth - output_dict["depth"] = depth - return output_dict + + formatted_outputs = [] + for output in outputs: + depth = output["predicted_depth"].detach().cpu().numpy() + depth = (depth - depth.min()) / (depth.max() - depth.min()) + depth = Image.fromarray((depth * 255).astype("uint8")) + + formatted_outputs.append({"predicted_depth": output["predicted_depth"], "depth": depth}) + + return formatted_outputs[0] if len(outputs) == 1 else formatted_outputs diff --git a/tests/models/dpt/test_modeling_dpt.py b/tests/models/dpt/test_modeling_dpt.py index 5232b4cf462d8c..376ea8b310080d 100644 --- a/tests/models/dpt/test_modeling_dpt.py +++ b/tests/models/dpt/test_modeling_dpt.py @@ -384,3 +384,29 @@ def test_post_processing_semantic_segmentation(self): segmentation = image_processor.post_process_semantic_segmentation(outputs=outputs) expected_shape = torch.Size((480, 480)) self.assertEqual(segmentation[0].shape, expected_shape) + + def test_post_processing_depth_estimation(self): + image_processor = DPTImageProcessor.from_pretrained("Intel/dpt-large") + model = DPTForDepthEstimation.from_pretrained("Intel/dpt-large") + + image = prepare_img() + inputs = image_processor(images=image, return_tensors="pt") + + # forward pass + with torch.no_grad(): + outputs = 
model(**inputs) + + predicted_depth = image_processor.post_process_depth_estimation(outputs=outputs)[0]["predicted_depth"] + expected_shape = torch.Size((384, 384)) + self.assertTrue(predicted_depth.shape == expected_shape) + + predicted_depth_l = image_processor.post_process_depth_estimation(outputs=outputs, target_sizes=[(500, 500)]) + predicted_depth_l = predicted_depth_l[0]["predicted_depth"] + expected_shape = torch.Size((500, 500)) + self.assertTrue(predicted_depth_l.shape == expected_shape) + + output_enlarged = torch.nn.functional.interpolate( + predicted_depth.unsqueeze(0).unsqueeze(1), size=(500, 500), mode="bicubic", align_corners=False + ).squeeze() + self.assertTrue(output_enlarged.shape == expected_shape) + self.assertTrue(torch.allclose(predicted_depth_l, output_enlarged, rtol=1e-3)) diff --git a/tests/models/zoedepth/test_modeling_zoedepth.py b/tests/models/zoedepth/test_modeling_zoedepth.py index 571c44f2f47266..a9c1ffb149d8a5 100644 --- a/tests/models/zoedepth/test_modeling_zoedepth.py +++ b/tests/models/zoedepth/test_modeling_zoedepth.py @@ -16,6 +16,8 @@ import unittest +import numpy as np + from transformers import Dinov2Config, ZoeDepthConfig from transformers.file_utils import is_torch_available, is_vision_available from transformers.testing_utils import require_torch, require_vision, slow, torch_device @@ -212,6 +214,25 @@ def prepare_img(): @require_vision @slow class ZoeDepthModelIntegrationTest(unittest.TestCase): + expected_slice_post_processing = { + (False, False): [ + [[1.1348238, 1.1193453, 1.130562], [1.1754476, 1.1613507, 1.1701596], [1.2287744, 1.2101802, 1.2148322]], + [[2.7170, 2.6550, 2.6839], [2.9827, 2.9438, 2.9587], [3.2340, 3.1817, 3.1602]], + ], + (False, True): [ + [[1.0610938, 1.1042216, 1.1429265], [1.1099341, 1.148696, 1.1817775], [1.1656011, 1.1988826, 1.2268101]], + [[2.5848, 2.7391, 2.8694], [2.7882, 2.9872, 3.1244], [2.9436, 3.1812, 3.3188]], + ], + (True, False): [ + [[1.8382794, 1.8380532, 1.8375976], [1.848761, 1.8485023, 1.8479986], [1.8571457, 1.8568444, 1.8562847]], + [[6.2030, 6.1902, 6.1777], [6.2303, 6.2176, 6.2053], [6.2561, 6.2436, 6.2312]], + ], + (True, True): [ + [[1.8306141, 1.8305621, 1.8303483], [1.8410318, 1.8409299, 1.8406585], [1.8492792, 1.8491366, 1.8488203]], + [[6.2616, 6.2520, 6.2435], [6.2845, 6.2751, 6.2667], [6.3065, 6.2972, 6.2887]], + ], + } # (pad, flip) + def test_inference_depth_estimation(self): image_processor = ZoeDepthImageProcessor.from_pretrained("Intel/zoedepth-nyu") model = ZoeDepthForDepthEstimation.from_pretrained("Intel/zoedepth-nyu").to(torch_device) @@ -255,3 +276,81 @@ def test_inference_depth_estimation_multiple_heads(self): ).to(torch_device) self.assertTrue(torch.allclose(outputs.predicted_depth[0, :3, :3], expected_slice, atol=1e-4)) + + def check_target_size( + self, + image_processor, + pad_input, + images, + outputs, + raw_outputs, + raw_outputs_flipped=None, + ): + outputs_large = image_processor.post_process_depth_estimation( + raw_outputs, + [img.size[::-1] for img in images], + outputs_flipped=raw_outputs_flipped, + target_sizes=[tuple(np.array(img.size[::-1]) * 2) for img in images], + do_remove_padding=pad_input, + ) + + for img, out, out_l in zip(images, outputs, outputs_large): + out = out["predicted_depth"] + out_l = out_l["predicted_depth"] + out_l_reduced = torch.nn.functional.interpolate( + out_l.unsqueeze(0).unsqueeze(1), size=img.size[::-1], mode="bicubic", align_corners=False + ) + self.assertTrue((np.array(out_l.shape)[::-1] == np.array(img.size) * 2).all()) + 
self.assertTrue(torch.allclose(out, out_l_reduced, rtol=2e-2)) + + def check_post_processing_test(self, image_processor, images, model, pad_input=True, flip_aug=True): + inputs = image_processor(images=images, return_tensors="pt", do_pad=pad_input).to(torch_device) + + with torch.no_grad(): + raw_outputs = model(**inputs) + raw_outputs_flipped = None + if flip_aug: + raw_outputs_flipped = model(pixel_values=torch.flip(inputs.pixel_values, dims=[3])) + + outputs = image_processor.post_process_depth_estimation( + raw_outputs, + [img.size[::-1] for img in images], + outputs_flipped=raw_outputs_flipped, + do_remove_padding=pad_input, + ) + + expected_slices = torch.tensor(self.expected_slice_post_processing[pad_input, flip_aug]).to(torch_device) + for img, out, expected_slice in zip(images, outputs, expected_slices): + out = out["predicted_depth"] + self.assertTrue(img.size == out.shape[::-1]) + self.assertTrue(torch.allclose(expected_slice, out[:3, :3], rtol=1e-3)) + + self.check_target_size(image_processor, pad_input, images, outputs, raw_outputs, raw_outputs_flipped) + + def test_post_processing_depth_estimation_post_processing_nopad_noflip(self): + images = [prepare_img(), Image.open("./tests/fixtures/tests_samples/COCO/000000004016.png")] + image_processor = ZoeDepthImageProcessor.from_pretrained("Intel/zoedepth-nyu-kitti", keep_aspect_ratio=False) + model = ZoeDepthForDepthEstimation.from_pretrained("Intel/zoedepth-nyu-kitti").to(torch_device) + + self.check_post_processing_test(image_processor, images, model, pad_input=False, flip_aug=False) + + def test_inference_depth_estimation_post_processing_nopad_flip(self): + images = [prepare_img(), Image.open("./tests/fixtures/tests_samples/COCO/000000004016.png")] + image_processor = ZoeDepthImageProcessor.from_pretrained("Intel/zoedepth-nyu-kitti", keep_aspect_ratio=False) + model = ZoeDepthForDepthEstimation.from_pretrained("Intel/zoedepth-nyu-kitti").to(torch_device) + + self.check_post_processing_test(image_processor, images, model, pad_input=False, flip_aug=True) + + def test_inference_depth_estimation_post_processing_pad_noflip(self): + images = [prepare_img(), Image.open("./tests/fixtures/tests_samples/COCO/000000004016.png")] + image_processor = ZoeDepthImageProcessor.from_pretrained("Intel/zoedepth-nyu-kitti", keep_aspect_ratio=False) + model = ZoeDepthForDepthEstimation.from_pretrained("Intel/zoedepth-nyu-kitti").to(torch_device) + + self.check_post_processing_test(image_processor, images, model, pad_input=True, flip_aug=False) + + def test_inference_depth_estimation_post_processing_pad_flip(self): + images = [prepare_img(), Image.open("./tests/fixtures/tests_samples/COCO/000000004016.png")] + image_processor = ZoeDepthImageProcessor.from_pretrained("Intel/zoedepth-nyu-kitti", keep_aspect_ratio=False) + model = ZoeDepthForDepthEstimation.from_pretrained("Intel/zoedepth-nyu-kitti").to(torch_device) + + self.check_post_processing_test(image_processor, images, model, pad_input=True, flip_aug=True) diff --git a/tests/pipelines/test_pipelines_depth_estimation.py b/tests/pipelines/test_pipelines_depth_estimation.py index ce1f83f53e0285..a905aa8169ba2e 100644 --- a/tests/pipelines/test_pipelines_depth_estimation.py +++ b/tests/pipelines/test_pipelines_depth_estimation.py @@ -129,7 +129,7 @@ def test_large_model_pt(self): # This seems flaky. 
# self.assertEqual(outputs["depth"], "1a39394e282e9f3b0741a90b9f108977") - self.assertEqual(nested_simplify(outputs["predicted_depth"].max().item()), 29.304) + self.assertEqual(nested_simplify(outputs["predicted_depth"].max().item()), 29.306) self.assertEqual(nested_simplify(outputs["predicted_depth"].min().item()), 2.662) @require_torch From 7a08a772cc6e506d4cfe8604c45155b7d02ee677 Mon Sep 17 00:00:00 2001 From: Guang Yang <42389959+guangy10@users.noreply.github.com> Date: Tue, 22 Oct 2024 06:52:23 -0700 Subject: [PATCH 070/385] Qwen2.5 is ExecuTorch Compatible (#34102) Qwen2 is ExecuTorch Compatible Co-authored-by: Guang Yang --- tests/models/qwen2/test_modeling_qwen2.py | 55 +++++++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/tests/models/qwen2/test_modeling_qwen2.py b/tests/models/qwen2/test_modeling_qwen2.py index 5e5c42d4c56630..1fee3192a64958 100644 --- a/tests/models/qwen2/test_modeling_qwen2.py +++ b/tests/models/qwen2/test_modeling_qwen2.py @@ -19,8 +19,10 @@ import unittest import pytest +from packaging import version from transformers import AutoTokenizer, Qwen2Config, is_torch_available, set_seed +from transformers.generation.configuration_utils import GenerationConfig from transformers.testing_utils import ( backend_empty_cache, require_bitsandbytes, @@ -648,3 +650,56 @@ def test_speculative_generation(self): del model backend_empty_cache(torch_device) gc.collect() + + @slow + def test_export_static_cache(self): + if version.parse(torch.__version__) < version.parse("2.4.0"): + self.skipTest(reason="This test requires torch >= 2.4 to run.") + + from transformers.integrations.executorch import ( + TorchExportableModuleWithStaticCache, + convert_and_export_with_cache, + ) + + qwen_model = "Qwen/Qwen2.5-0.5B" + + tokenizer = AutoTokenizer.from_pretrained(qwen_model, pad_token="", padding_side="right") + EXPECTED_TEXT_COMPLETION = ["My favourite condiment is 100% sugar. I have a jar of 1000 grams of sugar. 
I use"] + max_generation_length = tokenizer(EXPECTED_TEXT_COMPLETION, return_tensors="pt", padding=True)[ + "input_ids" + ].shape[-1] + + # Load model + device = "cpu" + dtype = torch.bfloat16 + cache_implementation = "static" + attn_implementation = "sdpa" + batch_size = 1 + model = Qwen2ForCausalLM.from_pretrained( + qwen_model, + device_map=device, + torch_dtype=dtype, + attn_implementation=attn_implementation, + generation_config=GenerationConfig( + use_cache=True, + cache_implementation=cache_implementation, + max_length=max_generation_length, + cache_config={ + "batch_size": batch_size, + "max_cache_len": max_generation_length, + }, + ), + ) + + prompt = ["My favourite condiment is "] + prompt_tokens = tokenizer(prompt, return_tensors="pt", padding=True).to(model.device) + prompt_token_ids = prompt_tokens["input_ids"] + max_new_tokens = max_generation_length - prompt_token_ids.shape[-1] + + # Static Cache + export + exported_program = convert_and_export_with_cache(model) + ep_generated_ids = TorchExportableModuleWithStaticCache.generate( + exported_program=exported_program, prompt_token_ids=prompt_token_ids, max_new_tokens=max_new_tokens + ) + ep_generated_text = tokenizer.batch_decode(ep_generated_ids, skip_special_tokens=True) + self.assertEqual(EXPECTED_TEXT_COMPLETION, ep_generated_text) From c14ccbcd64948478a14a6a55ba2d5d788efa72f9 Mon Sep 17 00:00:00 2001 From: Guang Yang <42389959+guangy10@users.noreply.github.com> Date: Tue, 22 Oct 2024 06:53:01 -0700 Subject: [PATCH 071/385] Olmo is ExecuTorch Compatible (#34181) Co-authored-by: Guang Yang --- tests/models/olmo/test_modeling_olmo.py | 64 +++++++++++++++++++++++++ 1 file changed, 64 insertions(+) diff --git a/tests/models/olmo/test_modeling_olmo.py b/tests/models/olmo/test_modeling_olmo.py index e74785e29e9008..fbe73248d00b7c 100644 --- a/tests/models/olmo/test_modeling_olmo.py +++ b/tests/models/olmo/test_modeling_olmo.py @@ -16,9 +16,11 @@ import unittest +from packaging import version from parameterized import parameterized from transformers import OlmoConfig, is_torch_available, set_seed +from transformers.generation.configuration_utils import GenerationConfig from transformers.models.auto.tokenization_auto import AutoTokenizer from transformers.models.gpt_neox.tokenization_gpt_neox_fast import GPTNeoXTokenizerFast from transformers.testing_utils import ( @@ -449,3 +451,65 @@ def test_simple_encode_decode(self): self.assertEqual(rust_tokenizer.encode(" "), [50276]) self.assertEqual(rust_tokenizer.encode(" Hello"), [24387]) + + @slow + def test_export_static_cache(self): + if version.parse(torch.__version__) < version.parse("2.4.0"): + self.skipTest(reason="This test requires torch >= 2.4 to run.") + + from transformers.integrations.executorch import ( + TorchExportableModuleWithStaticCache, + convert_and_export_with_cache, + ) + + olmo_model = "allenai/OLMo-1B-hf" + + tokenizer = AutoTokenizer.from_pretrained(olmo_model, pad_token="", padding_side="right") + EXPECTED_TEXT_COMPLETION = [ + "Simply put, the theory of relativity states that \nthe speed of light is the same in all reference frames.\n\nThe speed of light", + ] + max_generation_length = tokenizer(EXPECTED_TEXT_COMPLETION, return_tensors="pt", padding=True)[ + "input_ids" + ].shape[-1] + + # Load model + device = "cpu" + dtype = torch.bfloat16 + cache_implementation = "static" + attn_implementation = "sdpa" + batch_size = 1 + model = OlmoForCausalLM.from_pretrained( + olmo_model, + device_map=device, + torch_dtype=dtype, + 
attn_implementation=attn_implementation, + generation_config=GenerationConfig( + use_cache=True, + cache_implementation=cache_implementation, + max_length=max_generation_length, + cache_config={ + "batch_size": batch_size, + "max_cache_len": max_generation_length, + }, + ), + ) + + prompts = ["Simply put, the theory of relativity states that "] + prompt_tokens = tokenizer(prompts, return_tensors="pt", padding=True).to(model.device) + prompt_token_ids = prompt_tokens["input_ids"] + max_new_tokens = max_generation_length - prompt_token_ids.shape[-1] + + # Static Cache + eager + eager_generated_ids = model.generate( + **prompt_tokens, max_new_tokens=max_new_tokens, do_sample=False, cache_implementation=cache_implementation + ) + eager_generated_text = tokenizer.batch_decode(eager_generated_ids, skip_special_tokens=True) + self.assertEqual(EXPECTED_TEXT_COMPLETION, eager_generated_text) + + # Static Cache + export + exported_program = convert_and_export_with_cache(model) + ep_generated_ids = TorchExportableModuleWithStaticCache.generate( + exported_program=exported_program, prompt_token_ids=prompt_token_ids, max_new_tokens=max_new_tokens + ) + ep_generated_text = tokenizer.batch_decode(ep_generated_ids, skip_special_tokens=True) + self.assertEqual(EXPECTED_TEXT_COMPLETION, ep_generated_text) From eef6b0ba42c062eb8b2180327045c89199ea93f8 Mon Sep 17 00:00:00 2001 From: Michael Kamerath Date: Tue, 22 Oct 2024 07:56:41 -0600 Subject: [PATCH 072/385] Add option for running ffmpeg_microphone_live as a background process (#32838) * Add option for running ffmpeg_microphone_live as a background process * Code quality checks for audio_utils * Code clean up for audio_utils * Fixing logic in ffmpeg_microphone calls in audio_utils * Allowing any arbitrary arguments to be passed to ffmpeg_microphone_live * Formatting * Fixing last problems with adding ffmpeg_additional_args * Fixing default arguments and formatting issues * Fixing comments for ffmpeg_additional_args * Adding two shorts tests for ffmpeg_microphone_live * Fixing test bug --- src/transformers/pipelines/audio_utils.py | 24 ++++++++++++++++++- ..._pipelines_automatic_speech_recognition.py | 10 +++++++- 2 files changed, 32 insertions(+), 2 deletions(-) diff --git a/src/transformers/pipelines/audio_utils.py b/src/transformers/pipelines/audio_utils.py index 40a0c0811f85d0..4a8a93c9683a82 100644 --- a/src/transformers/pipelines/audio_utils.py +++ b/src/transformers/pipelines/audio_utils.py @@ -51,6 +51,7 @@ def ffmpeg_microphone( chunk_length_s: float, format_for_conversion: str = "f32le", ffmpeg_input_device: Optional[str] = None, + ffmpeg_additional_args: Optional[list[str]] = None, ): """ Helper function to read audio from a microphone using ffmpeg. The default input device will be used unless another @@ -70,6 +71,11 @@ def ffmpeg_microphone( The indentifier of the input device to be used by ffmpeg (i.e. ffmpeg's '-i' argument). If unset, the default input device will be used. See `https://www.ffmpeg.org/ffmpeg-devices.html#Input-Devices` for how to specify and list input devices. + ffmpeg_additional_args (`list[str]`, *optional*): + Additional arguments to pass to ffmpeg, can include arguments like -nostdin for running as a background + process. For example, to pass -nostdin to the ffmpeg process, pass in ["-nostdin"]. If passing in flags + with multiple arguments, use the following convention (eg ["flag", "arg1", "arg2]). 
+ Returns: A generator yielding audio chunks of `chunk_length_s` seconds as `bytes` objects of length `int(round(sampling_rate * chunk_length_s)) * size_of_sample`. @@ -95,6 +101,8 @@ def ffmpeg_microphone( format_ = "dshow" input_ = ffmpeg_input_device or _get_microphone_name() + ffmpeg_additional_args = [] if ffmpeg_additional_args is None else ffmpeg_additional_args + ffmpeg_command = [ "ffmpeg", "-f", @@ -114,6 +122,9 @@ def ffmpeg_microphone( "quiet", "pipe:1", ] + + ffmpeg_command.extend(ffmpeg_additional_args) + chunk_len = int(round(sampling_rate * chunk_length_s)) * size_of_sample iterator = _ffmpeg_stream(ffmpeg_command, chunk_len) for item in iterator: @@ -127,6 +138,7 @@ def ffmpeg_microphone_live( stride_length_s: Optional[Union[Tuple[float, float], float]] = None, format_for_conversion: str = "f32le", ffmpeg_input_device: Optional[str] = None, + ffmpeg_additional_args: Optional[list[str]] = None, ): """ Helper function to read audio from a microphone using ffmpeg. This will output `partial` overlapping chunks starting @@ -153,6 +165,11 @@ def ffmpeg_microphone_live( The identifier of the input device to be used by ffmpeg (i.e. ffmpeg's '-i' argument). If unset, the default input device will be used. See `https://www.ffmpeg.org/ffmpeg-devices.html#Input-Devices` for how to specify and list input devices. + ffmpeg_additional_args (`list[str]`, *optional*): + Additional arguments to pass to ffmpeg, can include arguments like -nostdin for running as a background + process. For example, to pass -nostdin to the ffmpeg process, pass in ["-nostdin"]. If passing in flags + with multiple arguments, use the following convention (eg ["flag", "arg1", "arg2]). + Return: A generator yielding dictionaries of the following form @@ -168,8 +185,13 @@ def ffmpeg_microphone_live( chunk_s = chunk_length_s microphone = ffmpeg_microphone( - sampling_rate, chunk_s, format_for_conversion=format_for_conversion, ffmpeg_input_device=ffmpeg_input_device + sampling_rate, + chunk_s, + format_for_conversion=format_for_conversion, + ffmpeg_input_device=ffmpeg_input_device, + ffmpeg_additional_args=[] if ffmpeg_additional_args is None else ffmpeg_additional_args, ) + if format_for_conversion == "s16le": dtype = np.int16 size_of_sample = 2 diff --git a/tests/pipelines/test_pipelines_automatic_speech_recognition.py b/tests/pipelines/test_pipelines_automatic_speech_recognition.py index 391005b0213709..b21e8cd25f2408 100644 --- a/tests/pipelines/test_pipelines_automatic_speech_recognition.py +++ b/tests/pipelines/test_pipelines_automatic_speech_recognition.py @@ -33,7 +33,7 @@ WhisperForConditionalGeneration, ) from transformers.pipelines import AutomaticSpeechRecognitionPipeline, pipeline -from transformers.pipelines.audio_utils import chunk_bytes_iter +from transformers.pipelines.audio_utils import chunk_bytes_iter, ffmpeg_microphone_live from transformers.pipelines.automatic_speech_recognition import _find_timestamp_sequence, chunk_iter from transformers.testing_utils import ( compare_pipeline_output_to_hub_spec, @@ -1989,3 +1989,11 @@ def test_chunk_bytes_iter_stride_stream(self): ) with self.assertRaises(StopIteration): next(iter_) + + def test_ffmpeg_no_additional_args(self): + mic = ffmpeg_microphone_live(16000, 2.0) + mic.close() + + def test_ffmpeg_additional_args(self): + mic = ffmpeg_microphone_live(16000, 2.0, ffmpeg_additional_args=["-nostdin"]) + mic.close() From 96f67c068b43ef209f1d230d2eda4f1ab27b7550 Mon Sep 17 00:00:00 2001 From: Chinedum Echeta <60179183+cecheta@users.noreply.github.com> Date: 
Tue, 22 Oct 2024 15:34:17 +0100 Subject: [PATCH 073/385] Feature: Add `MLFLOW_MAX_LOG_PARAMS` to `MLflowCallback` (#34279) --- src/transformers/integrations/integration_utils.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/transformers/integrations/integration_utils.py b/src/transformers/integrations/integration_utils.py index 4f7cf3632fe549..a09116552c8e34 100755 --- a/src/transformers/integrations/integration_utils.py +++ b/src/transformers/integrations/integration_utils.py @@ -1218,6 +1218,8 @@ def setup(self, args, state, model): and other parameters are ignored. - **MLFLOW_FLATTEN_PARAMS** (`str`, *optional*, defaults to `False`): Whether to flatten the parameters dictionary before logging. + - **MLFLOW_MAX_LOG_PARAMS** (`int`, *optional*): + Set the maximum number of parameters to log in the run. """ self._log_artifacts = os.getenv("HF_MLFLOW_LOG_ARTIFACTS", "FALSE").upper() in ENV_VARS_TRUE_VALUES self._nested_run = os.getenv("MLFLOW_NESTED_RUN", "FALSE").upper() in ENV_VARS_TRUE_VALUES @@ -1225,6 +1227,7 @@ def setup(self, args, state, model): self._experiment_name = os.getenv("MLFLOW_EXPERIMENT_NAME", None) self._flatten_params = os.getenv("MLFLOW_FLATTEN_PARAMS", "FALSE").upper() in ENV_VARS_TRUE_VALUES self._run_id = os.getenv("MLFLOW_RUN_ID", None) + self._max_log_params = os.getenv("MLFLOW_MAX_LOG_PARAMS", None) # "synchronous" flag is only available with mlflow version >= 2.8.0 # https://github.com/mlflow/mlflow/pull/9705 @@ -1273,6 +1276,13 @@ def setup(self, args, state, model): del combined_dict[name] # MLflow cannot log more than 100 values in one go, so we have to split it combined_dict_items = list(combined_dict.items()) + if self._max_log_params and self._max_log_params.isdigit(): + max_log_params = int(self._max_log_params) + if max_log_params < len(combined_dict_items): + logger.debug( + f"Reducing the number of parameters to log from {len(combined_dict_items)} to {max_log_params}." 
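A minimal sketch of how the new cap is enabled — purely via the environment, before the `Trainer` is created (model and dataset setup omitted; the value 100 is arbitrary):

```python
import os

# Must be set before MLflowCallback.setup() runs, i.e. before training starts.
os.environ["MLFLOW_MAX_LOG_PARAMS"] = "100"

from transformers import Trainer, TrainingArguments

args = TrainingArguments(output_dir="out", report_to="mlflow")
# trainer = Trainer(model=model, args=args, train_dataset=train_ds)
# trainer.train()  # only the first 100 combined config/argument params reach the MLflow run
```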
+ ) + combined_dict_items = combined_dict_items[:max_log_params] for i in range(0, len(combined_dict_items), self._MAX_PARAMS_TAGS_PER_BATCH): if self._async_log: self._ml_flow.log_params( From e7c3fa7f57ea5df2eedc6c7766ade06d75060904 Mon Sep 17 00:00:00 2001 From: Yoni Gozlan <74535834+yonigozlan@users.noreply.github.com> Date: Tue, 22 Oct 2024 11:57:44 -0400 Subject: [PATCH 074/385] Fix continue_final_message for image-text-to-text chat templates (#34236) * fix continue_final_message for vlms * Add one test for vlms continue_final_message chat template --- src/transformers/tokenization_utils_base.py | 5 ++++- tests/models/llava/test_processor_llava.py | 21 +++++++++++++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index b52a93ae94841b..16c05a14028eee 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -1874,7 +1874,10 @@ def apply_chat_template( **template_kwargs, ) if continue_final_message: - final_message = chat[-1]["content"].strip() + final_message = chat[-1]["content"] + if isinstance(final_message, (list, tuple)): + final_message = final_message[-1]["text"] + final_message = final_message.strip() rendered_chat = rendered_chat[: rendered_chat.rindex(final_message) + len(final_message)].rstrip() rendered.append(rendered_chat) diff --git a/tests/models/llava/test_processor_llava.py b/tests/models/llava/test_processor_llava.py index 06a18061579670..d3a66a16df9a64 100644 --- a/tests/models/llava/test_processor_llava.py +++ b/tests/models/llava/test_processor_llava.py @@ -93,3 +93,24 @@ def test_chat_template(self): formatted_prompt = processor.apply_chat_template(messages, add_generation_prompt=True) self.assertEqual(expected_prompt, formatted_prompt) + + def test_chat_template_with_continue_final_message(self): + processor = LlavaProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf") + expected_prompt = "USER: \nDescribe this image. ASSISTANT: There is a dog and" + messages = [ + { + "role": "user", + "content": [ + {"type": "image"}, + {"type": "text", "text": "Describe this image."}, + ], + }, + { + "role": "assistant", + "content": [ + {"type": "text", "text": "There is a dog and"}, + ], + }, + ] + prompt = processor.apply_chat_template(messages, continue_final_message=True) + self.assertEqual(expected_prompt, prompt) From a65a6ce7fece0ec44970b2e142729f33a98ac801 Mon Sep 17 00:00:00 2001 From: Mansu Kim Date: Wed, 23 Oct 2024 01:02:42 +0900 Subject: [PATCH 075/385] fix error in _get_eval_sampler when group_by_length enabled (#34237) * remove self in _get_eval_sampler * remove self in front of _get_eval_sampler --- src/transformers/trainer.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 58a20f66f4e81b..7890e084871a1e 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -965,7 +965,7 @@ def get_train_dataloader(self) -> DataLoader: return self.accelerator.prepare(DataLoader(train_dataset, **dataloader_params)) def _get_eval_sampler(self, eval_dataset: Dataset) -> Optional[torch.utils.data.Sampler]: - if self.eval_dataset is None or not has_length(self.eval_dataset): + if eval_dataset is None or not has_length(eval_dataset): return None # Build the sampler. @@ -986,10 +986,10 @@ def _get_eval_sampler(self, eval_dataset: Dataset) -> Optional[torch.utils.data. 
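The situation this `_get_eval_sampler` fix targets, sketched with hypothetical dataset names: a `Trainer` built without an `eval_dataset`, later evaluated on a dataset passed at call time while `group_by_length` is active:

```python
from transformers import Trainer, TrainingArguments

args = TrainingArguments(output_dir="out", group_by_length=True, per_device_eval_batch_size=8)
# trainer = Trainer(model=model, args=args, train_dataset=train_ds)   # note: no eval_dataset given here
# metrics = trainer.evaluate(eval_dataset=val_ds)
# The length-grouped eval sampler is now built from `val_ds` (the dataset actually passed in)
# rather than from the unset `self.eval_dataset`.
```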
return SequentialSampler(eval_dataset) if self.args.group_by_length: - if is_datasets_available() and isinstance(self.eval_dataset, datasets.Dataset): + if is_datasets_available() and isinstance(eval_dataset, datasets.Dataset): lengths = ( - self.eval_dataset[self.args.length_column_name] - if self.args.length_column_name in self.eval_dataset.column_names + eval_dataset[self.args.length_column_name] + if self.args.length_column_name in eval_dataset.column_names else None ) else: @@ -997,7 +997,7 @@ def _get_eval_sampler(self, eval_dataset: Dataset) -> Optional[torch.utils.data. model_input_name = self.tokenizer.model_input_names[0] if self.tokenizer is not None else None return LengthGroupedSampler( self.args.eval_batch_size, - dataset=self.eval_dataset, + dataset=eval_dataset, lengths=lengths, model_input_name=model_input_name, ) From 688eeac81e6491d31746d8bef88429f96540daad Mon Sep 17 00:00:00 2001 From: Fanli Lin Date: Wed, 23 Oct 2024 00:46:07 +0800 Subject: [PATCH 076/385] [docs] fix typo (#34235) fix typo --- docs/source/en/quicktour.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/source/en/quicktour.md b/docs/source/en/quicktour.md index f6fc66f4b6cb78..404b6eac7fe44b 100755 --- a/docs/source/en/quicktour.md +++ b/docs/source/en/quicktour.md @@ -360,8 +360,8 @@ One particularly cool 🤗 Transformers feature is the ability to save a model a ```py >>> from transformers import AutoModel ->>> tokenizer = AutoTokenizer.from_pretrained(tf_save_directory) ->>> pt_model = AutoModelForSequenceClassification.from_pretrained(tf_save_directory, from_tf=True) +>>> tokenizer = AutoTokenizer.from_pretrained(pt_save_directory) +>>> pt_model = AutoModelForSequenceClassification.from_pretrained(pt_save_directory, from_pt=True) ``` @@ -369,8 +369,8 @@ One particularly cool 🤗 Transformers feature is the ability to save a model a ```py >>> from transformers import TFAutoModel ->>> tokenizer = AutoTokenizer.from_pretrained(pt_save_directory) ->>> tf_model = TFAutoModelForSequenceClassification.from_pretrained(pt_save_directory, from_pt=True) +>>> tokenizer = AutoTokenizer.from_pretrained(tf_save_directory) +>>> tf_model = TFAutoModelForSequenceClassification.from_pretrained(tf_save_directory, from_tf=True) ``` From 4b14aa1bcd0c4bd5fe46c43330ef62d4d0b8f1b4 Mon Sep 17 00:00:00 2001 From: Ahnjj_DEV Date: Wed, 23 Oct 2024 01:46:20 +0900 Subject: [PATCH 077/385] =?UTF-8?q?=F0=9F=8C=90=20[i18n-KO]=20Translated?= =?UTF-8?q?=20`executorch.md`=20to=20Korean=20(#33888)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * docs: ko: executorch.md * Update _toctree.yml * fix: manual edits * Update docs/source/ko/main_classes/executorch.md Co-authored-by: HyeokJun SHIN <96534680+jun048098@users.noreply.github.com> * Update docs/source/ko/_toctree.yml Co-authored-by: Sungmin Oh * Update docs/source/ko/_toctree.yml * Update docs/source/ko/_toctree.yml * Update docs/source/ko/_toctree.yml --------- Co-authored-by: HyeokJun SHIN <96534680+jun048098@users.noreply.github.com> Co-authored-by: Sungmin Oh Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --- docs/source/ko/_toctree.yml | 2 ++ docs/source/ko/main_classes/executorch.md | 33 +++++++++++++++++++++++ 2 files changed, 35 insertions(+) create mode 100644 docs/source/ko/main_classes/executorch.md diff --git a/docs/source/ko/_toctree.yml b/docs/source/ko/_toctree.yml index 351f89c7891d59..217f49c95ceef5 100644 --- a/docs/source/ko/_toctree.yml +++ 
b/docs/source/ko/_toctree.yml @@ -308,6 +308,8 @@ title: Trainer - local: deepspeed title: DeepSpeed + - local: main_classes/executorch + title: ExecuTorch - local: main_classes/feature_extractor title: 특성 추출기 - local: in_translation diff --git a/docs/source/ko/main_classes/executorch.md b/docs/source/ko/main_classes/executorch.md new file mode 100644 index 00000000000000..a94418ece1a7ab --- /dev/null +++ b/docs/source/ko/main_classes/executorch.md @@ -0,0 +1,33 @@ + + + +# ExecuTorch [[executorch]] + +[`ExecuTorch`](https://github.com/pytorch/executorch) 는 웨어러블, 임베디드 장치, 마이크로컨트롤러를 포함한 모바일 및 엣지 장치에서 온디바이스 추론 기능을 가능하게 하는 종합 솔루션입니다. PyTorch 생태계에 속해있으며, 이식성, 생산성, 성능에 중점을 둔 PyTorch 모델 배포를 지원합니다. + +ExecuTorch는 백엔드 위임, 사용자 정의 컴파일러 변환, 메모리 계획 등 모델, 장치 또는 특정 유즈케이스 맞춤 최적화를 수행할 수 있는 진입점을 명확하게 정의합니다. ExecuTorch를 사용해 엣지 장치에서 PyTorch 모델을 실행하는 첫 번째 단계는 모델을 익스포트하는 것입니다. 이 작업은 PyTorch API인 [`torch.export`](https://pytorch.org/docs/stable/export.html)를 사용하여 수행합니다. + + +## ExecuTorch 통합 [[transformers.TorchExportableModuleWithStaticCache]] + +`torch.export`를 사용하여 🤗 Transformers를 익스포트 할 수 있도록 통합 지점이 개발되고 있습니다. 이 통합의 목표는 익스포트뿐만 아니라, 익스포트한 아티팩트가 `ExecuTorch`에서 효율적으로 실행될 수 있도록 더 축소하고 최적화하는 것입니다. 특히 모바일 및 엣지 유즈케이스에 중점을 두고 있습니다. + +[[autodoc]] integrations.executorch.TorchExportableModuleWithStaticCache + - forward + +[[autodoc]] integrations.executorch.convert_and_export_with_cache From b03dc0a87e28601bdfbf2e5380f03e3e3b0fc203 Mon Sep 17 00:00:00 2001 From: Ahnjj_DEV Date: Wed, 23 Oct 2024 01:46:31 +0900 Subject: [PATCH 078/385] =?UTF-8?q?=F0=9F=8C=90=20[i18n-KO]=20Translated?= =?UTF-8?q?=20`bert=20japanese.md`=20to=20Korean=20(#33890)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * docs: ko: bert-japanese.md * Update _toctree.yml * fix: manual edits * Update docs/source/ko/_toctree.yml Co-authored-by: Sungmin Oh * Update docs/source/ko/_toctree.yml Co-authored-by: Sungmin Oh --------- Co-authored-by: Sungmin Oh Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --- docs/source/ko/_toctree.yml | 3 +- docs/source/ko/model_doc/bert-japanese.md | 79 +++++++++++++++++++++++ 2 files changed, 81 insertions(+), 1 deletion(-) create mode 100644 docs/source/ko/model_doc/bert-japanese.md diff --git a/docs/source/ko/_toctree.yml b/docs/source/ko/_toctree.yml index 217f49c95ceef5..e413416d897fa2 100644 --- a/docs/source/ko/_toctree.yml +++ b/docs/source/ko/_toctree.yml @@ -330,8 +330,9 @@ title: (번역중) BERT - local: in_translation title: (번역중) BertGeneration + - local: model_doc/bert-japanese + title: 일본어 Bert - local: in_translation - title: (번역중) BertJapanese - local: model_doc/bertweet title: Bertweet - local: in_translation diff --git a/docs/source/ko/model_doc/bert-japanese.md b/docs/source/ko/model_doc/bert-japanese.md new file mode 100644 index 00000000000000..8c21ef3558908e --- /dev/null +++ b/docs/source/ko/model_doc/bert-japanese.md @@ -0,0 +1,79 @@ + + +# 일본어 BERT (BertJapanese) [[bertjapanese]] + +## 개요 [[overview]] + +일본어 문장에 학습된 BERT 모델 입니다. + +각각 서로 다른 토큰화 방법을 사용하는 두 모델: + +- MeCab와 WordPiece를 사용하여 토큰화합니다. 이를 위해 추가 의존성 [fugashi](https://github.com/polm/fugashi)이 필요합니다. (이는 [MeCab](https://taku910.github.io/mecab/)의 래퍼입니다.) +- 문자 단위로 토큰화합니다. + +*MecabTokenizer*를 사용하려면, 의존성을 설치하기 위해 `pip install transformers["ja"]` (또는 소스에서 설치하는 경우 `pip install -e .["ja"]`) 명령을 실행해야 합니다. + +자세한 내용은 [cl-tohoku 리포지토리](https://github.com/cl-tohoku/bert-japanese)에서 확인하세요. 
+ +MeCab과 WordPiece 토큰화를 사용하는 모델 예시: + +```python +>>> import torch +>>> from transformers import AutoModel, AutoTokenizer + +>>> bertjapanese = AutoModel.from_pretrained("cl-tohoku/bert-base-japanese") +>>> tokenizer = AutoTokenizer.from_pretrained("cl-tohoku/bert-base-japanese") + +>>> ## Input Japanese Text +>>> line = "吾輩は猫である。" + +>>> inputs = tokenizer(line, return_tensors="pt") + +>>> print(tokenizer.decode(inputs["input_ids"][0])) +[CLS] 吾輩 は 猫 で ある 。 [SEP] + +>>> outputs = bertjapanese(**inputs) +``` + +문자 토큰화를 사용하는 모델 예시: + +```python +>>> bertjapanese = AutoModel.from_pretrained("cl-tohoku/bert-base-japanese-char") +>>> tokenizer = AutoTokenizer.from_pretrained("cl-tohoku/bert-base-japanese-char") + +>>> ## Input Japanese Text +>>> line = "吾輩は猫である。" + +>>> inputs = tokenizer(line, return_tensors="pt") + +>>> print(tokenizer.decode(inputs["input_ids"][0])) +[CLS] 吾 輩 は 猫 で あ る 。 [SEP] + +>>> outputs = bertjapanese(**inputs) +``` + + + +이는 토큰화 방법을 제외하고는 BERT와 동일합니다. API 참조 정보는 [BERT 문서](https://huggingface.co/docs/transformers/main/en/model_doc/bert)를 참조하세요. +이 모델은 [cl-tohoku](https://huggingface.co/cl-tohoku)께서 기여하였습니다. + + + + +## BertJapaneseTokenizer + +[[autodoc]] BertJapaneseTokenizer From 644d5287b2807f78ddc8e242cf7c4fb082064e05 Mon Sep 17 00:00:00 2001 From: wony617 <49024958+Jwaminju@users.noreply.github.com> Date: Wed, 23 Oct 2024 01:46:52 +0900 Subject: [PATCH 079/385] =?UTF-8?q?=F0=9F=8C=90=20[i18n-KO]=20Translated?= =?UTF-8?q?=20`model=5Fdoc/bartpho.md`=20to=20Korean=20(#33981)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * docs: ko: model_doc/bartpho.md * feat: nmt draft * Update docs/source/ko/model_doc/bartpho.md * Update docs/source/ko/_toctree.yml Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --------- Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --- docs/source/ko/_toctree.yml | 4 +- docs/source/ko/model_doc/bartpho.md | 86 +++++++++++++++++++++++++++++ 2 files changed, 88 insertions(+), 2 deletions(-) create mode 100644 docs/source/ko/model_doc/bartpho.md diff --git a/docs/source/ko/_toctree.yml b/docs/source/ko/_toctree.yml index e413416d897fa2..e5efd2774de1fa 100644 --- a/docs/source/ko/_toctree.yml +++ b/docs/source/ko/_toctree.yml @@ -324,8 +324,8 @@ title: BART - local: in_translation title: (번역중) BARThez - - local: in_translation - title: (번역중) BARTpho + - local: model_doc/bartpho + title: BARTpho - local: in_translation title: (번역중) BERT - local: in_translation diff --git a/docs/source/ko/model_doc/bartpho.md b/docs/source/ko/model_doc/bartpho.md new file mode 100644 index 00000000000000..a323c28152c5d9 --- /dev/null +++ b/docs/source/ko/model_doc/bartpho.md @@ -0,0 +1,86 @@ + + +# BARTpho [[bartpho]] + +## 개요 [[overview]] + +BARTpho 모델은 Nguyen Luong Tran, Duong Minh Le, Dat Quoc Nguyen에 의해 [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701)에서 제안되었습니다. + +이 논문의 초록은 다음과 같습니다: + +*우리는 BARTpho_word와 BARTpho_syllable의 두 가지 버전으로 BARTpho를 제시합니다. +이는 베트남어를 위해 사전훈련된 최초의 대규모 단일 언어 시퀀스-투-시퀀스 모델입니다. +우리의 BARTpho는 시퀀스-투-시퀀스 디노이징 모델인 BART의 "large" 아키텍처와 사전훈련 방식을 사용하여, 생성형 NLP 작업에 특히 적합합니다. +베트남어 텍스트 요약의 다운스트림 작업 실험에서, +자동 및 인간 평가 모두에서 BARTpho가 강력한 기준인 mBART를 능가하고 최신 성능을 개선했음을 보여줍니다. +우리는 향후 연구 및 베트남어 생성형 NLP 작업의 응용을 촉진하기 위해 BARTpho를 공개합니다.* + +이 모델은 [dqnguyen](https://huggingface.co/dqnguyen)이 기여했습니다. 원본 코드는 [여기](https://github.com/VinAIResearch/BARTpho)에서 찾을 수 있습니다. 
+ +## 사용 예시 [[usage-example]] + +```python +>>> import torch +>>> from transformers import AutoModel, AutoTokenizer + +>>> bartpho = AutoModel.from_pretrained("vinai/bartpho-syllable") + +>>> tokenizer = AutoTokenizer.from_pretrained("vinai/bartpho-syllable") + +>>> line = "Chúng tôi là những nghiên cứu viên." + +>>> input_ids = tokenizer(line, return_tensors="pt") + +>>> with torch.no_grad(): +... features = bartpho(**input_ids) # 이제 모델 출력은 튜플입니다 + +>>> # With TensorFlow 2.0+: +>>> from transformers import TFAutoModel + +>>> bartpho = TFAutoModel.from_pretrained("vinai/bartpho-syllable") +>>> input_ids = tokenizer(line, return_tensors="tf") +>>> features = bartpho(**input_ids) +``` + +## 사용 팁 [[usage-tips]] + +- mBART를 따르며, BARTpho는 BART의 "large" 아키텍처에 인코더와 디코더의 상단에 추가적인 레이어 정규화 레이어를 사용합니다. +따라서 [BART 문서](bart)에 있는 사용 예시를 BARTpho에 맞게 적용하려면 +BART 전용 클래스를 mBART 전용 클래스로 대체하여 조정해야 합니다. +예를 들어: + +```python +>>> from transformers import MBartForConditionalGeneration + +>>> bartpho = MBartForConditionalGeneration.from_pretrained("vinai/bartpho-syllable") +>>> TXT = "Chúng tôi là nghiên cứu viên." +>>> input_ids = tokenizer([TXT], return_tensors="pt")["input_ids"] +>>> logits = bartpho(input_ids).logits +>>> masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item() +>>> probs = logits[0, masked_index].softmax(dim=0) +>>> values, predictions = probs.topk(5) +>>> print(tokenizer.decode(predictions).split()) +``` + +- 이 구현은 토큰화만을 위한 것입니다: "monolingual_vocab_file"은 다국어 + XLM-RoBERTa에서 제공되는 사전훈련된 SentencePiece 모델 + "vocab_file"에서 추출된 베트남어 전용 유형으로 구성됩니다. + 다른 언어들도 이 사전훈련된 다국어 SentencePiece 모델 "vocab_file"을 하위 단어 분할에 사용하면, 자신의 언어 전용 "monolingual_vocab_file"과 함께 BartphoTokenizer를 재사용할 수 있습니다. + +## BartphoTokenizer [[bartphotokenizer]] + +[[autodoc]] BartphoTokenizer From 049682a5a63042f087fb45ff128bfe281b2ff98b Mon Sep 17 00:00:00 2001 From: Vijay Date: Tue, 22 Oct 2024 22:56:16 +0530 Subject: [PATCH 080/385] Example doc for token classification of Llama and Dependent/Copied Models (#34139) * Added Example Doc for token classification on all tokenClassificationModels copied from llama * Refactor code to add code sample docstrings for Gemma and Gemma2 models (including modular Gemma) * Refactor code to update model checkpoint names for Qwen2 models --- src/transformers/models/gemma/modeling_gemma.py | 9 +++++++++ src/transformers/models/gemma/modular_gemma.py | 1 + src/transformers/models/gemma2/modeling_gemma2.py | 9 +++++++++ src/transformers/models/gemma2/modular_gemma2.py | 2 ++ src/transformers/models/llama/modeling_llama.py | 7 +++++++ .../models/mistral/modeling_mistral.py | 7 +++++++ .../models/mixtral/modeling_mixtral.py | 7 +++++++ .../models/nemotron/modeling_nemotron.py | 7 +++++++ .../models/persimmon/modeling_persimmon.py | 14 +++++++++++++- src/transformers/models/qwen2/modeling_qwen2.py | 8 +++++++- .../models/qwen2_moe/modeling_qwen2_moe.py | 8 +++++++- .../models/stablelm/modeling_stablelm.py | 7 +++++++ .../models/starcoder2/modeling_starcoder2.py | 7 +++++++ 13 files changed, 90 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/gemma/modeling_gemma.py b/src/transformers/models/gemma/modeling_gemma.py index 43882e7f8c0596..6f364ffcf7edd8 100644 --- a/src/transformers/models/gemma/modeling_gemma.py +++ b/src/transformers/models/gemma/modeling_gemma.py @@ -39,6 +39,7 @@ ) from ...modeling_utils import PreTrainedModel from ...utils import ( + add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, 
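The docstrings added in this patch render into the standard token-classification snippet; roughly as follows (the checkpoint is the new `_CHECKPOINT_FOR_DOC` value, `num_labels=9` is arbitrary, and the classification head stays randomly initialized until fine-tuned):

```python
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification

checkpoint = "google/gemma-7b"  # gated checkpoint: requires accepting the license on the Hub
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForTokenClassification.from_pretrained(checkpoint, num_labels=9)

inputs = tokenizer("HuggingFace is based in New York City", return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits          # shape: (batch, seq_len, num_labels)
predicted_class_ids = logits.argmax(dim=-1)  # per-token label ids
```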
is_flash_attn_greater_or_equal_2_10, @@ -48,6 +49,9 @@ from .configuration_gemma import GemmaConfig +_CHECKPOINT_FOR_DOC = "google/gemma-7b" + + class GemmaRMSNorm(nn.Module): def __init__(self, dim: int, eps: float = 1e-6): super().__init__() @@ -1233,6 +1237,11 @@ def set_input_embeddings(self, value): self.model.embed_tokens = value @add_start_docstrings_to_model_forward(GEMMA_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TokenClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) def forward( self, input_ids: Optional[torch.LongTensor] = None, diff --git a/src/transformers/models/gemma/modular_gemma.py b/src/transformers/models/gemma/modular_gemma.py index ca1de9a880fef5..c3d780bc571ade 100644 --- a/src/transformers/models/gemma/modular_gemma.py +++ b/src/transformers/models/gemma/modular_gemma.py @@ -49,6 +49,7 @@ SPIECE_UNDERLINE = "▁" +_CHECKPOINT_FOR_DOC = "google/gemma-7b" logger = logging.get_logger(__name__) diff --git a/src/transformers/models/gemma2/modeling_gemma2.py b/src/transformers/models/gemma2/modeling_gemma2.py index 28f5f5da7ba003..467981bb78d025 100644 --- a/src/transformers/models/gemma2/modeling_gemma2.py +++ b/src/transformers/models/gemma2/modeling_gemma2.py @@ -37,6 +37,7 @@ ) from ...modeling_utils import PreTrainedModel from ...utils import ( + add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, is_flash_attn_greater_or_equal, @@ -47,6 +48,9 @@ from .configuration_gemma2 import Gemma2Config +_CHECKPOINT_FOR_DOC = "google/gemma2-7b" + + class Gemma2RMSNorm(nn.Module): def __init__(self, dim: int, eps: float = 1e-6): super().__init__() @@ -1292,6 +1296,11 @@ def set_input_embeddings(self, value): self.model.embed_tokens = value @add_start_docstrings_to_model_forward(GEMMA2_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TokenClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) def forward( self, input_ids: Optional[torch.LongTensor] = None, diff --git a/src/transformers/models/gemma2/modular_gemma2.py b/src/transformers/models/gemma2/modular_gemma2.py index 9d7f047e1a8494..49010152b81cc5 100644 --- a/src/transformers/models/gemma2/modular_gemma2.py +++ b/src/transformers/models/gemma2/modular_gemma2.py @@ -50,6 +50,8 @@ from ...modeling_flash_attention_utils import _flash_attention_forward +_CHECKPOINT_FOR_DOC = "google/gemma2-7b" + logger = logging.get_logger(__name__) diff --git a/src/transformers/models/llama/modeling_llama.py b/src/transformers/models/llama/modeling_llama.py index e9064ff3ae5b22..617ef38e4ae3de 100644 --- a/src/transformers/models/llama/modeling_llama.py +++ b/src/transformers/models/llama/modeling_llama.py @@ -41,6 +41,7 @@ from ...modeling_utils import PreTrainedModel from ...pytorch_utils import ALL_LAYERNORM_LAYERS from ...utils import ( + add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, is_flash_attn_greater_or_equal_2_10, @@ -52,6 +53,7 @@ logger = logging.get_logger(__name__) +_CHECKPOINT_FOR_DOC = "meta-llama/Llama-2-7b-hf" _CONFIG_FOR_DOC = "LlamaConfig" @@ -1446,6 +1448,11 @@ def set_input_embeddings(self, value): self.model.embed_tokens = value @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TokenClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) def forward( self, input_ids: Optional[torch.LongTensor] = None, diff --git 
a/src/transformers/models/mistral/modeling_mistral.py b/src/transformers/models/mistral/modeling_mistral.py index ef225e15988237..f198e4abc85511 100644 --- a/src/transformers/models/mistral/modeling_mistral.py +++ b/src/transformers/models/mistral/modeling_mistral.py @@ -40,6 +40,7 @@ ) from ...modeling_utils import PreTrainedModel from ...utils import ( + add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, is_flash_attn_2_available, @@ -55,6 +56,7 @@ logger = logging.get_logger(__name__) +_CHECKPOINT_FOR_DOC = "mistralai/Mistral-7B-v0.1" _CONFIG_FOR_DOC = "MistralConfig" @@ -1242,6 +1244,11 @@ def set_input_embeddings(self, value): self.model.embed_tokens = value @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TokenClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) def forward( self, input_ids: Optional[torch.LongTensor] = None, diff --git a/src/transformers/models/mixtral/modeling_mixtral.py b/src/transformers/models/mixtral/modeling_mixtral.py index 3ff851b45ea161..f5f11ba995c802 100644 --- a/src/transformers/models/mixtral/modeling_mixtral.py +++ b/src/transformers/models/mixtral/modeling_mixtral.py @@ -41,6 +41,7 @@ from ...modeling_utils import PreTrainedModel from ...pytorch_utils import is_torch_greater_or_equal_than_1_13 from ...utils import ( + add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, is_flash_attn_2_available, @@ -65,6 +66,7 @@ logger = logging.get_logger(__name__) +_CHECKPOINT_FOR_DOC = "mistralai/Mixtral-8x7B-v0.1" _CONFIG_FOR_DOC = "MixtralConfig" @@ -1468,6 +1470,11 @@ def set_input_embeddings(self, value): self.model.embed_tokens = value @add_start_docstrings_to_model_forward(MIXTRAL_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TokenClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) def forward( self, input_ids: Optional[torch.LongTensor] = None, diff --git a/src/transformers/models/nemotron/modeling_nemotron.py b/src/transformers/models/nemotron/modeling_nemotron.py index 6cac7ecdfbe5d9..d5470dbbaa1904 100644 --- a/src/transformers/models/nemotron/modeling_nemotron.py +++ b/src/transformers/models/nemotron/modeling_nemotron.py @@ -39,6 +39,7 @@ from ...modeling_utils import PreTrainedModel from ...pytorch_utils import ALL_LAYERNORM_LAYERS from ...utils import ( + add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, is_flash_attn_greater_or_equal_2_10, @@ -50,6 +51,7 @@ logger = logging.get_logger(__name__) +_CHECKPOINT_FOR_DOC = "nvidia/nemotron-3-8b-base-4k-hf" _CONFIG_FOR_DOC = "NemotronConfig" @@ -1323,6 +1325,11 @@ def set_input_embeddings(self, value): self.model.embed_tokens = value @add_start_docstrings_to_model_forward(NEMOTRON_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TokenClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) def forward( self, input_ids: Optional[torch.LongTensor] = None, diff --git a/src/transformers/models/persimmon/modeling_persimmon.py b/src/transformers/models/persimmon/modeling_persimmon.py index ddd26729164df2..cd580ab0dc0f8c 100644 --- a/src/transformers/models/persimmon/modeling_persimmon.py +++ b/src/transformers/models/persimmon/modeling_persimmon.py @@ -39,12 +39,19 @@ ) from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS from ...modeling_utils import PreTrainedModel -from ...utils import 
add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings +from ...utils import ( + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, +) from .configuration_persimmon import PersimmonConfig logger = logging.get_logger(__name__) +_CHECKPOINT_FOR_DOC = "adept/persimmon-8b-base" _CONFIG_FOR_DOC = "PersimmonConfig" @@ -1120,6 +1127,11 @@ def set_input_embeddings(self, value): self.model.embed_tokens = value @add_start_docstrings_to_model_forward(PERSIMMON_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TokenClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) def forward( self, input_ids: Optional[torch.LongTensor] = None, diff --git a/src/transformers/models/qwen2/modeling_qwen2.py b/src/transformers/models/qwen2/modeling_qwen2.py index 8bd552e66ecbe1..a6e4d12d799b07 100644 --- a/src/transformers/models/qwen2/modeling_qwen2.py +++ b/src/transformers/models/qwen2/modeling_qwen2.py @@ -41,6 +41,7 @@ from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS from ...modeling_utils import PreTrainedModel from ...utils import ( + add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, is_flash_attn_2_available, @@ -58,7 +59,7 @@ logger = logging.get_logger(__name__) -_CHECKPOINT_FOR_DOC = "Qwen/Qwen2-7B-beta" +_CHECKPOINT_FOR_DOC = "Qwen/Qwen2-7B" _CONFIG_FOR_DOC = "Qwen2Config" @@ -1348,6 +1349,11 @@ def set_input_embeddings(self, value): self.model.embed_tokens = value @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TokenClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) def forward( self, input_ids: Optional[torch.LongTensor] = None, diff --git a/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py b/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py index 60cd5e4722857e..d482316b5b8bac 100644 --- a/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py +++ b/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py @@ -41,6 +41,7 @@ from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS from ...modeling_utils import PreTrainedModel from ...utils import ( + add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, is_flash_attn_2_available, @@ -56,7 +57,7 @@ logger = logging.get_logger(__name__) -_CHECKPOINT_FOR_DOC = "Qwen/Qwen1.5-MoE-A2.7B" +_CHECKPOINT_FOR_DOC = "Qwen/Qwen2-57B-A14B" _CONFIG_FOR_DOC = "Qwen2MoeConfig" @@ -1533,6 +1534,11 @@ def set_input_embeddings(self, value): self.model.embed_tokens = value @add_start_docstrings_to_model_forward(QWEN2MOE_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TokenClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) def forward( self, input_ids: Optional[torch.LongTensor] = None, diff --git a/src/transformers/models/stablelm/modeling_stablelm.py b/src/transformers/models/stablelm/modeling_stablelm.py index a2356258ce38ed..004e4ff3f6c030 100755 --- a/src/transformers/models/stablelm/modeling_stablelm.py +++ b/src/transformers/models/stablelm/modeling_stablelm.py @@ -40,6 +40,7 @@ from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS from ...modeling_utils import PreTrainedModel from ...utils import ( + add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, is_flash_attn_2_available, @@ -56,6 +57,7 @@ logger = 
logging.get_logger(__name__) +_CHECKPOINT_FOR_DOC = "stabilityai/stablelm-3b-4e1t" _CONFIG_FOR_DOC = "StableLmConfig" @@ -1396,6 +1398,11 @@ def set_input_embeddings(self, value): self.model.embed_tokens = value @add_start_docstrings_to_model_forward(STABLELM_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TokenClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) def forward( self, input_ids: Optional[torch.LongTensor] = None, diff --git a/src/transformers/models/starcoder2/modeling_starcoder2.py b/src/transformers/models/starcoder2/modeling_starcoder2.py index b81dac38c7ea55..c8f22dee43fe2c 100644 --- a/src/transformers/models/starcoder2/modeling_starcoder2.py +++ b/src/transformers/models/starcoder2/modeling_starcoder2.py @@ -40,6 +40,7 @@ from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS from ...modeling_utils import PreTrainedModel from ...utils import ( + add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, is_flash_attn_2_available, @@ -56,6 +57,7 @@ logger = logging.get_logger(__name__) +_CHECKPOINT_FOR_DOC = "bigcode/starcoder2-7b" _CONFIG_FOR_DOC = "Starcoder2Config" @@ -1316,6 +1318,11 @@ def set_input_embeddings(self, value): self.model.embed_tokens = value @add_start_docstrings_to_model_forward(STARCODER2_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TokenClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) def forward( self, input_ids: Optional[torch.LongTensor] = None, From 5ba85de7a434dc6d81c4d2f9ece0ad930c66d043 Mon Sep 17 00:00:00 2001 From: Steven Liu <59462357+stevhliu@users.noreply.github.com> Date: Wed, 23 Oct 2024 01:52:51 -0700 Subject: [PATCH 081/385] [docs] Fix Korean toctree (#34324) fix --- docs/source/ko/_toctree.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/source/ko/_toctree.yml b/docs/source/ko/_toctree.yml index e5efd2774de1fa..27102f123dd871 100644 --- a/docs/source/ko/_toctree.yml +++ b/docs/source/ko/_toctree.yml @@ -332,7 +332,6 @@ title: (번역중) BertGeneration - local: model_doc/bert-japanese title: 일본어 Bert - - local: in_translation - local: model_doc/bertweet title: Bertweet - local: in_translation From 343c8cb86f2ab6a51e7363ee11f69afb1c9e839e Mon Sep 17 00:00:00 2001 From: Filippos Ventirozos <58438004+FilipposVentirozos@users.noreply.github.com> Date: Wed, 23 Oct 2024 10:15:36 +0100 Subject: [PATCH 082/385] Added Deberta model type support (#34308) * Added Deberta model type for 'add_prefix_space' functionality * housekeeping --------- Co-authored-by: Filippos Ventirozos --- examples/pytorch/token-classification/run_ner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/pytorch/token-classification/run_ner.py b/examples/pytorch/token-classification/run_ner.py index ef1c0ac917b767..d2a4c3dabfd63c 100755 --- a/examples/pytorch/token-classification/run_ner.py +++ b/examples/pytorch/token-classification/run_ner.py @@ -366,7 +366,7 @@ def get_label_list(labels): ) tokenizer_name_or_path = model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path - if config.model_type in {"bloom", "gpt2", "roberta"}: + if config.model_type in {"bloom", "gpt2", "roberta", "deberta"}: tokenizer = AutoTokenizer.from_pretrained( tokenizer_name_or_path, cache_dir=model_args.cache_dir, From 1fb575fcf0ed6d1895a21e1c28b10b6287bd28ec Mon Sep 17 00:00:00 2001 From: Aymeric Roucher <69208727+aymeric-roucher@users.noreply.github.com> Date: Wed, 23 Oct 2024 16:48:21 +0200 
Subject: [PATCH 083/385] Support boolean tool args (#34208) Support boolean tool arguments --- src/transformers/agents/tools.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/agents/tools.py b/src/transformers/agents/tools.py index cfb1e4cf95ced9..a425ffc8f106b2 100644 --- a/src/transformers/agents/tools.py +++ b/src/transformers/agents/tools.py @@ -138,7 +138,7 @@ def validate_arguments(self): "inputs": Dict, "output_type": str, } - authorized_types = ["string", "integer", "number", "image", "audio", "any"] + authorized_types = ["string", "integer", "number", "image", "audio", "any", "boolean"] for attr, expected_type in required_attributes.items(): attr_value = getattr(self, attr, None) From d9f733625c43158f3fa52377f2f8bf49350160f3 Mon Sep 17 00:00:00 2001 From: Zach Mueller Date: Wed, 23 Oct 2024 11:24:57 -0400 Subject: [PATCH 084/385] Enable Gradient Accumulation fix across all models + trainer fully in forward() (#34283) * Enable grad accum fix across all models + trainer fully in forward() * handle peft case * Account for DDP: need to run scale tests * Use accelerator state * Quality * Guard * Experiment w/ only fairseq fix * Fairseq only * Revert multiply_grads fix * Mult by grad accum to fully bring back solution * Style * Good to go now * Skip fx tests for now * Bookmark * Working now --- .../models/cohere/modeling_cohere.py | 3 +- .../models/gemma/modeling_gemma.py | 3 +- .../models/gemma/modular_gemma.py | 3 +- .../models/gemma2/modeling_gemma2.py | 3 +- .../models/gemma2/modular_gemma2.py | 3 +- src/transformers/models/glm/modeling_glm.py | 3 +- .../models/jamba/modeling_jamba.py | 3 +- .../models/mixtral/modeling_mixtral.py | 3 +- .../models/mllama/modeling_mllama.py | 3 +- .../models/nemotron/modeling_nemotron.py | 3 +- src/transformers/models/olmo/modeling_olmo.py | 3 +- .../models/olmoe/modeling_olmoe.py | 3 +- src/transformers/models/phi/modeling_phi.py | 3 +- src/transformers/models/phi3/modeling_phi3.py | 3 +- .../models/phimoe/modeling_phimoe.py | 3 +- .../models/qwen2/modeling_qwen2.py | 3 +- .../models/qwen2_moe/modeling_qwen2_moe.py | 3 +- .../models/rt_detr/modeling_rt_detr.py | 2 ++ .../models/zamba/modeling_zamba.py | 3 +- src/transformers/trainer.py | 36 ++++++++++++------- tests/models/cohere/test_modeling_cohere.py | 4 +++ tests/models/mistral/test_modeling_mistral.py | 4 +++ tests/models/mixtral/test_modeling_mixtral.py | 4 +++ tests/models/qwen2/test_modeling_qwen2.py | 4 +++ .../qwen2_moe/test_modeling_qwen2_moe.py | 4 +++ 25 files changed, 81 insertions(+), 31 deletions(-) diff --git a/src/transformers/models/cohere/modeling_cohere.py b/src/transformers/models/cohere/modeling_cohere.py index 3abe6ef8644500..9aa588be431029 100644 --- a/src/transformers/models/cohere/modeling_cohere.py +++ b/src/transformers/models/cohere/modeling_cohere.py @@ -1114,6 +1114,7 @@ def forward( return_dict: Optional[bool] = None, cache_position: Optional[torch.LongTensor] = None, num_logits_to_keep: int = 0, + **loss_kwargs, ) -> Union[Tuple, CausalLMOutputWithPast]: r""" Args: @@ -1172,7 +1173,7 @@ def forward( loss = None if labels is not None: - loss = self.loss_function(logits, labels, self.vocab_size) + loss = self.loss_function(logits, labels, self.vocab_size, **loss_kwargs) if not return_dict: output = (logits,) + outputs[1:] diff --git a/src/transformers/models/gemma/modeling_gemma.py b/src/transformers/models/gemma/modeling_gemma.py index 6f364ffcf7edd8..9a4de1022c57e9 100644 --- a/src/transformers/models/gemma/modeling_gemma.py +++ 
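Relating to the one-line `agents/tools.py` change just above (#34208), a sketch of a tool whose input is now validated as a boolean — the tool itself is invented for illustration:

```python
from transformers.agents import Tool

class LampSwitch(Tool):
    name = "lamp_switch"
    description = "Turns the lamp on or off."
    inputs = {"on": {"type": "boolean", "description": "True switches the lamp on, False switches it off."}}
    output_type = "string"

    def forward(self, on: bool) -> str:
        return "lamp is on" if on else "lamp is off"

tool = LampSwitch()
tool.validate_arguments()  # "boolean" now passes the authorized-types check
print(tool(on=True))
```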
b/src/transformers/models/gemma/modeling_gemma.py @@ -1030,6 +1030,7 @@ def forward( return_dict: Optional[bool] = None, cache_position: Optional[torch.LongTensor] = None, num_logits_to_keep: int = 0, + **loss_kwargs, ) -> Union[Tuple, CausalLMOutputWithPast]: r""" Args: @@ -1087,7 +1088,7 @@ def forward( loss = None if labels is not None: - loss = self.loss_function(logits, labels, self.vocab_size) + loss = self.loss_function(logits, labels, self.vocab_size, **loss_kwargs) if not return_dict: output = (logits,) + outputs[1:] diff --git a/src/transformers/models/gemma/modular_gemma.py b/src/transformers/models/gemma/modular_gemma.py index c3d780bc571ade..807f91ff9e6baa 100644 --- a/src/transformers/models/gemma/modular_gemma.py +++ b/src/transformers/models/gemma/modular_gemma.py @@ -961,6 +961,7 @@ def forward( return_dict: Optional[bool] = None, cache_position: Optional[torch.LongTensor] = None, num_logits_to_keep: int = 0, + **loss_kwargs, ) -> Union[Tuple, CausalLMOutputWithPast]: r""" ```python @@ -1003,7 +1004,7 @@ def forward( loss = None if labels is not None: - loss = self.loss_function(logits, labels, self.vocab_size) + loss = self.loss_function(logits, labels, self.vocab_size, **loss_kwargs) if not return_dict: output = (logits,) + outputs[1:] diff --git a/src/transformers/models/gemma2/modeling_gemma2.py b/src/transformers/models/gemma2/modeling_gemma2.py index 467981bb78d025..6d61c47619f304 100644 --- a/src/transformers/models/gemma2/modeling_gemma2.py +++ b/src/transformers/models/gemma2/modeling_gemma2.py @@ -1002,6 +1002,7 @@ def forward( return_dict: Optional[bool] = None, cache_position: Optional[torch.LongTensor] = None, num_logits_to_keep: int = 0, + **loss_kwargs, ) -> Union[Tuple, CausalLMOutputWithPast]: r""" Args: @@ -1068,7 +1069,7 @@ def forward( loss = None if labels is not None: - loss = self.loss_function(logits, labels, self.vocab_size) + loss = self.loss_function(logits, labels, self.vocab_size, **loss_kwargs) if not return_dict: output = (logits,) + outputs[1:] diff --git a/src/transformers/models/gemma2/modular_gemma2.py b/src/transformers/models/gemma2/modular_gemma2.py index 49010152b81cc5..7ddb1c9f4c99e7 100644 --- a/src/transformers/models/gemma2/modular_gemma2.py +++ b/src/transformers/models/gemma2/modular_gemma2.py @@ -756,6 +756,7 @@ def forward( return_dict: Optional[bool] = None, cache_position: Optional[torch.LongTensor] = None, num_logits_to_keep: int = 0, + **loss_kwargs, ) -> Union[Tuple, CausalLMOutputWithPast]: r""" ```python @@ -807,7 +808,7 @@ def forward( loss = None if labels is not None: - loss = self.loss_function(logits, labels, self.vocab_size) + loss = self.loss_function(logits, labels, self.vocab_size, **loss_kwargs) if not return_dict: output = (logits,) + outputs[1:] diff --git a/src/transformers/models/glm/modeling_glm.py b/src/transformers/models/glm/modeling_glm.py index a458c02a6feda7..aad4da282b7878 100644 --- a/src/transformers/models/glm/modeling_glm.py +++ b/src/transformers/models/glm/modeling_glm.py @@ -1014,6 +1014,7 @@ def forward( return_dict: Optional[bool] = None, cache_position: Optional[torch.LongTensor] = None, num_logits_to_keep: int = 0, + **loss_kwargs, ) -> Union[Tuple, CausalLMOutputWithPast]: r""" Args: @@ -1071,7 +1072,7 @@ def forward( loss = None if labels is not None: - loss = self.loss_function(logits, labels, self.vocab_size) + loss = self.loss_function(logits, labels, self.vocab_size, **loss_kwargs) if not return_dict: output = (logits,) + outputs[1:] diff --git 
a/src/transformers/models/jamba/modeling_jamba.py b/src/transformers/models/jamba/modeling_jamba.py index 737be17cfc1694..32ae6ea02eba5b 100755 --- a/src/transformers/models/jamba/modeling_jamba.py +++ b/src/transformers/models/jamba/modeling_jamba.py @@ -1450,6 +1450,7 @@ def forward( return_dict: Optional[bool] = None, cache_position: Optional[torch.LongTensor] = None, num_logits_to_keep: Optional[Union[int, None]] = None, + **loss_kwargs, ) -> Union[Tuple, MoeCausalLMOutputWithPast]: r""" Args: @@ -1515,7 +1516,7 @@ def forward( loss = None if labels is not None: - loss = self.loss_function(logits, labels, self.vocab_size) + loss = self.loss_function(logits, labels, self.vocab_size, **loss_kwargs) aux_loss = None if output_router_logits: diff --git a/src/transformers/models/mixtral/modeling_mixtral.py b/src/transformers/models/mixtral/modeling_mixtral.py index f5f11ba995c802..192b7801af0575 100644 --- a/src/transformers/models/mixtral/modeling_mixtral.py +++ b/src/transformers/models/mixtral/modeling_mixtral.py @@ -1240,6 +1240,7 @@ def forward( return_dict: Optional[bool] = None, cache_position: Optional[torch.LongTensor] = None, num_logits_to_keep: int = 0, + **loss_kwargs, ) -> Union[Tuple, MoeCausalLMOutputWithPast]: r""" Args: @@ -1303,7 +1304,7 @@ def forward( loss = None if labels is not None: - loss = self.loss_function(logits, labels, self.vocab_size) + loss = self.loss_function(logits, labels, self.vocab_size, **loss_kwargs) aux_loss = None if output_router_logits: diff --git a/src/transformers/models/mllama/modeling_mllama.py b/src/transformers/models/mllama/modeling_mllama.py index c5ae615a12b5cc..8ce6150a2fa2f8 100644 --- a/src/transformers/models/mllama/modeling_mllama.py +++ b/src/transformers/models/mllama/modeling_mllama.py @@ -1887,6 +1887,7 @@ def forward( return_dict: Optional[bool] = None, cache_position: Optional[torch.LongTensor] = None, num_logits_to_keep: int = 0, + **loss_kwargs, ) -> Union[Tuple, CausalLMOutputWithPast]: r""" Args: @@ -1949,7 +1950,7 @@ def forward( loss = None if labels is not None: - loss = self.loss_function(logits, labels, self.vocab_size) + loss = self.loss_function(logits, labels, self.vocab_size, **loss_kwargs) if not return_dict: output = (logits,) + outputs[1:] diff --git a/src/transformers/models/nemotron/modeling_nemotron.py b/src/transformers/models/nemotron/modeling_nemotron.py index d5470dbbaa1904..d4eb348260c1a4 100644 --- a/src/transformers/models/nemotron/modeling_nemotron.py +++ b/src/transformers/models/nemotron/modeling_nemotron.py @@ -1028,6 +1028,7 @@ def forward( return_dict: Optional[bool] = None, cache_position: Optional[torch.LongTensor] = None, num_logits_to_keep: int = 0, + **loss_kwargs, ) -> Union[Tuple, CausalLMOutputWithPast]: r""" Args: @@ -1085,7 +1086,7 @@ def forward( loss = None if labels is not None: - loss = self.loss_function(logits, labels, self.vocab_size) + loss = self.loss_function(logits, labels, self.vocab_size, **loss_kwargs) if not return_dict: output = (logits,) + outputs[1:] diff --git a/src/transformers/models/olmo/modeling_olmo.py b/src/transformers/models/olmo/modeling_olmo.py index 6c7dc59cdbff38..60225d4759c6ab 100644 --- a/src/transformers/models/olmo/modeling_olmo.py +++ b/src/transformers/models/olmo/modeling_olmo.py @@ -1068,6 +1068,7 @@ def forward( return_dict: Optional[bool] = None, cache_position: Optional[torch.LongTensor] = None, num_logits_to_keep: int = 0, + **loss_kwargs, ) -> Union[Tuple, CausalLMOutputWithPast]: r""" Args: @@ -1126,7 +1127,7 @@ def forward( loss = None if 
labels is not None: - loss = self.loss_function(logits, labels, self.vocab_size) + loss = self.loss_function(logits, labels, self.vocab_size, **loss_kwargs) if not return_dict: output = (logits,) + outputs[1:] diff --git a/src/transformers/models/olmoe/modeling_olmoe.py b/src/transformers/models/olmoe/modeling_olmoe.py index 32f7ded42e8901..cbb8db0f59dd02 100644 --- a/src/transformers/models/olmoe/modeling_olmoe.py +++ b/src/transformers/models/olmoe/modeling_olmoe.py @@ -1228,6 +1228,7 @@ def forward( return_dict: Optional[bool] = None, cache_position: Optional[torch.LongTensor] = None, num_logits_to_keep: int = 0, + **loss_kwargs, ) -> Union[Tuple, MoeCausalLMOutputWithPast]: r""" Args: @@ -1290,7 +1291,7 @@ def forward( loss = None if labels is not None: - loss = self.loss_function(logits, labels, self.vocab_size) + loss = self.loss_function(logits, labels, self.vocab_size, **loss_kwargs) aux_loss = None if output_router_logits: diff --git a/src/transformers/models/phi/modeling_phi.py b/src/transformers/models/phi/modeling_phi.py index ef1a5b4d0ec243..4613672ff2740b 100644 --- a/src/transformers/models/phi/modeling_phi.py +++ b/src/transformers/models/phi/modeling_phi.py @@ -1192,6 +1192,7 @@ def forward( return_dict: Optional[bool] = None, cache_position: Optional[torch.LongTensor] = None, num_logits_to_keep: int = 0, + **loss_kwargs, ) -> Union[Tuple, CausalLMOutputWithPast]: r""" Args: @@ -1250,7 +1251,7 @@ def forward( loss = None if labels is not None: - loss = self.loss_function(logits, labels, self.vocab_size) + loss = self.loss_function(logits, labels, self.vocab_size, **loss_kwargs) if not return_dict: output = (logits,) + outputs[1:] diff --git a/src/transformers/models/phi3/modeling_phi3.py b/src/transformers/models/phi3/modeling_phi3.py index 16601e1f9957d5..9e638c27afa41d 100644 --- a/src/transformers/models/phi3/modeling_phi3.py +++ b/src/transformers/models/phi3/modeling_phi3.py @@ -1209,6 +1209,7 @@ def forward( return_dict: Optional[bool] = None, cache_position: Optional[torch.LongTensor] = None, num_logits_to_keep: int = 0, + **loss_kwargs, ) -> Union[Tuple, CausalLMOutputWithPast]: r""" Args: @@ -1275,7 +1276,7 @@ def forward( loss = None if labels is not None: - loss = self.loss_function(logits, labels, self.vocab_size) + loss = self.loss_function(logits, labels, self.vocab_size, **loss_kwargs) if not return_dict: output = (logits,) + outputs[1:] diff --git a/src/transformers/models/phimoe/modeling_phimoe.py b/src/transformers/models/phimoe/modeling_phimoe.py index 559daeca694dbe..791f6df50bb40f 100644 --- a/src/transformers/models/phimoe/modeling_phimoe.py +++ b/src/transformers/models/phimoe/modeling_phimoe.py @@ -1377,6 +1377,7 @@ def forward( return_dict: Optional[bool] = None, cache_position: Optional[torch.LongTensor] = None, num_logits_to_keep: int = 0, + **loss_kwargs, ) -> Union[Tuple, MoeCausalLMOutputWithPast]: r""" Args: @@ -1442,7 +1443,7 @@ def forward( loss = None if labels is not None: - loss = self.loss_function(logits, labels, self.vocab_size) + loss = self.loss_function(logits, labels, self.vocab_size, **loss_kwargs) aux_loss = None if output_router_logits: diff --git a/src/transformers/models/qwen2/modeling_qwen2.py b/src/transformers/models/qwen2/modeling_qwen2.py index a6e4d12d799b07..0d97f2ffb724a0 100644 --- a/src/transformers/models/qwen2/modeling_qwen2.py +++ b/src/transformers/models/qwen2/modeling_qwen2.py @@ -1121,6 +1121,7 @@ def forward( return_dict: Optional[bool] = None, cache_position: Optional[torch.LongTensor] = None, 
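Schematically, the `**loss_kwargs` being threaded through these forward signatures is what lets the Trainer hand the model a global token count; a standalone sketch (the checkpoint and toy batch are illustrative only):

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B")
tok.pad_token = tok.pad_token or tok.eos_token
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2-0.5B")

enc = tok(["gradient accumulation", "now normalizes the loss globally"], return_tensors="pt", padding=True)
labels = enc["input_ids"].masked_fill(enc["attention_mask"] == 0, -100)

# Roughly the number of (shifted) label tokens in the *whole* accumulated batch, so the loss
# is normalized once globally instead of once per micro-batch.
num_items_in_batch = (labels[..., 1:] != -100).sum()

out = model(**enc, labels=labels, num_items_in_batch=num_items_in_batch)  # absorbed by **loss_kwargs
print(out.loss)
```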
num_logits_to_keep: int = 0, + **loss_kwargs, ) -> Union[Tuple, CausalLMOutputWithPast]: r""" Args: @@ -1179,7 +1180,7 @@ def forward( loss = None if labels is not None: - loss = self.loss_function(logits, labels, self.vocab_size) + loss = self.loss_function(logits, labels, self.vocab_size, **loss_kwargs) if not return_dict: output = (logits,) + outputs[1:] diff --git a/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py b/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py index d482316b5b8bac..36de586265ce60 100644 --- a/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py +++ b/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py @@ -1305,6 +1305,7 @@ def forward( return_dict: Optional[bool] = None, cache_position: Optional[torch.LongTensor] = None, num_logits_to_keep: int = 0, + **loss_kwargs, ) -> Union[Tuple, MoeCausalLMOutputWithPast]: r""" Args: @@ -1367,7 +1368,7 @@ def forward( loss = None if labels is not None: - loss = self.loss_function(logits, labels, self.vocab_size) + loss = self.loss_function(logits, labels, self.vocab_size, **loss_kwargs) aux_loss = None if output_router_logits: diff --git a/src/transformers/models/rt_detr/modeling_rt_detr.py b/src/transformers/models/rt_detr/modeling_rt_detr.py index 1c09025a34b140..cae48455047ed7 100644 --- a/src/transformers/models/rt_detr/modeling_rt_detr.py +++ b/src/transformers/models/rt_detr/modeling_rt_detr.py @@ -2027,6 +2027,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, + **loss_kwargs, ) -> Union[Tuple[torch.FloatTensor], RTDetrObjectDetectionOutput]: r""" labels (`List[Dict]` of len `(batch_size,)`, *optional*): @@ -2128,6 +2129,7 @@ def forward( enc_topk_logits=enc_topk_logits, enc_topk_bboxes=enc_topk_bboxes, denoising_meta_values=denoising_meta_values, + **loss_kwargs, ) if not return_dict: diff --git a/src/transformers/models/zamba/modeling_zamba.py b/src/transformers/models/zamba/modeling_zamba.py index 921d07f287dca5..dee7f898fcf93a 100644 --- a/src/transformers/models/zamba/modeling_zamba.py +++ b/src/transformers/models/zamba/modeling_zamba.py @@ -1418,6 +1418,7 @@ def forward( return_dict: Optional[bool] = None, cache_position: Optional[torch.LongTensor] = None, num_logits_to_keep: int = 0, + **loss_kwargs, ) -> Union[Tuple, CausalLMOutputWithPast]: r""" Args: @@ -1477,7 +1478,7 @@ def forward( loss = None if labels is not None: - loss = self.loss_function(logits, labels, self.vocab_size) + loss = self.loss_function(logits, labels, self.vocab_size, **loss_kwargs) if not return_dict: output = (logits,) + outputs[1:] diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 7890e084871a1e..1b13787007e9c3 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -582,6 +582,16 @@ def __init__( self.model_wrapped = model self.model = model + # Just in case the model was wrapped outside of the `Trainer` + unwrapped_model = self.accelerator.unwrap_model(model) + model_forward = ( + unwrapped_model.forward + if not _is_peft_model(unwrapped_model) + else unwrapped_model.get_base_model().forward + ) + + self.model_accepts_loss_kwargs = "loss_kwargs" in inspect.signature(model_forward).parameters + self.neftune_noise_alpha = args.neftune_noise_alpha self.compute_metrics = compute_metrics @@ -2417,8 +2427,14 @@ def _inner_training_loop( for inputs in batch_samples: step += 1 total_batched_samples += 1 + is_last_step_and_steps_less_than_grad_acc = ( + steps_in_epoch <= 
args.gradient_accumulation_steps and (step + 1) == steps_in_epoch + ) + do_sync_step = is_last_step_and_steps_less_than_grad_acc or ( + total_batched_samples % args.gradient_accumulation_steps == 0 + ) # Since we perform prefetching, we need to manually set sync_gradients - if total_batched_samples % args.gradient_accumulation_steps != 0: + if not do_sync_step: self.accelerator.gradient_state._set_sync_gradients(False) else: self.accelerator.gradient_state._set_sync_gradients(True) @@ -2473,16 +2489,7 @@ def _inner_training_loop( self.current_flos += float(self.floating_point_ops(inputs)) - is_last_step_and_steps_less_than_grad_acc = ( - steps_in_epoch <= args.gradient_accumulation_steps and (step + 1) == steps_in_epoch - ) - - if ( - (total_batched_samples) % args.gradient_accumulation_steps == 0 - or - # last step in epoch but step is always smaller than gradient_accumulation_steps - is_last_step_and_steps_less_than_grad_acc - ): + if do_sync_step: # Since we perform prefetching, we need to manually set sync_gradients to True self.accelerator.gradient_state._set_sync_gradients(True) @@ -3610,8 +3617,11 @@ def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=N labels = inputs.pop("labels") else: labels = None - # if num_items_in_batch is not None: - # inputs["num_items_in_batch"] = num_items_in_batch + if self.model_accepts_loss_kwargs: + loss_kwargs = {} + if num_items_in_batch is not None: + loss_kwargs["num_items_in_batch"] = num_items_in_batch + inputs = {**inputs, **loss_kwargs} outputs = model(**inputs) # Save past state if it exists # TODO: this needs to be fixed and made cleaner later. diff --git a/tests/models/cohere/test_modeling_cohere.py b/tests/models/cohere/test_modeling_cohere.py index 7d12dd3d873bfc..b8a5aec9d4153a 100644 --- a/tests/models/cohere/test_modeling_cohere.py +++ b/tests/models/cohere/test_modeling_cohere.py @@ -304,6 +304,10 @@ def test_model_various_embeddings(self): config_and_inputs[0].position_embedding_type = type self.model_tester.create_and_check_model(*config_and_inputs) + @unittest.skip(reason="PR #34283 made changes to the forward function.") + def test_torch_fx_output_loss(self): + super().test_torch_fx_output_loss() + @require_bitsandbytes @require_torch_sdpa @require_torch_multi_gpu diff --git a/tests/models/mistral/test_modeling_mistral.py b/tests/models/mistral/test_modeling_mistral.py index ff7f1e87bc1972..13e5e3d1f609e9 100644 --- a/tests/models/mistral/test_modeling_mistral.py +++ b/tests/models/mistral/test_modeling_mistral.py @@ -356,6 +356,10 @@ def test_model_various_embeddings(self): config_and_inputs[0].position_embedding_type = type self.model_tester.create_and_check_model(*config_and_inputs) + @unittest.skip(reason="PR #34283 made changes to the forward function.") + def test_torch_fx_output_loss(self): + super().test_torch_fx_output_loss() + def test_Mistral_sequence_classification_model(self): config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() print(config) diff --git a/tests/models/mixtral/test_modeling_mixtral.py b/tests/models/mixtral/test_modeling_mixtral.py index 0e6b2a999e89a9..0bfb5126ebd1ca 100644 --- a/tests/models/mixtral/test_modeling_mixtral.py +++ b/tests/models/mixtral/test_modeling_mixtral.py @@ -356,6 +356,10 @@ def test_model_various_embeddings(self): config_and_inputs[0].position_embedding_type = type self.model_tester.create_and_check_model(*config_and_inputs) + @unittest.skip(reason="PR #34283 made changes to the forward function.") + def 
test_torch_fx_output_loss(self): + super().test_torch_fx_output_loss() + def test_Mixtral_sequence_classification_model(self): config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() print(config) diff --git a/tests/models/qwen2/test_modeling_qwen2.py b/tests/models/qwen2/test_modeling_qwen2.py index 1fee3192a64958..769d6caabd92f4 100644 --- a/tests/models/qwen2/test_modeling_qwen2.py +++ b/tests/models/qwen2/test_modeling_qwen2.py @@ -368,6 +368,10 @@ def test_model_various_embeddings(self): config_and_inputs[0].position_embedding_type = type self.model_tester.create_and_check_model(*config_and_inputs) + @unittest.skip(reason="PR #34283 made changes to the forward function.") + def test_torch_fx_output_loss(self): + super().test_torch_fx_output_loss() + def test_Qwen2_sequence_classification_model(self): config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() print(config) diff --git a/tests/models/qwen2_moe/test_modeling_qwen2_moe.py b/tests/models/qwen2_moe/test_modeling_qwen2_moe.py index d7b17b740f9e85..374d9472ca2793 100644 --- a/tests/models/qwen2_moe/test_modeling_qwen2_moe.py +++ b/tests/models/qwen2_moe/test_modeling_qwen2_moe.py @@ -391,6 +391,10 @@ def test_model_various_embeddings(self): config_and_inputs[0].position_embedding_type = type self.model_tester.create_and_check_model(*config_and_inputs) + @unittest.skip(reason="PR #34283 made changes to the forward function.") + def test_torch_fx_output_loss(self): + super().test_torch_fx_output_loss() + def test_Qwen2Moe_sequence_classification_model(self): config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() print(config) From c42b3223db0fc24ff9a694f19e6c78faf3ac58a1 Mon Sep 17 00:00:00 2001 From: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Date: Wed, 23 Oct 2024 17:27:51 +0200 Subject: [PATCH 085/385] skip `test_pipeline_depth_estimation` temporarily (#34316) skip Co-authored-by: ydshieh --- tests/models/glpn/test_modeling_glpn.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/models/glpn/test_modeling_glpn.py b/tests/models/glpn/test_modeling_glpn.py index 81e95ab244f9aa..254c1135357147 100644 --- a/tests/models/glpn/test_modeling_glpn.py +++ b/tests/models/glpn/test_modeling_glpn.py @@ -157,6 +157,14 @@ def setUp(self): self.model_tester = GLPNModelTester(self) self.config_tester = GLPNConfigTester(self, config_class=GLPNConfig) + @unittest.skip(reason="Failing after #32550") + def test_pipeline_depth_estimation(self): + pass + + @unittest.skip(reason="Failing after #32550") + def test_pipeline_depth_estimation_fp16(self): + pass + def test_config(self): self.config_tester.run_common_tests() From e50bf61decf741c6d59e4ba633b7392712673bda Mon Sep 17 00:00:00 2001 From: Arthur <48595927+ArthurZucker@users.noreply.github.com> Date: Wed, 23 Oct 2024 18:33:52 +0200 Subject: [PATCH 086/385] Fix red CI: benchmark script (#34351) * dont'trigger always * fux * oups * update * ?? * ? 
* aie --- .github/workflows/benchmark.yml | 12 ++--- scripts/deberta_scrtipt.py | 82 +++++++++++++++++++++++++++++++++ 2 files changed, 86 insertions(+), 8 deletions(-) create mode 100644 scripts/deberta_scrtipt.py diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index c264dfe462aae7..79f0652e192f2a 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -18,21 +18,17 @@ jobs: name: Benchmark runs-on: group: aws-g5-4xlarge-cache + if: | + (github.event_name == 'pull_request' && contains( github.event.pull_request.labels.*.name, 'run-benchmark') )|| + (github.event_name == 'push' && github.ref == 'refs/heads/main') container: image: huggingface/transformers-pytorch-gpu options: --gpus all --privileged --ipc host steps: - name: Get repo - if: github.event_name == 'pull_request' uses: actions/checkout@v4 with: - ref: ${{ github.event.pull_request.head.sha }} - - - name: Get repo - if: github.event_name == 'push' - uses: actions/checkout@v4 - with: - ref: ${{ github.sha }} + ref: ${{ github.event.pull_request.head.sha || github.sha }} - name: Install libpq-dev & psql run: | diff --git a/scripts/deberta_scrtipt.py b/scripts/deberta_scrtipt.py new file mode 100644 index 00000000000000..b910d8de3f52b5 --- /dev/null +++ b/scripts/deberta_scrtipt.py @@ -0,0 +1,82 @@ +import torch +from transformers import pipeline, AutoTokenizer, AutoModel, AutoModelForMaskedLM +import time + +test_sentence = 'Do you [MASK] the muffin man?' + +# for comparison +bert = pipeline('fill-mask', model = 'bert-base-uncased') +print('\n'.join([d['sequence'] for d in bert(test_sentence)])) + + +deberta = pipeline('fill-mask', model = 'microsoft/deberta-v3-base', model_kwargs={"legacy": False}) +print('\n'.join([d['sequence'] for d in deberta(test_sentence)])) + + +tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-base") + +tokenized_dict = tokenizer( + ["Is this working",], ["Not yet",], + return_tensors="pt" +) + +deberta.model.forward = torch.compile(deberta.model.forward) +start=time.time() +deberta.model(**tokenized_dict) +end=time.time() +print(end-start) + + +start=time.time() +deberta.model(**tokenized_dict) +end=time.time() +print(end-start) + + +start=time.time() +deberta.model(**tokenized_dict) +end=time.time() +print(end-start) + + +model = AutoModel.from_pretrained('microsoft/deberta-base') +model.config.return_dict = False +model.config.output_hidden_states=False +input_tuple = (tokenized_dict['input_ids'], tokenized_dict['attention_mask']) + + +start=time.time() +traced_model = torch.jit.trace(model, input_tuple) +end=time.time() +print(end-start) + + +start=time.time() +traced_model(tokenized_dict['input_ids'], tokenized_dict['attention_mask']) +end=time.time() +print(end-start) + + +start=time.time() +traced_model(tokenized_dict['input_ids'], tokenized_dict['attention_mask']) +end=time.time() +print(end-start) + + +start=time.time() +traced_model(tokenized_dict['input_ids'], tokenized_dict['attention_mask']) +end=time.time() +print(end-start) + + +start=time.time() +traced_model(tokenized_dict['input_ids'], tokenized_dict['attention_mask']) +end=time.time() +print(end-start) + + +torch.jit.save(traced_model, "compiled_deberta.pt") + + + +# my_script_module = torch.jit.script(model) From b0f0c61899019d316db17a493023828aa44db06d Mon Sep 17 00:00:00 2001 From: Joao Gante Date: Wed, 23 Oct 2024 21:18:52 +0100 Subject: [PATCH 087/385] Add SynthID (watermerking by Google DeepMind) (#34350) * Add SynthIDTextWatermarkLogitsProcessor * esolving 
comments. * Resolving comments. * esolving commits, * Improving SynthIDWatermark tests. * switch to PT version * detector as pretrained model + style * update training + style * rebase * Update logits_process.py * Improving SynthIDWatermark tests. * Shift detector training to wikitext negatives and stabilize with lower learning rate. * Clean up. * in for 7B * cleanup * upport python 3.8. * README and final cleanup. * HF Hub upload and initiaze. * Update requirements for synthid_text. * Adding SynthIDTextWatermarkDetector. * Detector testing. * Documentation changes. * Copyrights fix. * Fix detector api. * ironing out errors * ironing out errors * training checks * make fixup and make fix-copies * docstrings and add to docs * copyright * BC * test docstrings * move import * protect type hints * top level imports * watermarking example * direct imports * tpr fpr meaning * process_kwargs * SynthIDTextWatermarkingConfig docstring * assert -> exception * example updates * no immutable dict (cant be serialized) * pack fn * einsum equivalent * import order * fix test on gpu * add detector example --------- Co-authored-by: Sumedh Ghaisas Co-authored-by: Marc Sun Co-authored-by: sumedhghaisas2 <138781311+sumedhghaisas2@users.noreply.github.com> Co-authored-by: raushan --- docs/source/en/internal/generation_utils.md | 18 + .../source/en/main_classes/text_generation.md | 2 - .../research_projects/synthid_text/README.md | 34 ++ .../synthid_text/detector_training.py | 502 ++++++++++++++++++ .../synthid_text/requirements.txt | 5 + .../research_projects/synthid_text/utils.py | 408 ++++++++++++++ src/transformers/__init__.py | 10 + src/transformers/generation/__init__.py | 24 +- .../generation/configuration_utils.py | 221 ++++++-- src/transformers/generation/logits_process.py | 478 ++++++++++++++++- src/transformers/generation/utils.py | 11 +- src/transformers/generation/watermarking.py | 322 ++++++++++- src/transformers/utils/dummy_pt_objects.py | 35 ++ tests/generation/test_logits_process.py | 186 +++++++ tests/generation/test_utils.py | 62 ++- 15 files changed, 2238 insertions(+), 80 deletions(-) create mode 100644 examples/research_projects/synthid_text/README.md create mode 100644 examples/research_projects/synthid_text/detector_training.py create mode 100644 examples/research_projects/synthid_text/requirements.txt create mode 100644 examples/research_projects/synthid_text/utils.py diff --git a/docs/source/en/internal/generation_utils.md b/docs/source/en/internal/generation_utils.md index a81d202c6634af..946940cb019481 100644 --- a/docs/source/en/internal/generation_utils.md +++ b/docs/source/en/internal/generation_utils.md @@ -185,6 +185,9 @@ generation. 
 [[autodoc]] SuppressTokensLogitsProcessor
     - __call__
 
+[[autodoc]] SynthIDTextWatermarkLogitsProcessor
+    - __call__
+
 [[autodoc]] TemperatureLogitsWarper
     - __call__
 
@@ -418,5 +421,20 @@ A [`Constraint`] can be used to force the generation to include specific tokens
 
 ## Watermark Utils
 
+[[autodoc]] WatermarkingConfig
+    - __call__
+
 [[autodoc]] WatermarkDetector
     - __call__
+
+[[autodoc]] BayesianDetectorConfig
+    - __call__
+
+[[autodoc]] BayesianDetectorModel
+    - __call__
+
+[[autodoc]] SynthIDTextWatermarkingConfig
+    - __call__
+
+[[autodoc]] SynthIDTextWatermarkDetector
+    - __call__
diff --git a/docs/source/en/main_classes/text_generation.md b/docs/source/en/main_classes/text_generation.md
index 574e4c75a6ac8a..76a0f1381cd6bc 100644
--- a/docs/source/en/main_classes/text_generation.md
+++ b/docs/source/en/main_classes/text_generation.md
@@ -41,8 +41,6 @@ like token streaming.
   - validate
   - get_generation_mode
 
-[[autodoc]] generation.WatermarkingConfig
-
 ## GenerationMixin
 
 [[autodoc]] GenerationMixin
diff --git a/examples/research_projects/synthid_text/README.md b/examples/research_projects/synthid_text/README.md
new file mode 100644
index 00000000000000..30ab999037374d
--- /dev/null
+++ b/examples/research_projects/synthid_text/README.md
@@ -0,0 +1,34 @@
+# SynthID Text
+
+This project showcases the use of SynthID Text for watermarking LLMs. The code shown in this repo also
+demonstrates how to train the detector for detecting such watermarked text. The trained detector can be uploaded to
+a private HF Hub repo (private for security reasons) and can later be reloaded through pretrained model loading, which is also shown in this script.
+
+See our blog post: https://huggingface.co/blog/synthid-text
+
+
+## Python version
+
+You need Python 3.9 to run this example.
+
+## Installation and running
+
+Once you have installed transformers, install the requirements for this project from the requirements.txt provided in this folder.
+
+```
+pip install -r requirements.txt
+```
+
+## To run the detector training
+
+```
+python detector_training.py --model_name=google/gemma-7b-it
+```
+
+Check the script for more parameters that are tunable, and see the paper at
+https://www.nature.com/articles/s41586-024-08025-4 for more information on these parameters.
+
+## Caveat
+
+Make sure to run both the training of the detector and the detection on the same hardware
+(CPU, GPU or TPU) to get consistent results (we use deterministic randomness, which is hardware dependent).
diff --git a/examples/research_projects/synthid_text/detector_training.py b/examples/research_projects/synthid_text/detector_training.py
new file mode 100644
index 00000000000000..35d0ea22f42b23
--- /dev/null
+++ b/examples/research_projects/synthid_text/detector_training.py
@@ -0,0 +1,502 @@
+# coding=utf-8
+# Copyright 2024 Google DeepMind.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
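+# Usage sketch (illustrative only; not part of the training pipeline below). Once a
+# detector has been trained with this script and pushed to the Hub, it can be reloaded
+# and applied roughly as follows; "your-org/synthid-detector" is a placeholder repo id
+# and the classes are the same ones imported further down in this file:
+#
+#   detector_model = BayesianDetectorModel.from_pretrained("your-org/synthid-detector")
+#   logits_processor = SynthIDTextWatermarkLogitsProcessor(
+#       **detector_model.config.watermarking_config, device=detector_model.device
+#   )
+#   tokenizer = AutoTokenizer.from_pretrained(detector_model.config.model_name)
+#   detector = SynthIDTextWatermarkDetector(detector_model, logits_processor, tokenizer)
+#   watermark_probabilities = detector(generated_token_ids)
+#
+# Higher scores indicate the text is more likely watermarked; see the thresholding
+# example at the bottom of this script.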
+ +import argparse +import dataclasses +import enum +from typing import Any, Dict, List, Optional, Tuple, Union + +import numpy as np +import torch + +from transformers import ( + AutoModelForCausalLM, + AutoTokenizer, + BayesianDetectorConfig, + BayesianDetectorModel, + SynthIDTextWatermarkDetector, + SynthIDTextWatermarkingConfig, + SynthIDTextWatermarkLogitsProcessor, +) +from utils import ( + get_tokenized_uwm_outputs, + get_tokenized_wm_outputs, + process_raw_model_outputs, + update_fn_if_fpr_tpr, + upload_model_to_hf, +) + + +@enum.unique +class ValidationMetric(enum.Enum): + """Direction along the z-axis.""" + + TPR_AT_FPR = "tpr_at_fpr" + CROSS_ENTROPY = "cross_entropy" + + +@dataclasses.dataclass +class TrainingArguments: + """Training arguments pertaining to the training loop itself.""" + + eval_metric: Optional[str] = dataclasses.field( + default=ValidationMetric.TPR_AT_FPR, metadata={"help": "The evaluation metric used."} + ) + + +def train_detector( + detector: torch.nn.Module, + g_values: torch.Tensor, + mask: torch.Tensor, + watermarked: torch.Tensor, + epochs: int = 250, + learning_rate: float = 1e-3, + minibatch_size: int = 64, + seed: int = 0, + l2_weight: float = 0.0, + shuffle: bool = True, + g_values_val: Optional[torch.Tensor] = None, + mask_val: Optional[torch.Tensor] = None, + watermarked_val: Optional[torch.Tensor] = None, + verbose: bool = False, + validation_metric: ValidationMetric = ValidationMetric.TPR_AT_FPR, +) -> Tuple[Dict[str, Any], float]: + """Trains a Bayesian detector model. + + Args: + g_values: g-values of shape [num_train, seq_len, watermarking_depth]. + mask: A binary array shape [num_train, seq_len] indicating which g-values + should be used. g-values with mask value 0 are discarded. + watermarked: A binary array of shape [num_train] indicating whether the + example is watermarked (0: unwatermarked, 1: watermarked). + epochs: Number of epochs to train for. + learning_rate: Learning rate for optimizer. + minibatch_size: Minibatch size for training. Note that a minibatch + requires ~ 32 * minibatch_size * seq_len * watermarked_depth * + watermarked_depth bits of memory. + seed: Seed for parameter initialization. + l2_weight: Weight to apply to L2 regularization for delta parameters. + shuffle: Whether to shuffle before training. + g_values_val: Validation g-values of shape [num_val, seq_len, + watermarking_depth]. + mask_val: Validation mask of shape [num_val, seq_len]. + watermarked_val: Validation watermark labels of shape [num_val]. + verbose: Boolean indicating verbosity of training. If true, the loss will + be printed. Defaulted to False. + use_tpr_fpr_for_val: Whether to use TPR@FPR=1% as metric for validation. + If false, use cross entropy loss. + + Returns: + Tuple of + training_history: Training history keyed by epoch number where the + values are + dictionaries containing the loss, validation loss, and model + parameters, + keyed by + 'loss', 'val_loss', and 'params', respectively. + min_val_loss: Minimum validation loss achieved during training. 
+ """ + + # Set the random seed for reproducibility + torch.manual_seed(seed) + + # Shuffle the data if required + if shuffle: + indices = torch.randperm(len(g_values)) + g_values = g_values[indices] + mask = mask[indices] + watermarked = watermarked[indices] + + # Initialize optimizer + optimizer = torch.optim.Adam(detector.parameters(), lr=learning_rate) + history = {} + min_val_loss = float("inf") + + for epoch in range(epochs): + losses = [] + detector.train() + num_batches = len(g_values) // minibatch_size + for i in range(0, len(g_values), minibatch_size): + end = i + minibatch_size + if end > len(g_values): + break + loss_batch_weight = l2_weight / num_batches + + optimizer.zero_grad() + loss = detector( + g_values=g_values[i:end], + mask=mask[i:end], + labels=watermarked[i:end], + loss_batch_weight=loss_batch_weight, + )[1] + loss.backward() + optimizer.step() + losses.append(loss.item()) + train_loss = sum(losses) / len(losses) + + val_losses = [] + if g_values_val is not None: + detector.eval() + if validation_metric == ValidationMetric.TPR_AT_FPR: + val_loss = update_fn_if_fpr_tpr( + detector, + g_values_val, + mask_val, + watermarked_val, + minibatch_size=minibatch_size, + ) + else: + for i in range(0, len(g_values_val), minibatch_size): + end = i + minibatch_size + if end > len(g_values_val): + break + with torch.no_grad(): + v_loss = detector( + g_values=g_values_val[i:end], + mask=mask_val[i:end], + labels=watermarked_val[i:end], + loss_batch_weight=0, + )[1] + val_losses.append(v_loss.item()) + val_loss = sum(val_losses) / len(val_losses) + + # Store training history + history[epoch + 1] = {"loss": train_loss, "val_loss": val_loss} + if verbose: + if val_loss is not None: + print(f"Epoch {epoch}: loss {loss} (train), {val_loss} (val)") + else: + print(f"Epoch {epoch}: loss {loss} (train)") + + if val_loss is not None and val_loss < min_val_loss: + min_val_loss = val_loss + best_val_epoch = epoch + + if verbose: + print(f"Best val Epoch: {best_val_epoch}, min_val_loss: {min_val_loss}") + + return history, min_val_loss + + +def train_best_detector( + tokenized_wm_outputs: Union[List[np.ndarray], np.ndarray], + tokenized_uwm_outputs: Union[List[np.ndarray], np.ndarray], + logits_processor: SynthIDTextWatermarkLogitsProcessor, + tokenizer: Any, + torch_device: torch.device, + test_size: float = 0.3, + pos_truncation_length: Optional[int] = 200, + neg_truncation_length: Optional[int] = 100, + max_padded_length: int = 2300, + n_epochs: int = 50, + learning_rate: float = 2.1e-2, + l2_weights: np.ndarray = np.logspace(-3, -2, num=4), + verbose: bool = False, + validation_metric: ValidationMetric = ValidationMetric.TPR_AT_FPR, +): + """Train and return the best detector given range of hyperparameters. + + In practice, we have found that tuning pos_truncation_length, + neg_truncation_length, n_epochs, learning_rate and l2_weights can help + improve the performance of the detector. We reccommend tuning these + parameters for your data. 
+ """ + l2_weights = list(l2_weights) + + ( + train_g_values, + train_masks, + train_labels, + cv_g_values, + cv_masks, + cv_labels, + ) = process_raw_model_outputs( + logits_processor, + tokenizer, + pos_truncation_length, + neg_truncation_length, + max_padded_length, + tokenized_wm_outputs, + test_size, + tokenized_uwm_outputs, + torch_device, + ) + + best_detector = None + lowest_loss = float("inf") + val_losses = [] + for l2_weight in l2_weights: + config = BayesianDetectorConfig(watermarking_depth=len(logits_processor.keys)) + detector = BayesianDetectorModel(config).to(torch_device) + _, min_val_loss = train_detector( + detector=detector, + g_values=train_g_values, + mask=train_masks, + watermarked=train_labels, + g_values_val=cv_g_values, + mask_val=cv_masks, + watermarked_val=cv_labels, + learning_rate=learning_rate, + l2_weight=l2_weight, + epochs=n_epochs, + verbose=verbose, + validation_metric=validation_metric, + ) + val_losses.append(min_val_loss) + if min_val_loss < lowest_loss: + lowest_loss = min_val_loss + best_detector = detector + return best_detector, lowest_loss + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--model_name", + type=str, + default="google/gemma-2b-it", + help=("LM model to train the detector for."), + ) + parser.add_argument( + "--temperature", + type=float, + default=1.0, + help=("Temperature to sample from the model."), + ) + parser.add_argument( + "--top_k", + type=int, + default=40, + help=("Top K for sampling."), + ) + parser.add_argument( + "--top_p", + type=float, + default=1.0, + help=("Top P for sampling."), + ) + parser.add_argument( + "--num_negatives", + type=int, + default=10000, + help=("Number of negatives for detector training."), + ) + parser.add_argument( + "--pos_batch_size", + type=int, + default=32, + help=("Batch size of watermarked positives while sampling."), + ) + parser.add_argument( + "--num_pos_batch", + type=int, + default=313, + help=("Number of positive batches for training."), + ) + parser.add_argument( + "--generation_length", + type=int, + default=512, + help=("Generation length for sampling."), + ) + parser.add_argument( + "--save_model_to_hf_hub", + action="store_true", + help=("Whether to save the trained model HF hub. By default it will be a private repo."), + ) + parser.add_argument( + "--load_from_hf_hub", + action="store_true", + help=( + "Whether to load trained detector model from HF Hub, make sure its the model trained on the same model " + "we are loading in the script." + ), + ) + parser.add_argument( + "--hf_hub_model_name", + type=str, + default=None, + help=("HF hub model name for loading of saving the model."), + ) + parser.add_argument( + "--eval_detector_on_prompts", + action="store_true", + help=("Evaluate detector on a prompt and print probability of watermark."), + ) + + args = parser.parse_args() + model_name = args.model_name + temperature = args.temperature + top_k = args.top_k + top_p = args.top_p + num_negatives = args.num_negatives + pos_batch_size = args.pos_batch_size + num_pos_batch = args.num_pos_batch + if num_pos_batch < 10: + raise ValueError("--num_pos_batch should be greater than 10.") + generation_length = args.generation_length + save_model_to_hf_hub = args.save_model_to_hf_hub + load_from_hf_hub = args.load_from_hf_hub + repo_name = args.hf_hub_model_name + eval_detector_on_prompts = args.eval_detector_on_prompts + + NEG_BATCH_SIZE = 32 + + # Truncate outputs to this length for training. 
+ POS_TRUNCATION_LENGTH = 200 + NEG_TRUNCATION_LENGTH = 100 + # Pad trucated outputs to this length for equal shape across all batches. + MAX_PADDED_LENGTH = 1000 + + DEVICE = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu") + if DEVICE.type not in ("cuda", "tpu"): + raise ValueError("We have found the training stable on GPU and TPU, we are working on" " a fix for CPUs") + + model = None + if not load_from_hf_hub: + # Change this to make your watermark unique. Check documentation in the paper to understand the + # impact of these parameters. + DEFAULT_WATERMARKING_CONFIG = { + "ngram_len": 5, # This corresponds to H=4 context window size in the paper. + "keys": [ + 654, + 400, + 836, + 123, + 340, + 443, + 597, + 160, + 57, + 29, + 590, + 639, + 13, + 715, + 468, + 990, + 966, + 226, + 324, + 585, + 118, + 504, + 421, + 521, + 129, + 669, + 732, + 225, + 90, + 960, + ], + "sampling_table_size": 2**16, + "sampling_table_seed": 0, + "context_history_size": 1024, + } + watermark_config = SynthIDTextWatermarkingConfig(**DEFAULT_WATERMARKING_CONFIG) + + model = AutoModelForCausalLM.from_pretrained(model_name).to(DEVICE) + tokenizer = AutoTokenizer.from_pretrained(model_name) + tokenizer.pad_token = tokenizer.eos_token + + logits_processor = SynthIDTextWatermarkLogitsProcessor(**DEFAULT_WATERMARKING_CONFIG, device=DEVICE) + tokenized_wm_outputs = get_tokenized_wm_outputs( + model, + tokenizer, + watermark_config, + num_pos_batch, + pos_batch_size, + temperature, + generation_length, + top_k, + top_p, + DEVICE, + ) + tokenized_uwm_outputs = get_tokenized_uwm_outputs(num_negatives, NEG_BATCH_SIZE, tokenizer, DEVICE) + + best_detector, lowest_loss = train_best_detector( + tokenized_wm_outputs=tokenized_wm_outputs, + tokenized_uwm_outputs=tokenized_uwm_outputs, + logits_processor=logits_processor, + tokenizer=tokenizer, + torch_device=DEVICE, + test_size=0.3, + pos_truncation_length=POS_TRUNCATION_LENGTH, + neg_truncation_length=NEG_TRUNCATION_LENGTH, + max_padded_length=MAX_PADDED_LENGTH, + n_epochs=100, + learning_rate=3e-3, + l2_weights=[ + 0, + ], + verbose=True, + validation_metric=ValidationMetric.TPR_AT_FPR, + ) + else: + if repo_name is None: + raise ValueError("When loading from pretrained detector model name cannot be None.") + best_detector = BayesianDetectorModel.from_pretrained(repo_name).to(DEVICE) + + best_detector.config.set_detector_information( + model_name=model_name, watermarking_config=DEFAULT_WATERMARKING_CONFIG + ) + if save_model_to_hf_hub: + upload_model_to_hf(best_detector, repo_name) + + # Evaluate model response with the detector + if eval_detector_on_prompts: + model_name = best_detector.config.model_name + watermark_config_dict = best_detector.config.watermarking_config + logits_processor = SynthIDTextWatermarkLogitsProcessor(**watermark_config_dict, device=DEVICE) + tokenizer = AutoTokenizer.from_pretrained(model_name) + tokenizer.pad_token = tokenizer.eos_token + synthid_text_detector = SynthIDTextWatermarkDetector(best_detector, logits_processor, tokenizer) + + if model is None: + model = AutoModelForCausalLM.from_pretrained(model_name).to(DEVICE) + watermarking_config = SynthIDTextWatermarkingConfig(**watermark_config_dict) + + prompts = ["Write a essay on cats."] + inputs = tokenizer( + prompts, + return_tensors="pt", + padding=True, + ).to(DEVICE) + + _, inputs_len = inputs["input_ids"].shape + + outputs = model.generate( + **inputs, + watermarking_config=watermarking_config, + do_sample=True, + max_length=inputs_len + 
generation_length, + temperature=temperature, + top_k=40, + top_p=1.0, + ) + outputs = outputs[:, inputs_len:] + result = synthid_text_detector(outputs) + + # You should set this based on expected fpr (false positive rate) and tpr (true positive rate). + # Check our demo at HF Spaces for more info. + upper_threshold = 0.95 + lower_threshold = 0.12 + if result[0][0] > upper_threshold: + print("The text is watermarked.") + elif lower_threshold < result[0][0] < upper_threshold: + print("It is hard to determine if the text is watermarked or not.") + else: + print("The text is not watermarked.") diff --git a/examples/research_projects/synthid_text/requirements.txt b/examples/research_projects/synthid_text/requirements.txt new file mode 100644 index 00000000000000..9e40a93ee08f09 --- /dev/null +++ b/examples/research_projects/synthid_text/requirements.txt @@ -0,0 +1,5 @@ +tensorflow-datasets>=4.9.3 +torch >= 1.3 +datasets +scikit-learn +tensorflow diff --git a/examples/research_projects/synthid_text/utils.py b/examples/research_projects/synthid_text/utils.py new file mode 100644 index 00000000000000..abcb6ca2f28255 --- /dev/null +++ b/examples/research_projects/synthid_text/utils.py @@ -0,0 +1,408 @@ +# coding=utf-8 +# Copyright 2024 Google DeepMind. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import gc +from typing import Any, List, Optional, Tuple + +import datasets +import numpy as np +import tensorflow as tf +import tensorflow_datasets as tfds +import torch +import tqdm +from huggingface_hub import HfApi, create_repo +from huggingface_hub.utils import RepositoryNotFoundError +from sklearn import model_selection + +import transformers + + +def pad_to_len( + arr: torch.Tensor, + target_len: int, + left_pad: bool, + eos_token: int, + device: torch.device, +) -> torch.Tensor: + """Pad or truncate array to given length.""" + if arr.shape[1] < target_len: + shape_for_ones = list(arr.shape) + shape_for_ones[1] = target_len - shape_for_ones[1] + padded = ( + torch.ones( + shape_for_ones, + device=device, + dtype=torch.long, + ) + * eos_token + ) + if not left_pad: + arr = torch.concatenate((arr, padded), dim=1) + else: + arr = torch.concatenate((padded, arr), dim=1) + else: + arr = arr[:, :target_len] + return arr + + +def filter_and_truncate( + outputs: torch.Tensor, + truncation_length: Optional[int], + eos_token_mask: torch.Tensor, +) -> torch.Tensor: + """Filter and truncate outputs to given length. + + Args: + outputs: output tensor of shape [batch_size, output_len] + truncation_length: Length to truncate the final output. + eos_token_mask: EOS token mask of shape [batch_size, output_len] + + Returns: + output tensor of shape [batch_size, truncation_length]. 
+ """ + if truncation_length: + outputs = outputs[:, :truncation_length] + truncation_mask = torch.sum(eos_token_mask, dim=1) >= truncation_length + return outputs[truncation_mask, :] + return outputs + + +def process_outputs_for_training( + all_outputs: List[torch.Tensor], + logits_processor: transformers.generation.SynthIDTextWatermarkLogitsProcessor, + tokenizer: Any, + pos_truncation_length: Optional[int], + neg_truncation_length: Optional[int], + max_length: int, + is_cv: bool, + is_pos: bool, + torch_device: torch.device, +) -> Tuple[List[torch.Tensor], List[torch.Tensor]]: + """Process raw model outputs into format understandable by the detector. + + Args: + all_outputs: sequence of outputs of shape [batch_size, output_len]. + logits_processor: logits processor used for watermarking. + tokenizer: tokenizer used for the model. + pos_truncation_length: Length to truncate wm outputs. + neg_truncation_length: Length to truncate uwm outputs. + max_length: Length to pad truncated outputs so that all processed entries. + have same shape. + is_cv: Process given outputs for cross validation. + is_pos: Process given outputs for positives. + torch_device: torch device to use. + + Returns: + Tuple of + all_masks: list of masks of shape [batch_size, max_length]. + all_g_values: list of g_values of shape [batch_size, max_length, depth]. + """ + all_masks = [] + all_g_values = [] + for outputs in tqdm.tqdm(all_outputs): + # outputs is of shape [batch_size, output_len]. + # output_len can differ from batch to batch. + eos_token_mask = logits_processor.compute_eos_token_mask( + input_ids=outputs, + eos_token_id=tokenizer.eos_token_id, + ) + if is_pos or is_cv: + # filter with length for positives for both train and CV. + # We also filter for length when CV negatives are processed. + outputs = filter_and_truncate(outputs, pos_truncation_length, eos_token_mask) + elif not is_pos and not is_cv: + outputs = filter_and_truncate(outputs, neg_truncation_length, eos_token_mask) + + # If no filtered outputs skip this batch. + if outputs.shape[0] == 0: + continue + + # All outputs are padded to max-length with eos-tokens. + outputs = pad_to_len(outputs, max_length, False, tokenizer.eos_token_id, torch_device) + # outputs shape [num_filtered_entries, max_length] + + eos_token_mask = logits_processor.compute_eos_token_mask( + input_ids=outputs, + eos_token_id=tokenizer.eos_token_id, + ) + + context_repetition_mask = logits_processor.compute_context_repetition_mask( + input_ids=outputs, + ) + + # context_repetition_mask of shape [num_filtered_entries, max_length - + # (ngram_len - 1)]. + context_repetition_mask = pad_to_len(context_repetition_mask, max_length, True, 0, torch_device) + # We pad on left to get same max_length shape. + # context_repetition_mask of shape [num_filtered_entries, max_length]. + combined_mask = context_repetition_mask * eos_token_mask + + g_values = logits_processor.compute_g_values( + input_ids=outputs, + ) + + # g_values of shape [num_filtered_entries, max_length - (ngram_len - 1), + # depth]. + g_values = pad_to_len(g_values, max_length, True, 0, torch_device) + + # We pad on left to get same max_length shape. + # g_values of shape [num_filtered_entries, max_length, depth]. 
+ all_masks.append(combined_mask) + all_g_values.append(g_values) + return all_masks, all_g_values + + +def tpr_at_fpr(detector, detector_inputs, w_true, minibatch_size, target_fpr=0.01) -> torch.Tensor: + """Calculates true positive rate (TPR) at false positive rate (FPR)=target_fpr.""" + positive_idxs = w_true == 1 + negative_idxs = w_true == 0 + num_samples = detector_inputs[0].size(0) + + w_preds = [] + for start in range(0, num_samples, minibatch_size): + end = start + minibatch_size + detector_inputs_ = ( + detector_inputs[0][start:end], + detector_inputs[1][start:end], + ) + with torch.no_grad(): + w_pred = detector(*detector_inputs_)[0] + w_preds.append(w_pred) + + w_pred = torch.cat(w_preds, dim=0) # Concatenate predictions + positive_scores = w_pred[positive_idxs] + negative_scores = w_pred[negative_idxs] + + # Calculate the FPR threshold + # Note: percentile -> quantile + fpr_threshold = torch.quantile(negative_scores, 1 - target_fpr) + # Note: need to switch to FP32 since torch.mean doesn't work with torch.bool + return torch.mean((positive_scores >= fpr_threshold).to(dtype=torch.float32)).item() # TPR + + +def update_fn_if_fpr_tpr(detector, g_values_val, mask_val, watermarked_val, minibatch_size): + """Loss function for negative TPR@FPR=1% as the validation loss.""" + tpr_ = tpr_at_fpr( + detector=detector, + detector_inputs=(g_values_val, mask_val), + w_true=watermarked_val, + minibatch_size=minibatch_size, + ) + return -tpr_ + + +def process_raw_model_outputs( + logits_processor, + tokenizer, + pos_truncation_length, + neg_truncation_length, + max_padded_length, + tokenized_wm_outputs, + test_size, + tokenized_uwm_outputs, + torch_device, +): + # Split data into train and CV + train_wm_outputs, cv_wm_outputs = model_selection.train_test_split(tokenized_wm_outputs, test_size=test_size) + + train_uwm_outputs, cv_uwm_outputs = model_selection.train_test_split(tokenized_uwm_outputs, test_size=test_size) + + process_kwargs = { + "logits_processor": logits_processor, + "tokenizer": tokenizer, + "pos_truncation_length": pos_truncation_length, + "neg_truncation_length": neg_truncation_length, + "max_length": max_padded_length, + "torch_device": torch_device, + } + + # Process both train and CV data for training + wm_masks_train, wm_g_values_train = process_outputs_for_training( + [torch.tensor(outputs, device=torch_device, dtype=torch.long) for outputs in train_wm_outputs], + is_pos=True, + is_cv=False, + **process_kwargs, + ) + wm_masks_cv, wm_g_values_cv = process_outputs_for_training( + [torch.tensor(outputs, device=torch_device, dtype=torch.long) for outputs in cv_wm_outputs], + is_pos=True, + is_cv=True, + **process_kwargs, + ) + uwm_masks_train, uwm_g_values_train = process_outputs_for_training( + [torch.tensor(outputs, device=torch_device, dtype=torch.long) for outputs in train_uwm_outputs], + is_pos=False, + is_cv=False, + **process_kwargs, + ) + uwm_masks_cv, uwm_g_values_cv = process_outputs_for_training( + [torch.tensor(outputs, device=torch_device, dtype=torch.long) for outputs in cv_uwm_outputs], + is_pos=False, + is_cv=True, + **process_kwargs, + ) + + # We get list of data; here we concat all together to be passed to the detector. + def pack(mask, g_values): + mask = torch.cat(mask, dim=0) + g = torch.cat(g_values, dim=0) + return mask, g + + wm_masks_train, wm_g_values_train = pack(wm_masks_train, wm_g_values_train) + # Note: Use float instead of bool. 
Otherwise, the entropy calculation doesn't work + wm_labels_train = torch.ones((wm_masks_train.shape[0],), dtype=torch.float, device=torch_device) + + wm_masks_cv, wm_g_values_cv = pack(wm_masks_cv, wm_g_values_cv) + wm_labels_cv = torch.ones((wm_masks_cv.shape[0],), dtype=torch.float, device=torch_device) + + uwm_masks_train, uwm_g_values_train = pack(uwm_masks_train, uwm_g_values_train) + uwm_labels_train = torch.zeros((uwm_masks_train.shape[0],), dtype=torch.float, device=torch_device) + + uwm_masks_cv, uwm_g_values_cv = pack(uwm_masks_cv, uwm_g_values_cv) + uwm_labels_cv = torch.zeros((uwm_masks_cv.shape[0],), dtype=torch.float, device=torch_device) + + # Concat pos and negatives data together. + train_g_values = torch.cat((wm_g_values_train, uwm_g_values_train), dim=0).squeeze() + train_labels = torch.cat((wm_labels_train, uwm_labels_train), axis=0).squeeze() + train_masks = torch.cat((wm_masks_train, uwm_masks_train), axis=0).squeeze() + + cv_g_values = torch.cat((wm_g_values_cv, uwm_g_values_cv), axis=0).squeeze() + cv_labels = torch.cat((wm_labels_cv, uwm_labels_cv), axis=0).squeeze() + cv_masks = torch.cat((wm_masks_cv, uwm_masks_cv), axis=0).squeeze() + + # Shuffle data. + shuffled_idx = torch.randperm(train_g_values.shape[0]) # Use torch for GPU compatibility + + train_g_values = train_g_values[shuffled_idx] + train_labels = train_labels[shuffled_idx] + train_masks = train_masks[shuffled_idx] + + # Shuffle the cross-validation data + shuffled_idx_cv = torch.randperm(cv_g_values.shape[0]) # Use torch for GPU compatibility + cv_g_values = cv_g_values[shuffled_idx_cv] + cv_labels = cv_labels[shuffled_idx_cv] + cv_masks = cv_masks[shuffled_idx_cv] + + # Del some variables so we free up GPU memory. + del ( + wm_g_values_train, + wm_labels_train, + wm_masks_train, + wm_g_values_cv, + wm_labels_cv, + wm_masks_cv, + ) + gc.collect() + torch.cuda.empty_cache() + + return train_g_values, train_masks, train_labels, cv_g_values, cv_masks, cv_labels + + +def get_tokenized_uwm_outputs(num_negatives, neg_batch_size, tokenizer, device): + dataset, info = tfds.load("wikipedia/20230601.en", split="train", with_info=True) + dataset = dataset.take(num_negatives) + + # Convert the dataset to a DataFrame + df = tfds.as_dataframe(dataset, info) + ds = tf.data.Dataset.from_tensor_slices(dict(df)) + tf.random.set_seed(0) + ds = ds.shuffle(buffer_size=10_000) + ds = ds.batch(batch_size=neg_batch_size) + + tokenized_uwm_outputs = [] + # Pad to this length (on the right) for batching. 
+ padded_length = 1000 + for i, batch in tqdm.tqdm(enumerate(ds)): + responses = [val.decode() for val in batch["text"].numpy()] + inputs = tokenizer( + responses, + return_tensors="pt", + padding=True, + ).to(device) + inputs = inputs["input_ids"].cpu().numpy() + if inputs.shape[1] >= padded_length: + inputs = inputs[:, :padded_length] + else: + inputs = np.concatenate( + [inputs, np.ones((neg_batch_size, padded_length - inputs.shape[1])) * tokenizer.eos_token_id], axis=1 + ) + tokenized_uwm_outputs.append(inputs) + if len(tokenized_uwm_outputs) * neg_batch_size > num_negatives: + break + return tokenized_uwm_outputs + + +def get_tokenized_wm_outputs( + model, + tokenizer, + watermark_config, + num_pos_batches, + pos_batch_size, + temperature, + max_output_len, + top_k, + top_p, + device, +): + eli5_prompts = datasets.load_dataset("Pavithree/eli5") + + wm_outputs = [] + + for batch_id in tqdm.tqdm(range(num_pos_batches)): + prompts = eli5_prompts["train"]["title"][batch_id * pos_batch_size : (batch_id + 1) * pos_batch_size] + prompts = [prompt.strip('"') for prompt in prompts] + inputs = tokenizer( + prompts, + return_tensors="pt", + padding=True, + ).to(device) + _, inputs_len = inputs["input_ids"].shape + + outputs = model.generate( + **inputs, + watermarking_config=watermark_config, + do_sample=True, + max_length=inputs_len + max_output_len, + temperature=temperature, + top_k=top_k, + top_p=top_p, + ) + + wm_outputs.append(outputs[:, inputs_len:].cpu().detach()) + + del outputs, inputs, prompts + gc.collect() + + gc.collect() + torch.cuda.empty_cache() + return wm_outputs + + +def upload_model_to_hf(model, hf_repo_name: str, private: bool = True): + api = HfApi() + + # Check if the repository exists + try: + api.repo_info(repo_id=hf_repo_name, use_auth_token=True) + print(f"Repository '{hf_repo_name}' already exists.") + except RepositoryNotFoundError: + # If the repository does not exist, create it + print(f"Repository '{hf_repo_name}' not found. 
Creating it...") + create_repo(repo_id=hf_repo_name, private=private, use_auth_token=True) + print(f"Repository '{hf_repo_name}' created successfully.") + + # Push the model to the Hugging Face Hub + print(f"Uploading model to Hugging Face repo '{hf_repo_name}'...") + model.push_to_hub(repo_id=hf_repo_name, use_auth_token=True) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 7f408859c539b0..771e3e8f0ae8b8 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -1301,6 +1301,8 @@ _import_structure["generation"].extend( [ "AlternatingCodebooksLogitsProcessor", + "BayesianDetectorConfig", + "BayesianDetectorModel", "BeamScorer", "BeamSearchScorer", "ClassifierFreeGuidanceLogitsProcessor", @@ -1339,6 +1341,9 @@ "StopStringCriteria", "SuppressTokensAtBeginLogitsProcessor", "SuppressTokensLogitsProcessor", + "SynthIDTextWatermarkDetector", + "SynthIDTextWatermarkingConfig", + "SynthIDTextWatermarkLogitsProcessor", "TemperatureLogitsWarper", "TopKLogitsWarper", "TopPLogitsWarper", @@ -6213,6 +6218,8 @@ ) from .generation import ( AlternatingCodebooksLogitsProcessor, + BayesianDetectorConfig, + BayesianDetectorModel, BeamScorer, BeamSearchScorer, ClassifierFreeGuidanceLogitsProcessor, @@ -6251,6 +6258,9 @@ StopStringCriteria, SuppressTokensAtBeginLogitsProcessor, SuppressTokensLogitsProcessor, + SynthIDTextWatermarkDetector, + SynthIDTextWatermarkingConfig, + SynthIDTextWatermarkLogitsProcessor, TemperatureLogitsWarper, TopKLogitsWarper, TopPLogitsWarper, diff --git a/src/transformers/generation/__init__.py b/src/transformers/generation/__init__.py index 2bea00261951c7..b487fa3c7fe6ec 100644 --- a/src/transformers/generation/__init__.py +++ b/src/transformers/generation/__init__.py @@ -18,7 +18,13 @@ _import_structure = { - "configuration_utils": ["GenerationConfig", "GenerationMode", "WatermarkingConfig"], + "configuration_utils": [ + "BaseWatermarkingConfig", + "GenerationConfig", + "GenerationMode", + "SynthIDTextWatermarkingConfig", + "WatermarkingConfig", + ], "streamers": ["TextIteratorStreamer", "TextStreamer"], } @@ -71,6 +77,7 @@ "SequenceBiasLogitsProcessor", "SuppressTokensLogitsProcessor", "SuppressTokensAtBeginLogitsProcessor", + "SynthIDTextWatermarkLogitsProcessor", "TemperatureLogitsWarper", "TopKLogitsWarper", "TopPLogitsWarper", @@ -110,6 +117,9 @@ _import_structure["watermarking"] = [ "WatermarkDetector", "WatermarkDetectorOutput", + "BayesianDetectorModel", + "BayesianDetectorConfig", + "SynthIDTextWatermarkDetector", ] try: @@ -179,7 +189,13 @@ ] if TYPE_CHECKING: - from .configuration_utils import GenerationConfig, GenerationMode, WatermarkingConfig + from .configuration_utils import ( + BaseWatermarkingConfig, + GenerationConfig, + GenerationMode, + SynthIDTextWatermarkingConfig, + WatermarkingConfig, + ) from .streamers import TextIteratorStreamer, TextStreamer try: @@ -217,6 +233,7 @@ SequenceBiasLogitsProcessor, SuppressTokensAtBeginLogitsProcessor, SuppressTokensLogitsProcessor, + SynthIDTextWatermarkLogitsProcessor, TemperatureLogitsWarper, TopKLogitsWarper, TopPLogitsWarper, @@ -254,6 +271,9 @@ SampleEncoderDecoderOutput, ) from .watermarking import ( + BayesianDetectorConfig, + BayesianDetectorModel, + SynthIDTextWatermarkDetector, WatermarkDetector, WatermarkDetectorOutput, ) diff --git a/src/transformers/generation/configuration_utils.py b/src/transformers/generation/configuration_utils.py index 37d57248c46a17..c460a19885afc5 100644 --- a/src/transformers/generation/configuration_utils.py +++ 
b/src/transformers/generation/configuration_utils.py @@ -18,8 +18,9 @@ import json import os import warnings +from abc import ABC, abstractmethod from dataclasses import dataclass, is_dataclass -from typing import TYPE_CHECKING, Any, Dict, Optional, Union +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union from .. import __version__ from ..configuration_utils import PretrainedConfig @@ -59,6 +60,7 @@ StaticCache, StaticCacheConfig, ) + from .logits_process import SynthIDTextWatermarkLogitsProcessor, WatermarkLogitsProcessor NEEDS_CACHE_CONFIG["quantized"] = QuantizedCacheConfig NEEDS_CACHE_CONFIG["static"] = StaticCacheConfig @@ -280,23 +282,10 @@ class GenerationConfig(PushToHubMixin): low_memory (`bool`, *optional*): Switch to sequential beam search and sequential topk for contrastive search to reduce peak memory. Used with beam search and contrastive search. - watermarking_config (`WatermarkingConfig` or `dict`, *optional*): - Arguments used to watermark the model outputs by adding a small bias to randomly selected set of "green" tokens. - If passed as `Dict`, it will be converted to a `WatermarkingConfig` internally. - See [this paper](https://arxiv.org/abs/2306.04634) for more details. Accepts the following keys: - - greenlist_ratio (`float`): - Used for watermarking. The ratio of "green" tokens used to the vocabulary size. Defaults to 0.25. - - bias (`float`): - Used with watermarking. The bias added to the selected "green" tokens' logits. Defaults to 2.0. - - hashing_key (`int`): - Hahsing key used for watermarking. Defaults to 15485863 (the millionth prime). - - seeding_scheme (`str`): - Algorithm to use for watermarking. Accepts values: - - "lefthash" (default): "green" tokens selection depend on the last token (Algorithm 2 from the paper) - - "selfhash": "green" tokens selection depends on the current token itself (Algorithm 3 from the paper) - The downside of this scheme is that it considers all possible next tokens and can be slower than "lefthash". - - context_width (`int`): - The context length of previous tokens to use in seeding. Higher context length makes watermarking more robust. + watermarking_config (`BaseWatermarkingConfig` or `dict`, *optional*): + Arguments used to watermark the model outputs by adding a small bias to randomly selected set of "green" + tokens. See the docs of [`SynthIDTextWatermarkingConfig`] and [`WatermarkingConfig`] for more + details. If passed as `Dict`, it will be converted to a `WatermarkingConfig` internally. > Parameters that define the output variables of generate @@ -430,7 +419,7 @@ def __init__(self, **kwargs): watermarking_config = kwargs.pop("watermarking_config", None) if watermarking_config is None: self.watermarking_config = None - elif isinstance(watermarking_config, WatermarkingConfig): + elif isinstance(watermarking_config, BaseWatermarkingConfig): self.watermarking_config = watermarking_config else: self.watermarking_config = WatermarkingConfig.from_dict(watermarking_config) @@ -766,7 +755,15 @@ def validate(self, is_init=False): # 6. check watermarking arguments if self.watermarking_config is not None: - if not isinstance(self.watermarking_config, WatermarkingConfig): + if not ( + isinstance(self.watermarking_config, WatermarkingConfig) + or isinstance(self.watermarking_config, SynthIDTextWatermarkingConfig) + ): + warnings.warn( + "`watermarking_config` as a dict is deprecated. 
Please construct `watermarking_config` object with " + "`WatermarkingConfig` or `SynthIDTextWatermarkingConfig` class.", + FutureWarning, + ) self.watermarking_config = WatermarkingConfig.from_dict(self.watermarking_config) self.watermarking_config.validate() @@ -1287,52 +1284,20 @@ def update(self, **kwargs): @dataclass -class WatermarkingConfig: - """ - Class that holds arguments for watermark generation and should be passed into `GenerationConfig` during `generate`. - See [this paper](https://arxiv.org/abs/2306.04634) for more details on the arguments. - - Accepts the following keys: - - greenlist_ratio (`float`): - Used for watermarking. The ratio of "green" tokens used to the vocabulary size. Defaults to 0.25. - - bias (`float`): - Used with watermarking. The bias added to the selected "green" tokens' logits. Defaults to 2.0. - - hashing_key (`int`): - Hashing key used for watermarking. Defaults to 15485863 (the millionth prime). - - seeding_scheme (`str`): - Algorithm to use for watermarking. Accepts values: - - "lefthash" (default): "green" tokens selection depend on the last token (Algorithm 2 from the paper) - - "selfhash": "green" tokens selection depends on the current token itself (Algorithm 3 from the paper) - The downside of this scheme is that it considers all possible next tokens and can be slower than "lefthash". - - context_width(`int`): - The context length of previous tokens to use in seeding. Higher context length makes watermarking more robust. - """ - - def __init__( - self, - greenlist_ratio: Optional[float] = 0.25, - bias: Optional[float] = 2.0, - hashing_key: Optional[int] = 15485863, - seeding_scheme: Optional[str] = "lefthash", - context_width: Optional[int] = 1, - ): - self.greenlist_ratio = greenlist_ratio - self.bias = bias - self.hashing_key = hashing_key - self.seeding_scheme = seeding_scheme - self.context_width = context_width +class BaseWatermarkingConfig(ABC): + """Generic watermarking config""" @classmethod def from_dict(cls, config_dict, **kwargs): """ - Constructs a WatermarkingConfig instance from a dictionary of parameters. + Constructs a BaseWatermarkingConfig instance from a dictionary of parameters. Args: config_dict (Dict[str, Any]): Dictionary containing configuration parameters. **kwargs: Additional keyword arguments to override dictionary values. Returns: - WatermarkingConfig: Instance of WatermarkingConfig constructed from the dictionary. + BaseWatermarkingConfig: Instance of BaseWatermarkingConfig constructed from the dictionary. """ config = cls(**config_dict) to_remove = [] @@ -1394,6 +1359,49 @@ def update(self, **kwargs): if hasattr(self, key): setattr(self, key, value) + @abstractmethod + def validate(self): ... + + @abstractmethod + def construct_processor(self, vocab_size): ... + + +@dataclass +class WatermarkingConfig(BaseWatermarkingConfig): + """ + Class that holds arguments for watermark generation and should be passed into `GenerationConfig` during `generate`. + See [this paper](https://arxiv.org/abs/2306.04634) for more details on the arguments. + + Accepts the following keys: + - greenlist_ratio (`float`): + Used for watermarking. The ratio of "green" tokens used to the vocabulary size. Defaults to 0.25. + - bias (`float`): + Used with watermarking. The bias added to the selected "green" tokens' logits. Defaults to 2.0. + - hashing_key (`int`): + Hashing key used for watermarking. Defaults to 15485863 (the millionth prime). + - seeding_scheme (`str`): + Algorithm to use for watermarking. 
Accepts values: + - "lefthash" (default): "green" tokens selection depend on the last token (Algorithm 2 from the paper) + - "selfhash": "green" tokens selection depends on the current token itself (Algorithm 3 from the paper) + The downside of this scheme is that it considers all possible next tokens and can be slower than "lefthash". + - context_width(`int`): + The context length of previous tokens to use in seeding. Higher context length makes watermarking more robust. + """ + + def __init__( + self, + greenlist_ratio: Optional[float] = 0.25, + bias: Optional[float] = 2.0, + hashing_key: Optional[int] = 15485863, + seeding_scheme: Optional[str] = "lefthash", + context_width: Optional[int] = 1, + ): + self.greenlist_ratio = greenlist_ratio + self.bias = bias + self.hashing_key = hashing_key + self.seeding_scheme = seeding_scheme + self.context_width = context_width + def validate(self): watermark_missing_arg_msg = ( "Some of the keys in `watermarking_config` are defined incorrectly. `{key}` should be {correct_value}` " @@ -1423,3 +1431,104 @@ def validate(self): found_value=self.context_width, ), ) + + def construct_processor(self, vocab_size: int, device) -> "WatermarkLogitsProcessor": + return WatermarkLogitsProcessor( + vocab_size=vocab_size, + device=device, + greenlist_ratio=self.greenlist_ratio, + bias=self.bias, + hashing_key=self.hashing_key, + seeding_scheme=self.seeding_scheme, + context_width=self.context_width, + ) + + +@dataclass +class SynthIDTextWatermarkingConfig(BaseWatermarkingConfig): + """ + Class that holds arguments for watermark generation and should be passed into `GenerationConfig` during `generate`. + See [this paper](https://www.nature.com/articles/s41586-024-08025-4) for more details on the arguments. + + Args: + ngram_len (`int`): + Ngram length. + keys (`List[int]`): + A sequence of watermarking keys, one for each depth. + context_history_size (`int`, *optional*, defaults to 1024): + Size of the tensor to keep track of seen contexts. + sampling_table_seed (`int`, *optional*, defaults to 0): + Random seed to generate the sampling table. + sampling_table_size (`int`, *optional*, defaults to 65536): + Size of the sampling table. + skip_first_ngram_calls (`bool`, *optional*, defaults to `False`): + Whether to skip first ngram calls. + debug_mode (`bool`, optional, *optional*, defaults to `False`): + Logits are modified to uniform one got before watermarking modification is applied. This is to test the + implementation. + + Examples: + ```python + >>> from transformers import AutoModelForCausalLM, AutoTokenizer, SynthIDTextWatermarkingConfig + + >>> tokenizer = AutoTokenizer.from_pretrained('google/gemma-2-2b-it') + >>> model = AutoModelForCausalLM.from_pretrained('google/gemma-2-2b-it') + + >>> # SynthID Text configuration + >>> watermarking_config = SynthIDTextWatermarkingConfig( + ... keys=[654, 400, 836, 123, 340, 443, 597, 160, 57], + ... ngram_len=5, + ... ) + + >>> # Generation with watermarking + >>> tokenized_prompts = tokenizer(["your prompts here"]) + >>> output_sequences = model.generate( + ... **tokenized_prompts, watermarking_config=watermarking_config, do_sample=True, + ... 
) + >>> watermarked_text = tokenizer.batch_decode(output_sequences) + ``` + """ + + def __init__( + self, + ngram_len: int, + keys: List[int], + context_history_size: int = 1024, + sampling_table_seed: int = 0, + sampling_table_size: int = 2**16, + skip_first_ngram_calls: bool = False, + debug_mode: bool = False, + ): + self.ngram_len = ngram_len + self.keys = keys + self.sampling_table_size = sampling_table_size + self.sampling_table_seed = sampling_table_seed + self.context_history_size = context_history_size + self.skip_first_ngram_calls = skip_first_ngram_calls + self.debug_mode = debug_mode + + def validate(self): + watermark_missing_arg_msg = ( + "Some of the keys in `watermarking_config` are defined incorrectly. `{key}` should be {correct_value}` " + "but found {found_value}" + ) + if self.sampling_table_size > 2**24: + raise ValueError( + watermark_missing_arg_msg.format( + key="sampling_table_size", + correct_value="< 2**24", + found_value=self.sampling_table_size, + ), + ) + + def construct_processor(self, vocab_size: int, device) -> "WatermarkLogitsProcessor": + return SynthIDTextWatermarkLogitsProcessor( + ngram_len=self.ngram_len, + keys=self.keys, + sampling_table_size=self.sampling_table_size, + sampling_table_seed=self.sampling_table_seed, + context_history_size=self.context_history_size, + device=device, + skip_first_ngram_calls=self.skip_first_ngram_calls, + debug_mode=self.debug_mode, + ) diff --git a/src/transformers/generation/logits_process.py b/src/transformers/generation/logits_process.py index d88c7a17d892d4..fde95c7a85652f 100644 --- a/src/transformers/generation/logits_process.py +++ b/src/transformers/generation/logits_process.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2020 The HuggingFace Inc. team +# Copyright 2024 The HuggingFace Inc. team and Google DeepMind. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -2460,6 +2460,7 @@ def _score_rejection_sampling(self, input_seq: torch.LongTensor, scores: torch.F final_greenlist.append(greedy_predictions[i]) return torch.tensor(final_greenlist, device=input_seq.device) + @add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING) def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor: if input_ids.shape[-1] < self.context_width: logger.warning( @@ -2477,3 +2478,478 @@ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> to scores_processed[b_idx, greenlist_ids] = scores_processed[b_idx, greenlist_ids] + self.bias return scores_processed + + +class SynthIDTextWatermarkState: + """SynthID watermarking state.""" + + def __init__( + self, + batch_size: int, + ngram_len: int, + context_history_size: int, + device: torch.device, + ): + """Initializes the state. + + Args: + batch_size (`int`): Batch size. + ngram_len (`int`): Ngram length. + context_history_size (`int`): Size of the tensor to keep track of seen contexts. + device (`int`): Device to use. + """ + self.context = torch.zeros( + (batch_size, ngram_len - 1), + dtype=torch.int64, + device=device, + ) + self.context_history = torch.zeros( + (batch_size, context_history_size), + dtype=torch.int64, + device=device, + ) + self.num_calls = 0 + + +class SynthIDTextWatermarkLogitsProcessor(LogitsProcessor): + r""" + Logits processor that implements watermarking techniques for text generation models. 
+ This class facilitates the application of SynthID text watermarking, a method for embedding imperceptible signals + into generated text to aid in detecting synthetic content. It operates by subtly manipulating the probabilities of + token selection during text generation in a manner that can be reliably recovered later for verification. + + Key Features: + * **State Management:** Maintains internal state to track token sequences and generate watermarking keys + dynamically. + + * **Key Generation:** Computes hashes based on token sequences and watermarking parameters to create unique keys + for each position. + + * **G-Value Sampling:** Employs a pre-computed sampling table to sample watermarking values (g-values) based on + the generated keys. + + * **Score Adjustment:** Applies calculated g-values to modify token probabilities during generation, embedding the + watermark. + + * **Context Repetition Handling:** Incorporates logic to avoid watermarking tokens in repeated contexts, + preserving naturalness. + + * **EOS Token Masking:** Supports masking end-of-sentence tokens to prevent their inclusion in watermarking + calculations. + + * **Utility Functions:** Provides functions to compute g-values directly, check for context repetition, create + EOS token masks, and estimate expected mean g-values. + + Refer to paper url: https://www.nature.com/articles/s41586-024-08025-4 for more details around this. + + Args: + ngram_len (`int`): + Ngram length. + keys (`List[int]`): + A sequence of watermarking keys, one for each depth. + sampling_table_size (`int`): + Size of the sampling table. + sampling_table_seed (`int`): + Random seed to generate the sampling table. + context_history_size (`int`): + Size of the tensor to keep track of seen contexts. + device (`torch.device`): + Device to use. + skip_first_ngram_calls (`bool`, *optional*, defaults to `False`): + Whether to skip first ngram calls. + debug_mode (`bool`, optional, *optional*, defaults to `False`): + Logits are modified to uniform one got before watermarking modification is applied. This is to test the + implementation. + + Examples: + ```python + >>> from transformers import AutoModelForCausalLM, AutoTokenizer, SynthIDTextWatermarkingConfig + + >>> tokenizer = AutoTokenizer.from_pretrained('google/gemma-2-2b-it') + >>> model = AutoModelForCausalLM.from_pretrained('google/gemma-2-2b-it') + + >>> # SynthID Text configuration + >>> watermarking_config = SynthIDTextWatermarkingConfig( + ... keys=[654, 400, 836, 123, 340, 443, 597, 160, 57], + ... ngram_len=5, + ... ) + + >>> # Generation with watermarking + >>> tokenized_prompts = tokenizer(["your prompts here"]) + >>> output_sequences = model.generate( + ... **tokenized_prompts, watermarking_config=watermarking_config, do_sample=True, + ... ) + >>> watermarked_text = tokenizer.batch_decode(output_sequences) + ``` + """ + + def __init__( + self, + ngram_len: int, + keys: List[int], + sampling_table_size: int, + sampling_table_seed: int, + context_history_size: int, + device: torch.device, + skip_first_ngram_calls: bool = False, + debug_mode: bool = False, + ): + self.ngram_len = ngram_len + self.keys = torch.tensor(keys, device=device) + + generator = torch.Generator(device=device).manual_seed(sampling_table_seed) + # A random sampling table is pre-computed and modulo table size is applied to map from a hash of ngram keys to + # g values, this is similar to the hashtable implementation used in + # https://github.com/facebookresearch/three_bricks. 
We note that the hashing employed in this repository is
+        # different from that used to watermark the Gemini App, and hence the detectors trained based on the
+        # hashing in this repository will not transfer to text generated by the Gemini App.
+        self.sampling_table = torch.randint(
+            low=0,
+            high=2,
+            size=(sampling_table_size,),
+            generator=generator,
+            device=device,
+        )
+        self.context_history_size = context_history_size
+        self.device = device
+        self.state = None
+        self.skip_first_ngram_calls = skip_first_ngram_calls
+        self.debug_mode = debug_mode
+
+    def _init_state(self, batch_size: int):
+        """Initializes the state."""
+        self.state = SynthIDTextWatermarkState(
+            batch_size=batch_size,
+            ngram_len=self.ngram_len,
+            context_history_size=self.context_history_size,
+            device=self.device,
+        )
+
+    def update_scores(self, scores: torch.FloatTensor, g_values: torch.FloatTensor) -> torch.FloatTensor:
+        """Updates scores using the g values.
+
+        We assume that the scores are in the log space.
+        Args:
+            scores (`torch.FloatTensor`): Scores (batch_size, vocab_size).
+            g_values (`torch.FloatTensor`): G values (batch_size, vocab_size, depth).
+
+        Returns:
+            Updated scores (batch_size, vocab_size).
+        """
+        _, _, depth = g_values.shape
+
+        probs = torch.softmax(scores, dim=1)
+
+        for i in range(depth):
+            g_values_at_depth = g_values[:, :, i]
+            g_mass_at_depth = (g_values_at_depth * probs).sum(axis=1, keepdims=True)
+            probs = probs * (1 + g_values_at_depth - g_mass_at_depth)
+
+        log_probs = torch.log(probs)
+        log_probs = torch.where(torch.isfinite(log_probs), log_probs, torch.finfo(log_probs.dtype).min)
+        return log_probs
+
+    @add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING)
+    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
+        self._check_input_ids_shape(input_ids)
+        batch_size, vocab_size = scores.shape
+
+        if self.debug_mode:
+            scores = torch.ones_like(scores)
+
+        # Currently indices is just an arange to compute watermarking on the dense logits.
+        all_indices = torch.stack([torch.arange(vocab_size, device=self.device) for _ in range(batch_size)])
+
+        if self.state is None:
+            # Initialize watermarking state if it does not exist.
+            self._init_state(batch_size)
+        else:
+            # Append last input id (which is the input id added in last call) to the
+            # previous context so we have the context to be used for current
+            # watermarking.
+            self.state.context = torch.concat(
+                (self.state.context, input_ids[:, -1:]),
+                dim=1,
+            )
+            self.state.context = self.state.context[:, 1:]
+
+        if self.state is None:
+            raise ValueError("self.state can't be None! Call `self._init_state` to initialize the state.")
+
+        self.state.num_calls += 1
+
+        # Don't watermark the first ngram_len - 1 tokens if set.
+        if self.skip_first_ngram_calls and self.state.num_calls < self.ngram_len:
+            return scores
+
+        # 2. Generate random keys for each ngram key combination.
+        ngram_keys, hash_result_with_just_context = self._compute_keys(self.state.context, all_indices)
+        # ngram_keys shape [batch_size, top_k, depth]
+
+        # 3. Sample g values.
+        g_values = self.sample_g_values(ngram_keys)
+        # g_values shape [batch_size, top_k, depth]
+
+        # 4. Modify scores.
+        updated_scores = self.update_scores(scores, g_values)
+        # updated scores shape [batch_size, top_k]
+
+        # 5. Check if the current watermarking context was previously used, if yes skip watermarking.
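+        # The hash of just the (ngram_len - 1)-token context is compared against `context_history`: for contexts
+        # that were already seen, the original (unwatermarked) scores are returned so repeated contexts are not
+        # biased, and the history buffer is then updated with the current context hash.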
+ hash_result_with_just_context = hash_result_with_just_context[:, None] + is_repeated_context = (self.state.context_history == hash_result_with_just_context).any( + dim=1, + keepdim=True, + ) + self.state.context_history = torch.concat( + (hash_result_with_just_context, self.state.context_history), + dim=1, + )[:, :-1] + + updated_watermarked_scores = torch.where( + is_repeated_context, + input=scores, + other=updated_scores, + ) + return updated_watermarked_scores + + def accumulate_hash( + self, + current_hash: torch.LongTensor, + data: torch.LongTensor, + multiplier: int = 6364136223846793005, + increment: int = 1, + ) -> torch.LongTensor: + """ + Accumulate hash of data on current hash. + + Method uses adapted linear congruential generator with newlib/musl parameters. + + This function has following property - + f(x, data[T]) = f(f(x, data[:T - 1]), data[T]) + + This function expects current_hash.shape and data.shape[:-1] to + match/broadcastable. + + Args: + current_hash (`torch.LongTensor`): + (shape,) + data (`torch.LongTensor`): + (shape, tensor_len) + multiplier (`int`, optional, *optional*, defaults to 6364136223846793005): + multiplier of linear congruential generator + increment (`int`, optional, *optional*, defaults to 1): + increment of linear congruential generator + + Returns: + updated hash (shape,) + """ + for i in range(data.shape[-1]): + current_hash = torch.add(current_hash, data[..., i]) + current_hash = torch.mul(current_hash, multiplier) + current_hash = torch.add(current_hash, increment) + return current_hash + + def compute_ngram_keys(self, ngrams: torch.LongTensor) -> torch.LongTensor: + """Computes random keys for each ngram and depth. + + Args: + ngrams (`torch.LongTensor`): + Ngrams (batch_size, num_ngrams, ngram_len). + + Returns: + ngram keys (batch_size, num_ngrams, depth). + """ + if len(ngrams.shape) != 3: + raise ValueError( + "Ngrams should be of shape (batch_size, num_ngrams, ngram_len), but" f" is {ngrams.shape}" + ) + if ngrams.shape[2] != self.ngram_len: + raise ValueError( + "Ngrams should be of shape (batch_size, num_ngrams, ngram_len)," + f" where ngram_len is {self.ngram_len}, but is {ngrams.shape}" + ) + batch_size, _, _ = ngrams.shape + + hash_result = torch.ones(batch_size, device=self.device, dtype=torch.long) + # hash_result shape [batch_size,] + # ngrams shape [batch_size, num_ngrams, ngram_len] + hash_result = torch.vmap(self.accumulate_hash, in_dims=(None, 1), out_dims=1)(hash_result, ngrams) + # hash_result shape [batch_size, num_ngrams] + + keys = self.keys[None, None, :, None] + # hash_result shape [batch_size, num_ngrams] + # keys shape [1, 1, depth, 1] + hash_result = torch.vmap(self.accumulate_hash, in_dims=(None, 2), out_dims=2)(hash_result, keys) + # hash_result shape [batch_size, num_ngrams, depth] + + return hash_result + + def _compute_keys( + self, n_minus_1_grams: torch.LongTensor, indices: torch.LongTensor + ) -> Tuple[torch.LongTensor, torch.LongTensor]: + """Computes random keys for each ngram and depth. + + Args: + n_minus_1_grams (`torch.LongTensor`): + Ngrams (batch_size, ngram_len - 1). + indices (`torch.LongTensor`): + indices of the continuations (batch_size, num_indices) + + Returns: + Ngram keys (batch_size, num_indices, depth). + """ + batch_size, _ = n_minus_1_grams.shape + + hash_result = torch.ones(batch_size, device=self.device, dtype=torch.long) + # First hash n_minus_1 gram, for each batch entry we have a single + # n_minus_1 gram context. 
+ # hash_result shape [batch_size] + # n_minus_1_gram shape [batch_size, ngram_len - 1] + hash_result_with_just_context = self.accumulate_hash(hash_result, n_minus_1_grams) + # hash_result shape [batch_size,] + # Indices is of shape [batch_size, num_indices], so we make it + # [batch_size, num_indices, 1] so we can vmap over num_indices dim. + hash_result = torch.vmap(self.accumulate_hash, in_dims=(None, 1), out_dims=1)( + hash_result_with_just_context, indices[:, :, None] + ) + # hash_result shape [batch_size, num_indices] + # Basically we have a hash for each batch entry and each indices + # Now we add watermarking keys to this hash. + # keys are of shape [depth,] + # We add batch, num_indices and data dimension to this making it + # [1, 1, depth, 1]. + # So we can vmap over the depth dimension for compute_hash + keys = self.keys[None, None, :, None] + hash_result = torch.vmap(self.accumulate_hash, in_dims=(None, 2), out_dims=2)(hash_result, keys) + # hash_result shape should be [batch_size, num_indices, depth] + return hash_result, hash_result_with_just_context + + def sample_g_values(self, ngram_keys: torch.LongTensor) -> torch.LongTensor: + """ + Samples g values from Bernoulli distribution. + + It is not possible to pass random keys in a vectorized way in torch. Instead + we pre-compute a random sampling table, and use apply modulo table size to + map from ngram keys (int64) to g values. + + Args: + ngram_keys (`torch.LongTensor`): + Random keys (batch_size, num_ngrams, depth). + + Returns: + G values (batch_size, num_ngrams, depth). + """ + (sampling_table_size,) = self.sampling_table.shape + sampling_table = self.sampling_table.reshape((1, 1, sampling_table_size)) + ngram_keys = ngram_keys % sampling_table_size + return torch.take_along_dim(sampling_table, indices=ngram_keys, dim=2) + + def _check_input_ids_shape(self, input_ids: torch.LongTensor): + """Checks the shape of input ids.""" + if len(input_ids.shape) != 2: + raise ValueError("Input ids should be of shape (batch_size, input_len), but is" f" {input_ids.shape}") + + def compute_g_values(self, input_ids: torch.LongTensor) -> torch.LongTensor: + """ + Computes g values for each ngram from the given sequence of tokens. + + Args: + input_ids (`torch.LongTensor`): + Input token ids (batch_size, input_len). + + Returns: + G values (batch_size, input_len - (ngram_len - 1), depth). + """ + self._check_input_ids_shape(input_ids) + ngrams = input_ids.unfold(dimension=1, size=self.ngram_len, step=1) + ngram_keys = self.compute_ngram_keys(ngrams) + return self.sample_g_values(ngram_keys) + + def compute_context_repetition_mask(self, input_ids: torch.LongTensor) -> torch.LongTensor: + """ + Computes repetition mask. + + 0 and 1 stand for repeated and not repeated context n-1 grams respectively. + + Args: + input_ids (`torch.LongTensor`): + Input token ids (batch_size, input_len). + + Returns: + Repetitions mask (batch_size, input_len - (ngram_len - 1)). 
+ """ + self._check_input_ids_shape(input_ids) + batch_size, _ = input_ids.shape + state = SynthIDTextWatermarkState( + batch_size=batch_size, + ngram_len=self.ngram_len, + context_history_size=self.context_history_size, + device=self.device, + ) + contexts = input_ids[:, :-1].unfold( + dimension=1, + size=self.ngram_len - 1, + step=1, + ) + _, num_contexts, _ = contexts.shape + + are_repeated_contexts = [] + for i in range(num_contexts): + context = contexts[:, i, :] + hash_result = torch.ones(batch_size, device=self.device, dtype=torch.long) + context_hash = self.accumulate_hash(hash_result, context)[:, None] + is_repeated_context = (state.context_history == context_hash).any( + dim=1, + keepdim=True, + ) + are_repeated_contexts.append(is_repeated_context) + state.context_history = torch.concat( + (context_hash, state.context_history), + dim=1, + )[:, :-1] + are_repeated_contexts = torch.concat(are_repeated_contexts, dim=1) + + return torch.logical_not(are_repeated_contexts) + + def compute_eos_token_mask(self, input_ids: torch.LongTensor, eos_token_id: int) -> torch.LongTensor: + """ + Computes repetitions mask. + + 1 stands for ngrams that don't contain EOS tokens and vice versa. + + Args: + input_ids (`torch.LongTensor`): + Input token ids (batch_size, input_len). + eos_token_id (`int`): + EOS token ID. + + Returns: + EOS token mask (batch_size, input_len). + """ + self._check_input_ids_shape(input_ids) + noneos_masks = [] + all_eos_equated = input_ids == eos_token_id + for eos_equated in all_eos_equated: + nonzero_idx = torch.nonzero(eos_equated) + noneos_mask = torch.ones_like(eos_equated) + if nonzero_idx.shape[0] != 0: + noneos_mask[nonzero_idx[0][0] :] = 0 + noneos_masks.append(noneos_mask) + return torch.stack(noneos_masks, dim=0) + + def expected_mean_g_value(self, vocab_size: int, coinflip_prob: float = 0.5) -> float: + """ + Compute expected mean g-value after watermarking, assuming uniform LM dist. + + This is the theoretical expected value for single-layer watermarking. + + Args: + vocab_size (`int`): + The size of the vocabulary. + coinflip_prob arg_name (`float`, *optional*, defaults to 0.5): + Probability of 1 in boolean prf. + + Returns: + The expected mean g-value for watermarked text. 
+ """ + return coinflip_prob + coinflip_prob * (1 - coinflip_prob) * (1 - (1 / vocab_size)) diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py index c399a8a2c829c7..700ea0443f4dbd 100644 --- a/src/transformers/generation/utils.py +++ b/src/transformers/generation/utils.py @@ -92,7 +92,6 @@ TopPLogitsWarper, TypicalLogitsWarper, UnbatchedClassifierFreeGuidanceLogitsProcessor, - WatermarkLogitsProcessor, ) from .stopping_criteria import ( ConfidenceCriteria, @@ -1011,15 +1010,7 @@ def _get_logits_processor( ) if generation_config.watermarking_config is not None: processors.append( - WatermarkLogitsProcessor( - vocab_size=self.config.vocab_size, - device=device, - greenlist_ratio=generation_config.watermarking_config.greenlist_ratio, - bias=generation_config.watermarking_config.bias, - hashing_key=generation_config.watermarking_config.hashing_key, - seeding_scheme=generation_config.watermarking_config.seeding_scheme, - context_width=generation_config.watermarking_config.context_width, - ) + generation_config.watermarking_config.construct_processor(self.config.vocab_size, device) ) # TODO (joao): find a strategy to specify the order of the processors diff --git a/src/transformers/generation/watermarking.py b/src/transformers/generation/watermarking.py index e998d996ec4159..da90c03dd0da89 100644 --- a/src/transformers/generation/watermarking.py +++ b/src/transformers/generation/watermarking.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2024 The HuggingFace Inc. team +# Copyright 2024 The HuggingFace Inc. team and Google DeepMind. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,19 +16,22 @@ import collections from dataclasses import dataclass from functools import lru_cache -from typing import Dict, Optional, Union +from typing import Any, Dict, Optional, Tuple, Union import numpy as np +import torch +from torch import nn +from torch.nn import BCELoss -from ..configuration_utils import PretrainedConfig -from ..utils import is_torch_available, logging -from .configuration_utils import WatermarkingConfig +from ..modeling_utils import PreTrainedModel +from ..utils import ModelOutput, is_torch_available, logging +from .configuration_utils import PretrainedConfig, WatermarkingConfig if is_torch_available(): import torch - from .logits_process import WatermarkLogitsProcessor + from .logits_process import SynthIDTextWatermarkLogitsProcessor, WatermarkLogitsProcessor logger = logging.get_logger(__name__) @@ -237,3 +240,310 @@ def __call__( confidence=confidence, ) return prediction + + +class BayesianDetectorConfig(PretrainedConfig): + """ + This is the configuration class to store the configuration of a [`BayesianDetectorModel`]. It is used to + instantiate a Bayesian Detector model according to the specified arguments. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + watermarking_depth (`int`, *optional*): + The number of tournament layers. + base_rate (`float1`, *optional*, defaults to 0.5): + Prior probability P(w) that a text is watermarked. + """ + + def __init__(self, watermarking_depth: int = None, base_rate: float = 0.5, **kwargs): + self.watermarking_depth = watermarking_depth + self.base_rate = base_rate + # These can be set later to store information about this detector. 
+        self.model_name = None
+        self.watermarking_config = None
+
+        super().__init__(**kwargs)
+
+    def set_detector_information(self, model_name, watermarking_config):
+        self.model_name = model_name
+        self.watermarking_config = watermarking_config
+
+
+@dataclass
+class BayesianWatermarkDetectorModelOutput(ModelOutput):
+    """
+    Base class for outputs of models predicting if the text is watermarked.
+
+    Args:
+        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+            Detector loss: binary cross-entropy of the predicted posterior probabilities against `labels`, plus an
+            L2 regularization term on the watermarked-likelihood weights.
+        posterior_probabilities (`torch.FloatTensor` of shape `(batch_size,)`):
+            Posterior probabilities P(watermarked|g_values) that the input texts are watermarked.
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    posterior_probabilities: Optional[torch.FloatTensor] = None
+
+
+class BayesianDetectorWatermarkedLikelihood(nn.Module):
+    """Watermarked likelihood model for binary-valued g-values.
+
+    This takes in g-values and returns p(g_values|watermarked).
+    """
+
+    def __init__(self, watermarking_depth: int):
+        """Initializes the model parameters."""
+        super().__init__()
+        self.watermarking_depth = watermarking_depth
+        self.beta = torch.nn.Parameter(-2.5 + 0.001 * torch.randn(1, 1, watermarking_depth))
+        self.delta = torch.nn.Parameter(0.001 * torch.randn(1, 1, self.watermarking_depth, watermarking_depth))
+
+    def _compute_latents(self, g_values: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+        """Computes the unique token probability distribution given g-values.
+
+        Args:
+            g_values (`torch.Tensor` of shape `(batch_size, seq_len, watermarking_depth)`):
+                PRF values.
+
+        Returns:
+            p_one_unique_token and p_two_unique_tokens, both of shape
+            [batch_size, seq_len, watermarking_depth]. p_one_unique_token[i,t,l]
+            gives the probability of there being one unique token in a tournament
+            match on layer l, on timestep t, for batch item i.
+            p_one_unique_token[i,t,l] + p_two_unique_tokens[i,t,l] = 1.
+        """
+        # Tile g-values to produce feature vectors for predicting the latents
+        # for each layer in the tournament; our model for the latents psi is a
+        # logistic regression model psi = sigmoid(delta * x + beta).
+
+        # [batch_size, seq_len, watermarking_depth, watermarking_depth]
+        x = torch.repeat_interleave(torch.unsqueeze(g_values, dim=-2), self.watermarking_depth, axis=-2)
+
+        # mask all elements above -1 diagonal for autoregressive factorization
+        x = torch.tril(x, diagonal=-1)
+
+        # [batch_size, seq_len, watermarking_depth]
+        # (i, j, k, l) x (i, j, k, l) -> (i, j, k) einsum equivalent
+        logits = (self.delta[..., None, :] @ x.type(self.delta.dtype)[..., None]).squeeze() + self.beta
+
+        p_two_unique_tokens = torch.sigmoid(logits)
+        p_one_unique_token = 1 - p_two_unique_tokens
+        return p_one_unique_token, p_two_unique_tokens
+
+    def forward(self, g_values: torch.Tensor) -> torch.Tensor:
+        """Computes the likelihoods P(g_values|watermarked).
+
+        Args:
+            g_values (`torch.Tensor` of shape `(batch_size, seq_len, watermarking_depth)`):
+                g-values (values 0 or 1)
+
+        Returns:
+            p(g_values|watermarked) of shape [batch_size, seq_len, watermarking_depth].
+        """
+        p_one_unique_token, p_two_unique_tokens = self._compute_latents(g_values)
+
+        # P(g_tl | watermarked) is equal to
+        # 0.5 * [ (g_tl+0.5) * p_two_unique_tokens + p_one_unique_token].
+        return 0.5 * ((g_values + 0.5) * p_two_unique_tokens + p_one_unique_token)
+
+
+class BayesianDetectorModel(PreTrainedModel):
+    r"""
+    Bayesian classifier for watermark detection.
+ + This detector uses Bayes' rule to compute a watermarking score, which is the sigmoid of the log of ratio of the + posterior probabilities P(watermarked|g_values) and P(unwatermarked|g_values). Please see the section on + BayesianScore in the paper for further details. + Paper URL: https://www.nature.com/articles/s41586-024-08025-4 + + Note that this detector only works with non-distortionary Tournament-based watermarking using the Bernoulli(0.5) + g-value distribution. + + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`BayesianDetectorConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. + """ + + config_class = BayesianDetectorConfig + base_model_prefix = "model" + + def __init__(self, config): + super().__init__(config) + + self.watermarking_depth = config.watermarking_depth + self.base_rate = config.base_rate + self.likelihood_model_watermarked = BayesianDetectorWatermarkedLikelihood( + watermarking_depth=self.watermarking_depth + ) + self.prior = torch.nn.Parameter(torch.tensor([self.base_rate])) + + def _init_weights(self, module): + """Initialize the weights.""" + if isinstance(module, nn.Parameter): + module.weight.data.normal_(mean=0.0, std=0.02) + + def _compute_posterior( + self, + likelihoods_watermarked: torch.Tensor, + likelihoods_unwatermarked: torch.Tensor, + mask: torch.Tensor, + prior: float, + ) -> torch.Tensor: + """ + Compute posterior P(w|g) given likelihoods, mask and prior. + + Args: + likelihoods_watermarked (`torch.Tensor` of shape `(batch, length, depth)`): + Likelihoods P(g_values|watermarked) of g-values under watermarked model. + likelihoods_unwatermarked (`torch.Tensor` of shape `(batch, length, depth)`): + Likelihoods P(g_values|unwatermarked) of g-values under unwatermarked model. + mask (`torch.Tensor` of shape `(batch, length)`): + A binary array indicating which g-values should be used. g-values with mask value 0 are discarded. + prior (`float`): + the prior probability P(w) that the text is watermarked. + + Returns: + Posterior probability P(watermarked|g_values), shape [batch]. + """ + mask = torch.unsqueeze(mask, dim=-1) + prior = torch.clamp(prior, min=1e-5, max=1 - 1e-5) + log_likelihoods_watermarked = torch.log(torch.clamp(likelihoods_watermarked, min=1e-30, max=float("inf"))) + log_likelihoods_unwatermarked = torch.log(torch.clamp(likelihoods_unwatermarked, min=1e-30, max=float("inf"))) + log_odds = log_likelihoods_watermarked - log_likelihoods_unwatermarked + + # Sum relative surprisals (log odds) across all token positions and layers. + relative_surprisal_likelihood = torch.einsum("i...->i", log_odds * mask) + + # Compute the relative surprisal prior + relative_surprisal_prior = torch.log(prior) - torch.log(1 - prior) + + # Combine prior and likelihood. 
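+        # Bayes' rule in log-odds form: the posterior log-odds log[P(w|g) / P(not w|g)] equal the prior log-odds
+        # plus the summed log-likelihood ratios, so taking a sigmoid of this sum recovers P(watermarked|g).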
+ # [batch_size] + relative_surprisal = relative_surprisal_prior + relative_surprisal_likelihood + + # Compute the posterior probability P(w|g) = sigmoid(relative_surprisal). + return torch.sigmoid(relative_surprisal) + + def forward( + self, + g_values: torch.Tensor, + mask: torch.Tensor, + labels: Optional[torch.Tensor] = None, + loss_batch_weight=1, + return_dict=False, + ) -> BayesianWatermarkDetectorModelOutput: + """ + Computes the watermarked posterior P(watermarked|g_values). + + Args: + g_values (`torch.Tensor` of shape `(batch_size, seq_len, watermarking_depth, ...)`): + g-values (with values 0 or 1) + mask: + A binary array shape [batch_size, seq_len] indicating which g-values should be used. g-values with mask + value 0 are discarded. + + Returns: + p(watermarked | g_values), of shape [batch_size]. + """ + + likelihoods_watermarked = self.likelihood_model_watermarked(g_values) + likelihoods_unwatermarked = 0.5 * torch.ones_like(g_values) + out = self._compute_posterior( + likelihoods_watermarked=likelihoods_watermarked, + likelihoods_unwatermarked=likelihoods_unwatermarked, + mask=mask, + prior=self.prior, + ) + + loss = None + if labels is not None: + loss_fct = BCELoss() + loss_unwweight = torch.sum(self.likelihood_model_watermarked.delta**2) + loss_weight = loss_unwweight * loss_batch_weight + loss = loss_fct(torch.clamp(out, 1e-5, 1 - 1e-5), labels) + loss_weight + + if not return_dict: + return (out,) if loss is None else (out, loss) + + return BayesianWatermarkDetectorModelOutput(loss=loss, posterior_probabilities=out) + + +class SynthIDTextWatermarkDetector: + r""" + SynthID text watermark detector class. + + This class has to be initialized with the trained bayesian detector module check script + in examples/synthid_text/detector_training.py for example in training/saving/loading this + detector module. The folder also showcases example use case of this detector. + + Parameters: + detector_module ([`BayesianDetectorModel`]): + Bayesian detector module object initialized with parameters. + Check examples/research_projects/synthid_text/detector_training.py for usage. + logits_processor (`SynthIDTextWatermarkLogitsProcessor`): + The logits processor used for watermarking. + tokenizer (`Any`): + The tokenizer used for the model. + + Examples: + ```python + >>> from transformers import ( + ... AutoTokenizer, BayesianDetectorModel, SynthIDTextWatermarkLogitsProcessor, SynthIDTextWatermarkDetector + ... ) + + >>> # Load the detector. See examples/research_projects/synthid_text for training a detector. + >>> detector_model = BayesianDetectorModel.from_pretrained("joaogante/dummy_synthid_detector") + >>> logits_processor = SynthIDTextWatermarkLogitsProcessor( + ... **detector_model.config.watermarking_config, device="cpu" + ... 
) + >>> tokenizer = AutoTokenizer.from_pretrained(detector_model.config.model_name) + >>> detector = SynthIDTextWatermarkDetector(detector_model, logits_processor, tokenizer) + + >>> # Test whether a certain string is watermarked + >>> test_input = tokenizer(["This is a test input"], return_tensors="pt") + >>> is_watermarked = detector(test_input.input_ids) + ``` + """ + + def __init__( + self, + detector_module: BayesianDetectorModel, + logits_processor: SynthIDTextWatermarkLogitsProcessor, + tokenizer: Any, + ): + self.detector_module = detector_module + self.logits_processor = logits_processor + self.tokenizer = tokenizer + + def __call__(self, tokenized_outputs: torch.Tensor): + # eos mask is computed, skip first ngram_len - 1 tokens + # eos_mask will be of shape [batch_size, output_len] + eos_token_mask = self.logits_processor.compute_eos_token_mask( + input_ids=tokenized_outputs, + eos_token_id=self.tokenizer.eos_token_id, + )[:, self.logits_processor.ngram_len - 1 :] + + # context repetition mask is computed + context_repetition_mask = self.logits_processor.compute_context_repetition_mask( + input_ids=tokenized_outputs, + ) + # context repitition mask shape [batch_size, output_len - (ngram_len - 1)] + + combined_mask = context_repetition_mask * eos_token_mask + + g_values = self.logits_processor.compute_g_values( + input_ids=tokenized_outputs, + ) + # g values shape [batch_size, output_len - (ngram_len - 1), depth] + return self.detector_module(g_values, combined_mask) diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index e109ea659c74e0..36e1ff2cfe65c4 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -191,6 +191,20 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +class BayesianDetectorConfig(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class BayesianDetectorModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + class BeamScorer(metaclass=DummyObject): _backends = ["torch"] @@ -457,6 +471,27 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +class SynthIDTextWatermarkDetector(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class SynthIDTextWatermarkingConfig(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class SynthIDTextWatermarkLogitsProcessor(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + class TemperatureLogitsWarper(metaclass=DummyObject): _backends = ["torch"] diff --git a/tests/generation/test_logits_process.py b/tests/generation/test_logits_process.py index a5d3ab37efa51e..aeebb5c4c53d24 100644 --- a/tests/generation/test_logits_process.py +++ b/tests/generation/test_logits_process.py @@ -16,6 +16,7 @@ import unittest from typing import List, Union +import numpy as np from parameterized import parameterized from transformers import is_torch_available @@ -48,6 +49,7 @@ PrefixConstrainedLogitsProcessor, RepetitionPenaltyLogitsProcessor, SequenceBiasLogitsProcessor, + SynthIDTextWatermarkLogitsProcessor, TemperatureLogitsWarper, TopKLogitsWarper, TopPLogitsWarper, @@ -975,3 +977,187 @@ def 
test_watermarking_processor(self): scores_wo_bias = scores[:, -1].clone() out = watermark(input_ids=input_ids, scores=scores) self.assertTrue((out[:, 1] == scores_wo_bias + watermark.bias).all()) + + @parameterized.expand([(5, 3, 10000), (10, 5, 1000)]) + def test_synthidtext_watermarking_processor_bias_uniformity(self, ngram_len, num_layers, vocab_size): + """Test SynthID watermarked distribution bias uniformity over iterations.""" + torch.manual_seed(0) + np.random.seed(0) + watermarking_config = { + "ngram_len": ngram_len, + "keys": np.random.randint(low=0, high=2**16, size=(num_layers,)), + "sampling_table_size": 2**16, + "sampling_table_seed": 0, + "context_history_size": 512, + "device": torch_device, + } + batch_size = 100000 + ngrams = torch.randint( + low=0, + high=vocab_size, + size=(batch_size, ngram_len), + device=torch_device, + ) + + logits_processor = SynthIDTextWatermarkLogitsProcessor(**watermarking_config) + g_values = logits_processor.compute_g_values(ngrams) + g_values_mean = torch.mean(torch.mean(g_values.float(), dim=0)) + self.assertAlmostEqual(g_values_mean, 0.5, delta=0.01) + + @parameterized.expand([(10000, 3), (1000, 20)]) + def test_synthidtext_watermark_processor_bias_uniformity_across_vocab(self, vocab_size, num_layers): + """Test SynthID watermarked distribution bias uniformity over vocabs of the model.""" + batch_size = 1000 + ngram_len = 5 + torch.manual_seed(0) + np.random.seed(0) + watermarking_config = { + "ngram_len": ngram_len, + "keys": np.random.randint(low=0, high=2**16, size=(num_layers,)), + "sampling_table_size": 2**16, + "sampling_table_seed": 0, + "context_history_size": 512, + "device": torch_device, + } + n_minus_1_grams = torch.randint( + low=0, + high=vocab_size, + size=(batch_size, watermarking_config["ngram_len"] - 1), + device=torch_device, + ) + + logits_processor = SynthIDTextWatermarkLogitsProcessor(**watermarking_config) + ngram_keys, _ = logits_processor._compute_keys( + n_minus_1_grams, + torch.stack([torch.arange(vocab_size, device=torch_device) for _ in range(batch_size)]), + ) + + g_values = logits_processor.sample_g_values(ngram_keys) + # g_values shape should be [batch_size, vocab_size, num_layers] + g_values_mean = torch.mean(torch.mean(g_values.float(), dim=1)) + self.assertAlmostEqual(g_values_mean, 0.5, delta=0.001) + + @parameterized.expand([(2, "uniform"), (10, "uniform"), (2, "random"), (10, "random")]) + def test_synthidtext_watermark_processor_distributional_convergence(self, vocab_size, logits_type): + """Check if watermarked distribution converges to unwatermarked logits distribution.""" + batch_size = 1500 + num_keys = 1000 + + updated_softmaxes = 0 + np.random.seed(0) + torch.manual_seed(0) + if logits_type == "uniform": + fixed_logits = torch.ones((batch_size, vocab_size), device=torch_device) + elif logits_type == "random": + fixed_logits = torch.rand( + ( + 1, + vocab_size, + ), + device=torch_device, + ) + fixed_logits = fixed_logits.repeat(batch_size, 1) + else: + raise ValueError(f"Unrecognized logits_type {logits_type}") + for _ in range(num_keys): + watermarking_config = { + "ngram_len": 5, + "keys": np.random.randint(0, 10**9, size=(1,), dtype=np.int64), + "sampling_table_size": 2**16, + "sampling_table_seed": 0, + "context_history_size": 1024, + "device": torch_device, + } + + logits_processor = SynthIDTextWatermarkLogitsProcessor(**watermarking_config) + + ngrams = torch.randint( + low=0, + high=vocab_size, + size=(batch_size, watermarking_config["ngram_len"]), + device=torch_device, + ) + + # Insert 
ngram-1 into logit_processor state. + for idx in range(watermarking_config["ngram_len"] - 1): + _ = logits_processor(ngrams[:, :idx], fixed_logits) + + updated_scores = logits_processor(ngrams, fixed_logits) + updated_softmaxes += torch.nn.functional.softmax(updated_scores, dim=1).cpu().numpy() + + updated_softmaxes = np.mean(updated_softmaxes, axis=0) / num_keys + is_close = torch.all( + torch.isclose( + torch.tensor(updated_softmaxes, device=torch_device), + torch.nn.Softmax()(fixed_logits[0]), # Take any batch entry, all are same. + atol=1e-3, + rtol=0, + ) + ) + self.assertTrue(is_close) + + @parameterized.expand([(2, 10, 1, 0.01), (100, 5, 1, 0.01), (100, 10, 2, 0.02)]) + def test_synthidtext_watermark_processor_bias_test(self, vocab_size, ngram_len, num_layers, atol): + """Test SynthID watermarking bias matches theoretical value.""" + batch_size = 20000 + generator = torch.Generator(device=torch_device).manual_seed(0) + np.random.seed(0) + + keys = [np.random.randint(0, 10**9) for _ in range(num_layers)] + # Use 10**9 rather than vocab_size to ensure variety in (n-1)-grams. + context = torch.randint( + low=0, + high=10**9, + size=(batch_size, ngram_len - 1), + dtype=torch.int64, + generator=generator, + device=torch_device, + ) + + context_history_size = 1024 + logits_processor = SynthIDTextWatermarkLogitsProcessor( + ngram_len=ngram_len, + keys=keys, + sampling_table_size=2**16, + sampling_table_seed=0, + context_history_size=context_history_size, + device=torch_device, + ) + + scores = torch.ones( + (batch_size, vocab_size), + dtype=torch.float64, + device=torch_device, + ) + # Init state of the logits processor. + logits_processor(context, scores) + # insert context into the state. + for idx in range(1, ngram_len - 1): + _ = logits_processor(context[:, :idx], scores) + + updated_scores = logits_processor(context, scores) + + probs = torch.nn.functional.softmax(updated_scores, dim=1) + generator = torch.Generator(device=torch_device).manual_seed(0) + next_tokens = torch.multinomial( + probs, + num_samples=1, + generator=generator, + ) + + ngrams = torch.concat((context, next_tokens), dim=1) + g_values = logits_processor.compute_g_values(ngrams) + mean_g_values = g_values.mean(dtype=torch.float64, dim=(0, 1)) + + expected_mean_g_value = logits_processor.expected_mean_g_value( + vocab_size=vocab_size, + ) + is_close = torch.all( + torch.isclose( + mean_g_values, + torch.tensor(expected_mean_g_value, dtype=torch.float64, device=torch_device), + atol=atol, + rtol=0, + ) + ) + self.assertTrue(is_close) diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py index 996d95eb80ff9b..4e5d8f30265995 100644 --- a/tests/generation/test_utils.py +++ b/tests/generation/test_utils.py @@ -84,6 +84,7 @@ SampleEncoderDecoderOutput, StoppingCriteria, StoppingCriteriaList, + SynthIDTextWatermarkingConfig, WatermarkDetector, WatermarkingConfig, ) @@ -2517,9 +2518,9 @@ def test_beam_search_low_memory(self): self.assertListEqual(low_output.tolist(), high_output.tolist()) @slow - def test_watermark_generation(self): - tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2") - model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2").to(torch_device) + def test_green_red_watermark_generation(self): + model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2").to(torch_device) + tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2") tokenizer.pad_token_id = tokenizer.eos_token_id model_inputs = tokenizer("I will 
be", return_tensors="pt").to(torch_device) input_len = model_inputs["input_ids"].shape[-1] @@ -2548,6 +2549,61 @@ def test_watermark_generation(self): self.assertListEqual(detection_out_watermarked.prediction.tolist(), [True]) self.assertListEqual(detection_out.prediction.tolist(), [False]) + """Check the mean bias inserted by the watermarking algorithm.""" + + @slow + def test_synthid_text_watermark_generation_mean_expected_bias(self): + model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2").to(torch_device) + tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2") + tokenizer.pad_token_id = tokenizer.eos_token_id + model_inputs = tokenizer("I will be", return_tensors="pt").to(torch_device) + input_len = 5 + batch_size = 200 + + # generation should work with both input types: WatermarkingConfig or Dict, so let's check it here :) + watermark_config = SynthIDTextWatermarkingConfig(keys=[10, 20], ngram_len=5, debug_mode=True) + logits_processor = watermark_config.construct_processor(model.config.vocab_size, torch_device) + mean_g_values_repeats = [] + for _ in range(40): + input_ids = torch.zeros( + (batch_size, input_len), + dtype=torch.int64, + device=torch_device, + ) + model_inputs = { + "input_ids": input_ids, + "attention_mask": torch.ones_like(input_ids, device=torch_device), + } + output = model.generate( + **model_inputs, watermarking_config=watermark_config, do_sample=True, max_length=500, top_k=1000 + ) + g_values = logits_processor.compute_g_values(input_ids=output[:, input_len:]) + context_repetition_mask = logits_processor.compute_context_repetition_mask( + input_ids=output[:, input_len:], + ).unsqueeze(dim=2) + + mean_g_values = torch.masked.mean( + g_values, + mask=context_repetition_mask, + dim=0, + keepdim=True, + dtype=torch.float64, + ) + mean_g_values_repeats.append(mean_g_values) + + mean_g_values = torch.concat(mean_g_values_repeats, dim=0).mean(dim=0) + expected_mean_g_value = logits_processor.expected_mean_g_value( + vocab_size=model.config.vocab_size, + ) + atol = 0.03 + is_close = torch.isclose( + mean_g_values, + torch.tensor(expected_mean_g_value, dtype=torch.float64), + atol=atol, + rtol=0, + ) + self.assertTrue(torch.all(is_close)) + @slow def test_beam_search_example_integration(self): # PT-only test: TF doesn't have a BeamSearchScorer From 65753d6065e4d6e79199c923494edbf0d6248fb1 Mon Sep 17 00:00:00 2001 From: Abhishek Maurya <124327945+Abhishek-TAMU@users.noreply.github.com> Date: Thu, 24 Oct 2024 05:02:54 -0400 Subject: [PATCH 088/385] Remove graph breaks for torch.compile() in flash_attention_forward when Lllama Model is padding free tuned (#33932) * fix: fixes for graph breaks Signed-off-by: Abhishek * fix: formatting Signed-off-by: Abhishek * fix: import error Signed-off-by: Abhishek * fix: Add Fa2Kwargs Signed-off-by: Abhishek * fix: PR Changes Signed-off-by: Abhishek * PR changes Signed-off-by: Abhishek * PR changes Signed-off-by: Abhishek * PR changes Signed-off-by: Abhishek * PR changes Signed-off-by: Abhishek * Revert "PR changes" This reverts commit 39d2868e5c93cc5f3f3c7c6ff981b66614c0e0e4. 
* PR changes Signed-off-by: Abhishek * fix: FlashAttentionKwarg Signed-off-by: Abhishek * fix: FlashAttentionKwarg Signed-off-by: Abhishek * PR Changes Signed-off-by: Abhishek * PR Changes Signed-off-by: Abhishek * PR Changes Signed-off-by: Abhishek * PR Changes Signed-off-by: Abhishek * PR Changes Signed-off-by: Abhishek * addition of documentation Signed-off-by: Abhishek * change in _flash_attention_forward Signed-off-by: Abhishek * make fix-copies Signed-off-by: Abhishek * revert make fix-copies Signed-off-by: Abhishek * fix copies * style * loss kwargs typing * style and pull latest changes --------- Signed-off-by: Abhishek Co-authored-by: Arthur Zucker --- docs/source/en/llm_optims.md | 93 +++++++++++++++++++ .../modeling_flash_attention_utils.py | 65 ++++++++++--- .../models/cohere/modeling_cohere.py | 4 + src/transformers/models/glm/modeling_glm.py | 14 ++- src/transformers/models/glm/modular_glm.py | 2 + .../models/llama/modeling_llama.py | 16 +++- src/transformers/tokenization_utils_base.py | 2 +- src/transformers/utils/__init__.py | 1 + src/transformers/utils/generic.py | 15 ++- 9 files changed, 192 insertions(+), 20 deletions(-) diff --git a/docs/source/en/llm_optims.md b/docs/source/en/llm_optims.md index 16be638498dfd4..0a6a7e15bea081 100644 --- a/docs/source/en/llm_optims.md +++ b/docs/source/en/llm_optims.md @@ -348,6 +348,99 @@ model = AutoModelForCausalLM.from_pretrained( ) ``` +### Fine-Tuning with torch.compile and Padding-Free Data Collation + +In addition to optimizing inference, you can also enhance the training efficiency of large language models by leveraging torch.compile during fine-tuning and using a padding-free data collator. This approach can significantly speed up training and reduce computational overhead. + +Here's how you can fine-tune a Llama model using SFTTrainer from the TRL library, with torch_compile enabled and a padding-free data collator: + +``` +#################### IMPORTS ################### + +import math +import datasets +import dataclasses +from transformers import ( + AutoModelForCausalLM, + AutoTokenizer, + TrainingArguments +) +from trl import SFTConfig, SFTTrainer, DataCollatorForCompletionOnlyLM + +#################### MODEL LOADING WITH FLASH ATTENTION ################### + +model_name = "meta-llama/Llama-3.2-1B" +model = AutoModelForCausalLM.from_pretrained( + model_name, + attn_implementation="flash_attention_2" # Enables FlashAttention-2 +) +tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True) + +#################### DATA PREPROCESSING (PADDING-FREE) ################### + +response_template = "\n### Label:" +response_template_ids = tokenizer.encode( + response_template, add_special_tokens=False +)[2:] # Exclude special tokens + +data_collator = DataCollatorForCompletionOnlyLM( + response_template_ids=response_template_ids, + tokenizer=tokenizer, + ignore_index=-100, + padding_free=True # Enables padding-free collation +) + +def format_dataset(example): + return { + "output": example["output"] + tokenizer.eos_token + } + +data_files = {"train": "path/to/dataset"} # Replace with your dataset path +json_dataset = datasets.load_dataset("json", data_files=data_files) +formatted_train_dataset = json_dataset["train"].map(format_dataset) + +################# TRAINING CONFIGURATION ############################ + +train_args = TrainingArguments( + num_train_epochs=5, + per_device_train_batch_size=4, + per_device_eval_batch_size=4, + gradient_accumulation_steps=4, + learning_rate=1e-5, + weight_decay=0.0, + warmup_ratio=0.03, + 
lr_scheduler_type="cosine", + logging_steps=1, + include_tokens_per_second=True, + save_strategy="epoch", + output_dir="output", + torch_compile=True, # Enables torch.compile + torch_compile_backend="inductor", + torch_compile_mode="default" +) + +# Convert TrainingArguments to SFTConfig +transformer_train_arg_fields = [x.name for x in dataclasses.fields(SFTConfig)] +transformer_kwargs = { + k: v + for k, v in train_args.to_dict().items() + if k in transformer_train_arg_fields +} +training_args = SFTConfig(**transformer_kwargs) + +####################### FINE-TUNING ##################### + +trainer = SFTTrainer( + model=model, + tokenizer=tokenizer, + train_dataset=formatted_train_dataset, + data_collator=data_collator, + dataset_text_field="output", + args=training_args, +) +trainer.train() +``` + ### PyTorch scaled dot product attention Scaled dot product attention (SDPA) is automatically enabled in PyTorch 2.0 and it supports FlashAttention, xFormers, and PyTorch's C++ implementation. SDPA chooses the most performant attention algorithm if you're using a CUDA backend. For other backends, SDPA defaults to the PyTorch C++ implementation. diff --git a/src/transformers/modeling_flash_attention_utils.py b/src/transformers/modeling_flash_attention_utils.py index da961c6060e499..045d2f6d646010 100644 --- a/src/transformers/modeling_flash_attention_utils.py +++ b/src/transformers/modeling_flash_attention_utils.py @@ -15,7 +15,7 @@ import inspect import os -from typing import Optional, Tuple +from typing import Optional, Tuple, TypedDict import torch import torch.nn.functional as F @@ -180,6 +180,10 @@ def prepare_fa2_from_position_ids(query, key, value, position_ids): return (query, key, value, indices_q, (cu_seq_lens, cu_seq_lens), (max_length, max_length)) +flash_241 = is_flash_attn_greater_or_equal("2.4.1") +deterministic_g = os.environ.get("FLASH_ATTENTION_DETERMINISTIC", "0") == "1" + + def _flash_attention_forward( query_states: torch.Tensor, key_states: torch.Tensor, @@ -194,6 +198,10 @@ def _flash_attention_forward( use_top_left_mask: bool = False, softcap: Optional[float] = None, deterministic: bool = None, + cu_seq_lens_q: Optional[torch.LongTensor] = None, + cu_seq_lens_k: Optional[torch.LongTensor] = None, + max_length_q: Optional[int] = None, + max_length_k: Optional[int] = None, ): """ Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token @@ -232,9 +240,9 @@ def _flash_attention_forward( ) flash_kwargs = {"window_size": (sliding_window, sliding_window)} if use_sliding_windows else {} - if is_flash_attn_greater_or_equal("2.4.1"): + if flash_241: if deterministic is None: - deterministic = os.environ.get("FLASH_ATTENTION_DETERMINISTIC", "0") == "1" + deterministic = deterministic_g flash_kwargs["deterministic"] = deterministic if softcap is not None: @@ -267,24 +275,32 @@ def _flash_attention_forward( # If position_ids is provided and check all examples do not contain only 1 sequence, If tensor in increasing # then we probably have one sequence, otherwise it is packed. Additionally check we are in pre-fill/training stage. 
# Use `flash_attn_varlen_func` to prevent cross-example attention and also allow padding free approach - # Note: the `torch.diff(...)` condition is last to use short-circuit and avoid the cuda synchronization it incurs during inference (query_length == 1 always) - elif position_ids is not None and query_length != 1 and not (torch.diff(position_ids, dim=-1) >= 0).all(): + elif position_ids is not None and ( + max_length_q is not None or (query_length != 1 and not (torch.diff(position_ids, dim=-1) >= 0).all()) + ): batch_size = query_states.size(0) - query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = prepare_fa2_from_position_ids( - query_states, key_states, value_states, position_ids - ) - cu_seqlens_q, cu_seqlens_k = cu_seq_lens - max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens + if cu_seq_lens_q is None or cu_seq_lens_k is None: + query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = ( + prepare_fa2_from_position_ids(query_states, key_states, value_states, position_ids) + ) + + cu_seq_lens_q, cu_seq_lens_k = cu_seq_lens + max_length_q, max_length_k = max_seq_lens + + else: + query_states = query_states.reshape(-1, query_states.size(-2), query_states.size(-1)) + key_states = key_states.reshape(-1, key_states.size(-2), key_states.size(-1)) + value_states = value_states.reshape(-1, value_states.size(-2), value_states.size(-1)) attn_output = flash_attn_varlen_func( query_states, key_states, value_states, - cu_seqlens_q=cu_seqlens_q, - cu_seqlens_k=cu_seqlens_k, - max_seqlen_q=max_seqlen_in_batch_q, - max_seqlen_k=max_seqlen_in_batch_k, + cu_seqlens_q=cu_seq_lens_q, + cu_seqlens_k=cu_seq_lens_k, + max_seqlen_q=max_length_q, + max_seqlen_k=max_length_k, dropout_p=dropout, softmax_scale=softmax_scale, causal=causal, @@ -299,3 +315,24 @@ def _flash_attention_forward( ) return attn_output + + +class FlashAttentionKwargs(TypedDict, total=False): + """ + Keyword arguments for Flash Attention with Compile. + + Attributes: + cu_seq_lens_q (`torch.LongTensor`, *optional*) + Gets cumlative sequence length for query state. + cu_seq_lens_k (`torch.LongTensor`, *optional*) + Gets cumlative sequence length for key state. + max_length_q (`int`, *optional*): + Maximum sequence length for query state. + max_length_k (`int`, *optional*): + Maximum sequence length for key state. 
+ """ + + cu_seq_lens_q: Optional[torch.LongTensor] + cu_seq_lens_k: Optional[torch.LongTensor] + max_length_q: Optional[int] + max_length_k: Optional[int] diff --git a/src/transformers/models/cohere/modeling_cohere.py b/src/transformers/models/cohere/modeling_cohere.py index 9aa588be431029..b215fb6561bf81 100644 --- a/src/transformers/models/cohere/modeling_cohere.py +++ b/src/transformers/models/cohere/modeling_cohere.py @@ -33,12 +33,14 @@ from ...cache_utils import Cache, DynamicCache, StaticCache from ...generation import GenerationMixin from ...modeling_attn_mask_utils import AttentionMaskConverter +from ...modeling_flash_attention_utils import FlashAttentionKwargs from ...modeling_outputs import ( BaseModelOutputWithPast, CausalLMOutputWithPast, ) from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS from ...modeling_utils import PreTrainedModel +from ...processing_utils import Unpack from ...pytorch_utils import ALL_LAYERNORM_LAYERS from ...utils import ( add_start_docstrings, @@ -832,6 +834,7 @@ def forward( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, cache_position: Optional[torch.LongTensor] = None, + **flash_attn_kwargs: Unpack[FlashAttentionKwargs], ) -> Union[Tuple, BaseModelOutputWithPast]: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( @@ -913,6 +916,7 @@ def forward( use_cache=use_cache, cache_position=cache_position, position_embeddings=position_embeddings, + **flash_attn_kwargs, ) hidden_states = layer_outputs[0] diff --git a/src/transformers/models/glm/modeling_glm.py b/src/transformers/models/glm/modeling_glm.py index aad4da282b7878..6354e20e33fe8c 100644 --- a/src/transformers/models/glm/modeling_glm.py +++ b/src/transformers/models/glm/modeling_glm.py @@ -38,6 +38,7 @@ ) from ...modeling_utils import PreTrainedModel from ...utils import ( + add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, is_flash_attn_2_available, @@ -51,7 +52,11 @@ if is_flash_attn_2_available(): from ...modeling_flash_attention_utils import _flash_attention_forward -from ...modeling_flash_attention_utils import _flash_attention_forward +from ...modeling_flash_attention_utils import FlashAttentionKwargs, _flash_attention_forward +from ...processing_utils import Unpack + + +_CHECKPOINT_FOR_DOC = "dummy" class GlmRMSNorm(nn.Module): @@ -736,6 +741,7 @@ def forward( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, cache_position: Optional[torch.LongTensor] = None, + **flash_attn_kwargs: Unpack[FlashAttentionKwargs], ) -> Union[Tuple, BaseModelOutputWithPast]: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( @@ -817,6 +823,7 @@ def forward( use_cache=use_cache, cache_position=cache_position, position_embeddings=position_embeddings, + **flash_attn_kwargs, ) hidden_states = layer_outputs[0] @@ -1222,6 +1229,11 @@ def set_input_embeddings(self, value): self.model.embed_tokens = value @add_start_docstrings_to_model_forward(GLM_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TokenClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) def forward( self, input_ids: Optional[torch.LongTensor] = None, diff --git a/src/transformers/models/glm/modular_glm.py b/src/transformers/models/glm/modular_glm.py index 55bf89d1c56b28..c26477fdc173b1 100644 --- a/src/transformers/models/glm/modular_glm.py 
+++ b/src/transformers/models/glm/modular_glm.py @@ -46,6 +46,8 @@ logger = logging.get_logger(__name__) +_CHECKPOINT_FOR_DOC = "dummy" + class GlmRMSNorm(Phi3RMSNorm): pass diff --git a/src/transformers/models/llama/modeling_llama.py b/src/transformers/models/llama/modeling_llama.py index 617ef38e4ae3de..4d95f01849d678 100644 --- a/src/transformers/models/llama/modeling_llama.py +++ b/src/transformers/models/llama/modeling_llama.py @@ -29,7 +29,7 @@ from ...cache_utils import Cache, DynamicCache, StaticCache from ...generation import GenerationMixin from ...modeling_attn_mask_utils import AttentionMaskConverter -from ...modeling_flash_attention_utils import _flash_attention_forward +from ...modeling_flash_attention_utils import FlashAttentionKwargs, _flash_attention_forward from ...modeling_outputs import ( BaseModelOutputWithPast, CausalLMOutputWithPast, @@ -39,8 +39,10 @@ ) from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS from ...modeling_utils import PreTrainedModel +from ...processing_utils import Unpack from ...pytorch_utils import ALL_LAYERNORM_LAYERS from ...utils import ( + LossKwargs, add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, @@ -422,6 +424,7 @@ def forward( use_cache: bool = False, cache_position: Optional[torch.LongTensor] = None, position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46 + **kwargs: Unpack[FlashAttentionKwargs], ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: if isinstance(past_key_value, StaticCache): raise ValueError( @@ -506,6 +509,7 @@ def forward( sliding_window=getattr(self, "sliding_window", None), use_top_left_mask=self._flash_attn_uses_top_left_mask, is_causal=self.is_causal, + **kwargs, ) attn_output = attn_output.reshape(bsz, q_len, -1).contiguous() @@ -870,6 +874,7 @@ def forward( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, cache_position: Optional[torch.LongTensor] = None, + **flash_attn_kwargs: Unpack[FlashAttentionKwargs], ) -> Union[Tuple, BaseModelOutputWithPast]: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( @@ -951,6 +956,7 @@ def forward( use_cache=use_cache, cache_position=cache_position, position_embeddings=position_embeddings, + **flash_attn_kwargs, ) hidden_states = layer_outputs[0] @@ -1102,6 +1108,9 @@ def _prepare_4d_causal_attention_mask_with_cache_position( return causal_mask +class KwargsForCausalLM(FlashAttentionKwargs, LossKwargs): ... 
+ + class LlamaForCausalLM(LlamaPreTrainedModel, GenerationMixin): _tied_weights_keys = ["lm_head.weight"] @@ -1148,7 +1157,7 @@ def forward( return_dict: Optional[bool] = None, cache_position: Optional[torch.LongTensor] = None, num_logits_to_keep: int = 0, - **loss_kwargs, + **kwargs: Unpack[KwargsForCausalLM], ) -> Union[Tuple, CausalLMOutputWithPast]: r""" Args: @@ -1198,6 +1207,7 @@ def forward( output_hidden_states=output_hidden_states, return_dict=return_dict, cache_position=cache_position, + **kwargs, ) hidden_states = outputs[0] @@ -1211,7 +1221,7 @@ def forward( loss = None if labels is not None: - loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **loss_kwargs) + loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs) if not return_dict: output = (logits,) + outputs[1:] diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index 16c05a14028eee..4f3187d510fad1 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -815,7 +815,7 @@ def to(self, device: Union[str, "torch.device"]) -> "BatchEncoding": # Otherwise it passes the casts down and casts the LongTensor containing the token idxs # into a HalfTensor if isinstance(device, str) or is_torch_device(device) or isinstance(device, int): - self.data = {k: v.to(device=device) for k, v in self.data.items() if isinstance(v, torch.Tensor)} + self.data = {k: v.to(device=device) if isinstance(v, torch.Tensor) else v for k, v in self.data.items()} else: logger.warning(f"Attempting to cast a BatchEncoding to type {str(device)}. This is not supported.") return self diff --git a/src/transformers/utils/__init__.py b/src/transformers/utils/__init__.py index a781389c2fbdc8..2a10bcaa3c9412 100755 --- a/src/transformers/utils/__init__.py +++ b/src/transformers/utils/__init__.py @@ -37,6 +37,7 @@ from .generic import ( ContextManagers, ExplicitEnum, + LossKwargs, ModelOutput, PaddingStrategy, TensorType, diff --git a/src/transformers/utils/generic.py b/src/transformers/utils/generic.py index a5f01fa2e0df8a..26ec82b20fd40e 100644 --- a/src/transformers/utils/generic.py +++ b/src/transformers/utils/generic.py @@ -24,7 +24,7 @@ from dataclasses import fields, is_dataclass from enum import Enum from functools import partial, wraps -from typing import Any, ContextManager, Iterable, List, Optional, Tuple +from typing import Any, ContextManager, Iterable, List, Optional, Tuple, TypedDict import numpy as np from packaging import version @@ -854,3 +854,16 @@ def wrapper(*args, **kwargs): return wrapper return decorator + + +class LossKwargs(TypedDict, total=False): + """ + Keyword arguments to be passed to the loss function + + Attributes: + num_items_in_batch (`int`, *optional*): + Number of items in the batch. It is recommended to pass it when + you are doing gradient accumulation. 
+ """ + + num_items_in_batch: Optional[int] From 05863817d6bdf8183f9acc33c21b7e4fc026005a Mon Sep 17 00:00:00 2001 From: Arthur <48595927+ArthurZucker@users.noreply.github.com> Date: Thu, 24 Oct 2024 11:11:55 +0200 Subject: [PATCH 089/385] Better defaults (#34026) * be nice to our usres * nit * fixup * default to -1 * oups * turbo nit * auto infer framework --- src/transformers/generation/utils.py | 2 ++ src/transformers/pipelines/base.py | 17 ++++------------- 2 files changed, 6 insertions(+), 13 deletions(-) diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py index 700ea0443f4dbd..3938457155d83f 100644 --- a/src/transformers/generation/utils.py +++ b/src/transformers/generation/utils.py @@ -1440,6 +1440,8 @@ def _prepare_generated_length( and not self.config.is_encoder_decoder ): generation_config.max_length -= inputs_tensor.shape[1] + else: # by default let's always generate 10 new tokens + generation_config.max_length = generation_config.max_length + input_ids_length # same for min length if generation_config.min_new_tokens is not None: diff --git a/src/transformers/pipelines/base.py b/src/transformers/pipelines/base.py index 042958cbb0c6b3..25c2a11564c3f1 100644 --- a/src/transformers/pipelines/base.py +++ b/src/transformers/pipelines/base.py @@ -881,18 +881,7 @@ def __init__( # Take the first device used by `accelerate`. device = next(iter(hf_device_map.values())) else: - device = -1 - if ( - is_torch_mlu_available() - or is_torch_cuda_available() - or is_torch_npu_available() - or is_torch_xpu_available(check_device=True) - or is_torch_mps_available() - ): - logger.warning( - "Hardware accelerator e.g. GPU is available in the environment, but no `device` argument" - " is passed to the `Pipeline` object. Model will be on CPU." - ) + device = 0 if is_torch_available() and self.framework == "pt": if device == -1 and self.model.device is not None: @@ -920,10 +909,12 @@ def __init__( elif is_torch_mps_available(): self.device = torch.device(f"mps:{device}") else: - raise ValueError(f"{device} unrecognized or not available.") + self.device = torch.device("cpu") else: self.device = device if device is not None else -1 + logger.warning(f"Device set to use {self.device}") + self.binary_output = binary_output # We shouldn't call `model.to()` for models loaded with accelerate as well as the case that model is already on device if ( From f0e640adfa3cedea53912b95e3093f05cc2b66b5 Mon Sep 17 00:00:00 2001 From: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Date: Thu, 24 Oct 2024 11:16:55 +0200 Subject: [PATCH 090/385] Drop support for Python 3.8 (#34314) * drop python 3.8 * update docker files --------- Co-authored-by: ydshieh --- CONTRIBUTING.md | 2 +- README.md | 2 +- docker/transformers-all-latest-gpu/Dockerfile | 2 +- docker/transformers-pytorch-gpu/Dockerfile | 2 +- docker/transformers-quantization-latest-gpu/Dockerfile | 8 ++++---- docker/transformers-tensorflow-gpu/Dockerfile | 2 +- docs/source/de/contributing.md | 2 +- docs/source/ko/contributing.md | 2 +- docs/source/zh/contributing.md | 2 +- setup.py | 5 ++--- src/transformers/dependency_versions_table.py | 2 +- 11 files changed, 15 insertions(+), 16 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 4d62a44ab250d5..9eeea997154085 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -132,7 +132,7 @@ You will need basic `git` proficiency to contribute to manual. Type `git --help` in a shell and enjoy! 
If you prefer books, [Pro Git](https://git-scm.com/book/en/v2) is a very good reference. -You'll need **[Python 3.8](https://github.com/huggingface/transformers/blob/main/setup.py#L449)** or above to contribute to 🤗 Transformers. Follow the steps below to start contributing: +You'll need **[Python 3.9](https://github.com/huggingface/transformers/blob/main/setup.py#L449)** or above to contribute to 🤗 Transformers. Follow the steps below to start contributing: 1. Fork the [repository](https://github.com/huggingface/transformers) by clicking on the **[Fork](https://github.com/huggingface/transformers/fork)** button on the repository's page. This creates a copy of the code diff --git a/README.md b/README.md index 68e2a215d4cdd6..c748e675066202 100644 --- a/README.md +++ b/README.md @@ -249,7 +249,7 @@ The model itself is a regular [Pytorch `nn.Module`](https://pytorch.org/docs/sta ### With pip -This repository is tested on Python 3.8+, Flax 0.4.1+, PyTorch 1.11+, and TensorFlow 2.6+. +This repository is tested on Python 3.9+, Flax 0.4.1+, PyTorch 1.11+, and TensorFlow 2.6+. You should install 🤗 Transformers in a [virtual environment](https://docs.python.org/3/library/venv.html). If you're unfamiliar with Python virtual environments, check out the [user guide](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/). diff --git a/docker/transformers-all-latest-gpu/Dockerfile b/docker/transformers-all-latest-gpu/Dockerfile index 08e37ea6e1292f..93f9b6f6a170fd 100644 --- a/docker/transformers-all-latest-gpu/Dockerfile +++ b/docker/transformers-all-latest-gpu/Dockerfile @@ -1,4 +1,4 @@ -FROM nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04 +FROM nvidia/cuda:12.1.0-cudnn8-devel-ubuntu22.04 LABEL maintainer="Hugging Face" ARG DEBIAN_FRONTEND=noninteractive diff --git a/docker/transformers-pytorch-gpu/Dockerfile b/docker/transformers-pytorch-gpu/Dockerfile index 2c1f153eef275e..62578ad0f3610f 100644 --- a/docker/transformers-pytorch-gpu/Dockerfile +++ b/docker/transformers-pytorch-gpu/Dockerfile @@ -1,4 +1,4 @@ -FROM nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04 +FROM nvidia/cuda:12.1.0-cudnn8-devel-ubuntu22.04 LABEL maintainer="Hugging Face" ARG DEBIAN_FRONTEND=noninteractive diff --git a/docker/transformers-quantization-latest-gpu/Dockerfile b/docker/transformers-quantization-latest-gpu/Dockerfile index 0617ac8cdd779c..53e66662f9ee99 100755 --- a/docker/transformers-quantization-latest-gpu/Dockerfile +++ b/docker/transformers-quantization-latest-gpu/Dockerfile @@ -1,4 +1,4 @@ -FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04 +FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04 LABEL maintainer="Hugging Face" ARG DEBIAN_FRONTEND=noninteractive @@ -9,12 +9,12 @@ SHELL ["sh", "-lc"] # The following `ARG` are mainly used to specify the versions explicitly & directly in this docker file, and not meant # to be used as arguments for docker build (so far). -ARG PYTORCH='2.2.1' +ARG PYTORCH='2.4.1' # Example: `cu102`, `cu113`, etc. 
ARG CUDA='cu118' RUN apt update -RUN apt install -y git libsndfile1-dev tesseract-ocr espeak-ng python python3-pip ffmpeg +RUN apt install -y git libsndfile1-dev tesseract-ocr espeak-ng python3 python3-pip ffmpeg RUN python3 -m pip install --no-cache-dir --upgrade pip ARG REF=main @@ -53,7 +53,7 @@ RUN python3 -m pip install --no-cache-dir gguf # Add autoawq for quantization testing # >=v0.2.3 needed for compatibility with torch 2.2.1 -RUN python3 -m pip install --no-cache-dir https://github.com/casper-hansen/AutoAWQ/releases/download/v0.2.3/autoawq-0.2.3+cu118-cp38-cp38-linux_x86_64.whl +RUN python3 -m pip install --no-cache-dir https://github.com/casper-hansen/AutoAWQ/releases/download/v0.2.3/autoawq-0.2.3+cu118-cp310-cp310-linux_x86_64.whl # Add quanto for quantization testing RUN python3 -m pip install --no-cache-dir optimum-quanto diff --git a/docker/transformers-tensorflow-gpu/Dockerfile b/docker/transformers-tensorflow-gpu/Dockerfile index adccee1ace4998..d765767780f46c 100644 --- a/docker/transformers-tensorflow-gpu/Dockerfile +++ b/docker/transformers-tensorflow-gpu/Dockerfile @@ -1,4 +1,4 @@ -FROM nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04 +FROM nvidia/cuda:12.1.0-cudnn8-devel-ubuntu22.04 LABEL maintainer="Hugging Face" ARG DEBIAN_FRONTEND=noninteractive diff --git a/docs/source/de/contributing.md b/docs/source/de/contributing.md index 4c0e131a352242..d014dd67c83aac 100644 --- a/docs/source/de/contributing.md +++ b/docs/source/de/contributing.md @@ -112,7 +112,7 @@ Bevor Sie irgendwelchen Code schreiben, empfehlen wir Ihnen dringend, die besteh Sie benötigen grundlegende `git`-Kenntnisse, um zu 🤗 Transformers beizutragen. Obwohl `git` nicht das einfachste Werkzeug ist, hat es ein sehr gutes Handbuch. Geben Sie `git --help` in eine Shell ein und genießen Sie es! Wenn Sie Bücher bevorzugen, ist [Pro Git](https://git-scm.com/book/en/v2) eine gute Anlaufstelle. -Sie benötigen **[Python 3.8](https://github.com/huggingface/transformers/blob/main/setup.py#L426)** oder höher, um zu 🤗 Transformers beizutragen. Folgen Sie den nachstehenden Schritten, um mit dem Beitrag zu beginnen: +Sie benötigen **[Python 3.9](https://github.com/huggingface/transformers/blob/main/setup.py#L426)** oder höher, um zu 🤗 Transformers beizutragen. Folgen Sie den nachstehenden Schritten, um mit dem Beitrag zu beginnen: 1. Forken Sie das [Repository](https://github.com/huggingface/transformers), indem Sie auf den **[Fork](https://github.com/huggingface/transformers/fork)**-Button auf der Seite des Repositorys klicken. Dadurch wird eine Kopie des Codes auf Ihrem GitHub-Account erstellt. diff --git a/docs/source/ko/contributing.md b/docs/source/ko/contributing.md index f5003eff07c02e..99f1d2b6664728 100644 --- a/docs/source/ko/contributing.md +++ b/docs/source/ko/contributing.md @@ -113,7 +113,7 @@ python src/transformers/commands/transformers_cli.py env 🤗 Transformers에 기여하기 위해서는 기본적인 `git` 사용 능력이 필요합니다. `git`은 사용하기 쉬운 도구는 아니지만, 매우 훌륭한 매뉴얼을 제공합니다. 쉘(shell)에서 `git --help`을 입력하여 확인해보세요! 만약 책을 선호한다면, [Pro Git](https://git-scm.com/book/en/v2)은 매우 좋은 참고 자료가 될 것입니다. -🤗 Transformers에 기여하려면 **[Python 3.8](https://github.com/huggingface/transformers/blob/main/setup.py#L426)** 이상의 버전이 필요합니다. 기여를 시작하려면 다음 단계를 따르세요: +🤗 Transformers에 기여하려면 **[Python 3.9](https://github.com/huggingface/transformers/blob/main/setup.py#L426)** 이상의 버전이 필요합니다. 기여를 시작하려면 다음 단계를 따르세요: 1. 저장소 페이지에서 **[Fork](https://github.com/huggingface/transformers/fork)** 버튼을 클릭하여 저장소를 포크하세요. 이렇게 하면 코드의 복사본이 여러분의 GitHub 사용자 계정 아래에 생성됩니다. 
diff --git a/docs/source/zh/contributing.md b/docs/source/zh/contributing.md index 9c247a60a148c8..b525754359babc 100644 --- a/docs/source/zh/contributing.md +++ b/docs/source/zh/contributing.md @@ -112,7 +112,7 @@ python src/transformers/commands/transformers_cli.py env 要为 🤗 Transformers 做贡献,你需要基本的 `git` 使用技能。虽然 `git` 不是一个很容易使用的工具,但它提供了非常全面的手册,在命令行中输入 `git --help` 并享受吧!如果你更喜欢书籍,[Pro Git](https://git-scm.com/book/en/v2)是一本很好的参考书。 -要为 🤗 Transformers 做贡献,你需要 **[Python 3.8](https://github.com/huggingface/transformers/blob/main/setup.py#L426)** 或更高版本。请按照以下步骤开始贡献: +要为 🤗 Transformers 做贡献,你需要 **[Python 3.9](https://github.com/huggingface/transformers/blob/main/setup.py#L426)** 或更高版本。请按照以下步骤开始贡献: 1. 点击[仓库](https://github.com/huggingface/transformers)页面上的 **[Fork](https://github.com/huggingface/transformers/fork)** 按钮,这会在你的 GitHub 账号下拷贝一份代码。 diff --git a/setup.py b/setup.py index 1846f7bf97b5d4..f4028d13c44e91 100644 --- a/setup.py +++ b/setup.py @@ -150,7 +150,7 @@ "pytest>=7.2.0,<8.0.0", "pytest-timeout", "pytest-xdist", - "python>=3.8.0", + "python>=3.9.0", "ray[tune]>=2.7.0", "regex!=2019.12.17", "requests", @@ -451,7 +451,7 @@ def run(self): zip_safe=False, extras_require=extras, entry_points={"console_scripts": ["transformers-cli=transformers.commands.transformers_cli:main"]}, - python_requires=">=3.8.0", + python_requires=">=3.9.0", install_requires=list(install_requires), classifiers=[ "Development Status :: 5 - Production/Stable", @@ -461,7 +461,6 @@ def run(self): "License :: OSI Approved :: Apache Software License", "Operating System :: OS Independent", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Topic :: Scientific/Engineering :: Artificial Intelligence", diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py index 5ce23f4b7647d5..a633f54a4af1a8 100644 --- a/src/transformers/dependency_versions_table.py +++ b/src/transformers/dependency_versions_table.py @@ -56,7 +56,7 @@ "pytest": "pytest>=7.2.0,<8.0.0", "pytest-timeout": "pytest-timeout", "pytest-xdist": "pytest-xdist", - "python": "python>=3.8.0", + "python": "python>=3.9.0", "ray[tune]": "ray[tune]>=2.7.0", "regex": "regex!=2019.12.17", "requests": "requests", From 9643069465ff63191da97ddc459813d129308818 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Thu, 24 Oct 2024 11:23:29 +0200 Subject: [PATCH 091/385] v4.47.0.dev0 --- examples/flax/question-answering/run_qa.py | 2 +- .../speech-recognition/run_flax_speech_recognition_seq2seq.py | 2 +- examples/flax/text-classification/run_flax_glue.py | 2 +- examples/flax/token-classification/run_flax_ner.py | 2 +- .../pytorch/audio-classification/run_audio_classification.py | 2 +- examples/pytorch/contrastive-image-text/run_clip.py | 2 +- .../pytorch/image-classification/run_image_classification.py | 2 +- .../image-classification/run_image_classification_no_trainer.py | 2 +- examples/pytorch/image-pretraining/run_mae.py | 2 +- examples/pytorch/image-pretraining/run_mim.py | 2 +- examples/pytorch/image-pretraining/run_mim_no_trainer.py | 2 +- .../pytorch/instance-segmentation/run_instance_segmentation.py | 2 +- .../run_instance_segmentation_no_trainer.py | 2 +- examples/pytorch/language-modeling/run_clm.py | 2 +- examples/pytorch/language-modeling/run_clm_no_trainer.py | 2 +- examples/pytorch/language-modeling/run_fim.py | 2 +- examples/pytorch/language-modeling/run_fim_no_trainer.py | 2 +- 
examples/pytorch/language-modeling/run_mlm.py | 2 +- examples/pytorch/language-modeling/run_mlm_no_trainer.py | 2 +- examples/pytorch/language-modeling/run_plm.py | 2 +- examples/pytorch/multiple-choice/run_swag.py | 2 +- examples/pytorch/multiple-choice/run_swag_no_trainer.py | 2 +- examples/pytorch/object-detection/run_object_detection.py | 2 +- .../pytorch/object-detection/run_object_detection_no_trainer.py | 2 +- examples/pytorch/question-answering/run_qa.py | 2 +- examples/pytorch/question-answering/run_qa_beam_search.py | 2 +- .../pytorch/question-answering/run_qa_beam_search_no_trainer.py | 2 +- examples/pytorch/question-answering/run_qa_no_trainer.py | 2 +- examples/pytorch/question-answering/run_seq2seq_qa.py | 2 +- .../pytorch/semantic-segmentation/run_semantic_segmentation.py | 2 +- .../run_semantic_segmentation_no_trainer.py | 2 +- .../pytorch/speech-recognition/run_speech_recognition_ctc.py | 2 +- .../speech-recognition/run_speech_recognition_ctc_adapter.py | 2 +- .../speech-recognition/run_speech_recognition_seq2seq.py | 2 +- examples/pytorch/summarization/run_summarization.py | 2 +- examples/pytorch/summarization/run_summarization_no_trainer.py | 2 +- examples/pytorch/text-classification/run_classification.py | 2 +- examples/pytorch/text-classification/run_glue.py | 2 +- examples/pytorch/text-classification/run_glue_no_trainer.py | 2 +- examples/pytorch/text-classification/run_xnli.py | 2 +- examples/pytorch/token-classification/run_ner.py | 2 +- examples/pytorch/token-classification/run_ner_no_trainer.py | 2 +- examples/pytorch/translation/run_translation.py | 2 +- examples/pytorch/translation/run_translation_no_trainer.py | 2 +- examples/tensorflow/contrastive-image-text/run_clip.py | 2 +- .../tensorflow/image-classification/run_image_classification.py | 2 +- examples/tensorflow/multiple-choice/run_swag.py | 2 +- examples/tensorflow/question-answering/run_qa.py | 2 +- examples/tensorflow/summarization/run_summarization.py | 2 +- examples/tensorflow/text-classification/run_glue.py | 2 +- examples/tensorflow/translation/run_translation.py | 2 +- setup.py | 2 +- src/transformers/__init__.py | 2 +- 53 files changed, 53 insertions(+), 53 deletions(-) diff --git a/examples/flax/question-answering/run_qa.py b/examples/flax/question-answering/run_qa.py index f72aa0df1ff256..25a8706d869bcf 100644 --- a/examples/flax/question-answering/run_qa.py +++ b/examples/flax/question-answering/run_qa.py @@ -61,7 +61,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") Array = Any Dataset = datasets.arrow_dataset.Dataset diff --git a/examples/flax/speech-recognition/run_flax_speech_recognition_seq2seq.py b/examples/flax/speech-recognition/run_flax_speech_recognition_seq2seq.py index 361ab4aa54f7a7..c0085c9f4bb88c 100644 --- a/examples/flax/speech-recognition/run_flax_speech_recognition_seq2seq.py +++ b/examples/flax/speech-recognition/run_flax_speech_recognition_seq2seq.py @@ -60,7 +60,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risk. 
-check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") require_version("datasets>=2.14.0", "To fix: pip install -r examples/flax/speech-recognition/requirements.txt") diff --git a/examples/flax/text-classification/run_flax_glue.py b/examples/flax/text-classification/run_flax_glue.py index 092db16c987a8f..9ffbb82cd3aae6 100755 --- a/examples/flax/text-classification/run_flax_glue.py +++ b/examples/flax/text-classification/run_flax_glue.py @@ -56,7 +56,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") Array = Any Dataset = datasets.arrow_dataset.Dataset diff --git a/examples/flax/token-classification/run_flax_ner.py b/examples/flax/token-classification/run_flax_ner.py index dd801456da2a91..9ffaade205611c 100644 --- a/examples/flax/token-classification/run_flax_ner.py +++ b/examples/flax/token-classification/run_flax_ner.py @@ -57,7 +57,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt") diff --git a/examples/pytorch/audio-classification/run_audio_classification.py b/examples/pytorch/audio-classification/run_audio_classification.py index 009a1f6372433a..cfbc4d83d93c49 100644 --- a/examples/pytorch/audio-classification/run_audio_classification.py +++ b/examples/pytorch/audio-classification/run_audio_classification.py @@ -45,7 +45,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") require_version("datasets>=1.14.0", "To fix: pip install -r examples/pytorch/audio-classification/requirements.txt") diff --git a/examples/pytorch/contrastive-image-text/run_clip.py b/examples/pytorch/contrastive-image-text/run_clip.py index cab047ae0cb1b0..3bed494b75c608 100644 --- a/examples/pytorch/contrastive-image-text/run_clip.py +++ b/examples/pytorch/contrastive-image-text/run_clip.py @@ -54,7 +54,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/contrastive-image-text/requirements.txt") diff --git a/examples/pytorch/image-classification/run_image_classification.py b/examples/pytorch/image-classification/run_image_classification.py index 0a9789426c2c46..aa1cd089ef5c7f 100755 --- a/examples/pytorch/image-classification/run_image_classification.py +++ b/examples/pytorch/image-classification/run_image_classification.py @@ -56,7 +56,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/image-classification/requirements.txt") diff --git a/examples/pytorch/image-classification/run_image_classification_no_trainer.py b/examples/pytorch/image-classification/run_image_classification_no_trainer.py index 0866cb0f8323cd..2c60b359bd106b 100644 --- a/examples/pytorch/image-classification/run_image_classification_no_trainer.py +++ b/examples/pytorch/image-classification/run_image_classification_no_trainer.py @@ -49,7 +49,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") logger = get_logger(__name__) diff --git a/examples/pytorch/image-pretraining/run_mae.py b/examples/pytorch/image-pretraining/run_mae.py index 46863cbbf1ce3e..90b30c60e78411 100644 --- a/examples/pytorch/image-pretraining/run_mae.py +++ b/examples/pytorch/image-pretraining/run_mae.py @@ -43,7 +43,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt") diff --git a/examples/pytorch/image-pretraining/run_mim.py b/examples/pytorch/image-pretraining/run_mim.py index 3912c693440192..773038f445cc40 100644 --- a/examples/pytorch/image-pretraining/run_mim.py +++ b/examples/pytorch/image-pretraining/run_mim.py @@ -48,7 +48,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt") diff --git a/examples/pytorch/image-pretraining/run_mim_no_trainer.py b/examples/pytorch/image-pretraining/run_mim_no_trainer.py index 5db5f55730fa7f..5f38481db2315c 100644 --- a/examples/pytorch/image-pretraining/run_mim_no_trainer.py +++ b/examples/pytorch/image-pretraining/run_mim_no_trainer.py @@ -53,7 +53,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt") diff --git a/examples/pytorch/instance-segmentation/run_instance_segmentation.py b/examples/pytorch/instance-segmentation/run_instance_segmentation.py index aeb78f95d28878..368296709f699a 100644 --- a/examples/pytorch/instance-segmentation/run_instance_segmentation.py +++ b/examples/pytorch/instance-segmentation/run_instance_segmentation.py @@ -46,7 +46,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/instance-segmentation/requirements.txt") diff --git a/examples/pytorch/instance-segmentation/run_instance_segmentation_no_trainer.py b/examples/pytorch/instance-segmentation/run_instance_segmentation_no_trainer.py index 75b74d17d9d236..d8bb9d6f235e61 100644 --- a/examples/pytorch/instance-segmentation/run_instance_segmentation_no_trainer.py +++ b/examples/pytorch/instance-segmentation/run_instance_segmentation_no_trainer.py @@ -52,7 +52,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/instance-segmentation/requirements.txt") diff --git a/examples/pytorch/language-modeling/run_clm.py b/examples/pytorch/language-modeling/run_clm.py index 656571eb37e40e..d3f8ad8da97f3c 100755 --- a/examples/pytorch/language-modeling/run_clm.py +++ b/examples/pytorch/language-modeling/run_clm.py @@ -55,7 +55,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/pytorch/language-modeling/run_clm_no_trainer.py b/examples/pytorch/language-modeling/run_clm_no_trainer.py index e40a7bb265b855..15538b2ef2e302 100755 --- a/examples/pytorch/language-modeling/run_clm_no_trainer.py +++ b/examples/pytorch/language-modeling/run_clm_no_trainer.py @@ -57,7 +57,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") logger = get_logger(__name__) diff --git a/examples/pytorch/language-modeling/run_fim.py b/examples/pytorch/language-modeling/run_fim.py index 154fc1518384e4..9d0e0008839d99 100644 --- a/examples/pytorch/language-modeling/run_fim.py +++ b/examples/pytorch/language-modeling/run_fim.py @@ -58,7 +58,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/pytorch/language-modeling/run_fim_no_trainer.py b/examples/pytorch/language-modeling/run_fim_no_trainer.py index b06aad86629fac..0af6d61107db66 100644 --- a/examples/pytorch/language-modeling/run_fim_no_trainer.py +++ b/examples/pytorch/language-modeling/run_fim_no_trainer.py @@ -60,7 +60,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") logger = get_logger(__name__) diff --git a/examples/pytorch/language-modeling/run_mlm.py b/examples/pytorch/language-modeling/run_mlm.py index d021318ae065d9..4b615fdc4cf1d5 100755 --- a/examples/pytorch/language-modeling/run_mlm.py +++ b/examples/pytorch/language-modeling/run_mlm.py @@ -54,7 +54,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/pytorch/language-modeling/run_mlm_no_trainer.py b/examples/pytorch/language-modeling/run_mlm_no_trainer.py index 8961ee93d312f0..13a1f7a0d86231 100755 --- a/examples/pytorch/language-modeling/run_mlm_no_trainer.py +++ b/examples/pytorch/language-modeling/run_mlm_no_trainer.py @@ -57,7 +57,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") logger = get_logger(__name__) require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/pytorch/language-modeling/run_plm.py b/examples/pytorch/language-modeling/run_plm.py index 0a207b80479ce2..1c2b7ecf9905ef 100755 --- a/examples/pytorch/language-modeling/run_plm.py +++ b/examples/pytorch/language-modeling/run_plm.py @@ -47,7 +47,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/pytorch/multiple-choice/run_swag.py b/examples/pytorch/multiple-choice/run_swag.py index ac5db5f6b02727..ea6c4a0e317eec 100755 --- a/examples/pytorch/multiple-choice/run_swag.py +++ b/examples/pytorch/multiple-choice/run_swag.py @@ -47,7 +47,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") logger = logging.getLogger(__name__) diff --git a/examples/pytorch/multiple-choice/run_swag_no_trainer.py b/examples/pytorch/multiple-choice/run_swag_no_trainer.py index 49436fefd1d7f4..2f6723907957f5 100755 --- a/examples/pytorch/multiple-choice/run_swag_no_trainer.py +++ b/examples/pytorch/multiple-choice/run_swag_no_trainer.py @@ -56,7 +56,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") logger = get_logger(__name__) # You should update this to your particular problem to have better documentation of `model_type` diff --git a/examples/pytorch/object-detection/run_object_detection.py b/examples/pytorch/object-detection/run_object_detection.py index 0aea1a11c14ca9..9111874438648a 100644 --- a/examples/pytorch/object-detection/run_object_detection.py +++ b/examples/pytorch/object-detection/run_object_detection.py @@ -48,7 +48,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/object-detection/requirements.txt") diff --git a/examples/pytorch/object-detection/run_object_detection_no_trainer.py b/examples/pytorch/object-detection/run_object_detection_no_trainer.py index 23420205a9f70b..f312d0ce8a1f1a 100644 --- a/examples/pytorch/object-detection/run_object_detection_no_trainer.py +++ b/examples/pytorch/object-detection/run_object_detection_no_trainer.py @@ -51,7 +51,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") logging.basicConfig(level=logging.INFO) logger = get_logger(__name__) diff --git a/examples/pytorch/question-answering/run_qa.py b/examples/pytorch/question-answering/run_qa.py index bb0a6455926197..3159a79c7e5525 100755 --- a/examples/pytorch/question-answering/run_qa.py +++ b/examples/pytorch/question-answering/run_qa.py @@ -50,7 +50,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/pytorch/question-answering/run_qa_beam_search.py b/examples/pytorch/question-answering/run_qa_beam_search.py index b3d9ee1e9c7934..2fc71e0666be23 100755 --- a/examples/pytorch/question-answering/run_qa_beam_search.py +++ b/examples/pytorch/question-answering/run_qa_beam_search.py @@ -48,7 +48,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py index 68dbdf0d6c11e9..3b7d607933c38a 100644 --- a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py +++ b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py @@ -56,7 +56,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/pytorch/question-answering/run_qa_no_trainer.py b/examples/pytorch/question-answering/run_qa_no_trainer.py index d8dfb3ec3502fc..a8f1fc10b9c540 100755 --- a/examples/pytorch/question-answering/run_qa_no_trainer.py +++ b/examples/pytorch/question-answering/run_qa_no_trainer.py @@ -57,7 +57,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/pytorch/question-answering/run_seq2seq_qa.py b/examples/pytorch/question-answering/run_seq2seq_qa.py index 7cf50cf94a03b0..b0bcb940e5186f 100644 --- a/examples/pytorch/question-answering/run_seq2seq_qa.py +++ b/examples/pytorch/question-answering/run_seq2seq_qa.py @@ -46,7 +46,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py b/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py index 4c119dcbb4a450..46f2fa45a246cc 100644 --- a/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py +++ b/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py @@ -51,7 +51,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/semantic-segmentation/requirements.txt") diff --git a/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py b/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py index 2787a228134f7d..a0ce4d0f75c639 100644 --- a/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py +++ b/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py @@ -50,7 +50,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") logger = get_logger(__name__) diff --git a/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py b/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py index ff5da5ed49ad68..78c798fd471309 100755 --- a/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py +++ b/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py @@ -50,7 +50,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt") diff --git a/examples/pytorch/speech-recognition/run_speech_recognition_ctc_adapter.py b/examples/pytorch/speech-recognition/run_speech_recognition_ctc_adapter.py index 66a75ca5d09269..4d9bb7780420d8 100755 --- a/examples/pytorch/speech-recognition/run_speech_recognition_ctc_adapter.py +++ b/examples/pytorch/speech-recognition/run_speech_recognition_ctc_adapter.py @@ -53,7 +53,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt") diff --git a/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py b/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py index 8740ec5f88fa65..aa03dacd981d6e 100755 --- a/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py +++ b/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py @@ -48,7 +48,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt") diff --git a/examples/pytorch/summarization/run_summarization.py b/examples/pytorch/summarization/run_summarization.py index 9a25d944053ee2..9c4c2ac13d44c2 100755 --- a/examples/pytorch/summarization/run_summarization.py +++ b/examples/pytorch/summarization/run_summarization.py @@ -52,7 +52,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt") diff --git a/examples/pytorch/summarization/run_summarization_no_trainer.py b/examples/pytorch/summarization/run_summarization_no_trainer.py index ad6abc7df3eda2..3d38e35aac59e2 100644 --- a/examples/pytorch/summarization/run_summarization_no_trainer.py +++ b/examples/pytorch/summarization/run_summarization_no_trainer.py @@ -56,7 +56,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") logger = get_logger(__name__) require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt") diff --git a/examples/pytorch/text-classification/run_classification.py b/examples/pytorch/text-classification/run_classification.py index a440a48110aa41..e7a186836fb82c 100755 --- a/examples/pytorch/text-classification/run_classification.py +++ b/examples/pytorch/text-classification/run_classification.py @@ -47,7 +47,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt") diff --git a/examples/pytorch/text-classification/run_glue.py b/examples/pytorch/text-classification/run_glue.py index 4284fdf12f80a2..90acf81a36a408 100755 --- a/examples/pytorch/text-classification/run_glue.py +++ b/examples/pytorch/text-classification/run_glue.py @@ -48,7 +48,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt") diff --git a/examples/pytorch/text-classification/run_glue_no_trainer.py b/examples/pytorch/text-classification/run_glue_no_trainer.py index 3f18d974a96dbc..7fcdf81fa861ed 100644 --- a/examples/pytorch/text-classification/run_glue_no_trainer.py +++ b/examples/pytorch/text-classification/run_glue_no_trainer.py @@ -49,7 +49,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") logger = get_logger(__name__) diff --git a/examples/pytorch/text-classification/run_xnli.py b/examples/pytorch/text-classification/run_xnli.py index 6578e96dc9c585..b058b6f74fdc5c 100755 --- a/examples/pytorch/text-classification/run_xnli.py +++ b/examples/pytorch/text-classification/run_xnli.py @@ -48,7 +48,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt") diff --git a/examples/pytorch/token-classification/run_ner.py b/examples/pytorch/token-classification/run_ner.py index d2a4c3dabfd63c..c8cb098e344b0a 100755 --- a/examples/pytorch/token-classification/run_ner.py +++ b/examples/pytorch/token-classification/run_ner.py @@ -49,7 +49,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt") diff --git a/examples/pytorch/token-classification/run_ner_no_trainer.py b/examples/pytorch/token-classification/run_ner_no_trainer.py index 597da1d9d666b5..0646af80bdc71d 100755 --- a/examples/pytorch/token-classification/run_ner_no_trainer.py +++ b/examples/pytorch/token-classification/run_ner_no_trainer.py @@ -56,7 +56,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") logger = get_logger(__name__) require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt") diff --git a/examples/pytorch/translation/run_translation.py b/examples/pytorch/translation/run_translation.py index 4e164010185ea5..ea37b9c51e6769 100755 --- a/examples/pytorch/translation/run_translation.py +++ b/examples/pytorch/translation/run_translation.py @@ -52,7 +52,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/translation/requirements.txt") diff --git a/examples/pytorch/translation/run_translation_no_trainer.py b/examples/pytorch/translation/run_translation_no_trainer.py index 4f896dff21c6b9..ba1f15dd83edd4 100644 --- a/examples/pytorch/translation/run_translation_no_trainer.py +++ b/examples/pytorch/translation/run_translation_no_trainer.py @@ -57,7 +57,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") logger = get_logger(__name__) require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/translation/requirements.txt") diff --git a/examples/tensorflow/contrastive-image-text/run_clip.py b/examples/tensorflow/contrastive-image-text/run_clip.py index 68728003fc30d8..20a01a46f21a59 100644 --- a/examples/tensorflow/contrastive-image-text/run_clip.py +++ b/examples/tensorflow/contrastive-image-text/run_clip.py @@ -51,7 +51,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") require_version( "datasets>=1.8.0", "To fix: pip install -r examples/tensorflow/contrastive-image-text/requirements.txt" diff --git a/examples/tensorflow/image-classification/run_image_classification.py b/examples/tensorflow/image-classification/run_image_classification.py index e87d6b2cacc2df..78655e7d6bc3d5 100644 --- a/examples/tensorflow/image-classification/run_image_classification.py +++ b/examples/tensorflow/image-classification/run_image_classification.py @@ -55,7 +55,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-classification/requirements.txt") diff --git a/examples/tensorflow/multiple-choice/run_swag.py b/examples/tensorflow/multiple-choice/run_swag.py index 389d633854eea1..cbd4400580d9d4 100644 --- a/examples/tensorflow/multiple-choice/run_swag.py +++ b/examples/tensorflow/multiple-choice/run_swag.py @@ -50,7 +50,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") logger = logging.getLogger(__name__) diff --git a/examples/tensorflow/question-answering/run_qa.py b/examples/tensorflow/question-answering/run_qa.py index 480330122b6f5a..f9c6de0e42bec2 100755 --- a/examples/tensorflow/question-answering/run_qa.py +++ b/examples/tensorflow/question-answering/run_qa.py @@ -62,7 +62,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") logger = logging.getLogger(__name__) diff --git a/examples/tensorflow/summarization/run_summarization.py b/examples/tensorflow/summarization/run_summarization.py index 428c34599031b0..92ebd0e1d77533 100644 --- a/examples/tensorflow/summarization/run_summarization.py +++ b/examples/tensorflow/summarization/run_summarization.py @@ -53,7 +53,7 @@ # region Checking dependencies # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt") diff --git a/examples/tensorflow/text-classification/run_glue.py b/examples/tensorflow/text-classification/run_glue.py index c051e27e504e6b..a51939d8d58801 100644 --- a/examples/tensorflow/text-classification/run_glue.py +++ b/examples/tensorflow/text-classification/run_glue.py @@ -47,7 +47,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") task_to_keys = { "cola": ("sentence", None), diff --git a/examples/tensorflow/translation/run_translation.py b/examples/tensorflow/translation/run_translation.py index bc37685b66da2b..50189345d56890 100644 --- a/examples/tensorflow/translation/run_translation.py +++ b/examples/tensorflow/translation/run_translation.py @@ -56,7 +56,7 @@ # region Dependencies and constants # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt") diff --git a/setup.py b/setup.py index f4028d13c44e91..cbfcfd43428524 100644 --- a/setup.py +++ b/setup.py @@ -435,7 +435,7 @@ def run(self): setup( name="transformers", - version="4.46.0.dev0", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots) + version="4.47.0.dev0", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots) author="The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)", author_email="transformers@huggingface.co", description="State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow", diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 771e3e8f0ae8b8..cc8b07395024a8 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -18,7 +18,7 @@ # to defer the actual importing for when the objects are requested. This way `import transformers` provides the names # in the namespace without actually importing anything (and especially none of the backends). -__version__ = "4.46.0.dev0" +__version__ = "4.47.0.dev0" from typing import TYPE_CHECKING From f0b3ef9e2e6a76bd22091502899091b47ce7e930 Mon Sep 17 00:00:00 2001 From: blueingman <15329507600@163.com> Date: Thu, 24 Oct 2024 17:47:58 +0800 Subject: [PATCH 092/385] translated gguf.md into chinese (#34163) * translated gguf.md into chinese * Apply suggestions from code review I have updated the PR accordingly.Thank you very much for detailed guidance,and I 'll pay more attention to the details next time. 
Co-authored-by: Isotr0py <2037008807@qq.com> * Apply suggestions from code review Co-authored-by: Isotr0py <2037008807@qq.com> --------- Co-authored-by: Isotr0py <2037008807@qq.com> --- docs/source/zh/_toctree.yml | 2 + docs/source/zh/gguf.md | 104 ++++++++++++++++++++++++++++++++++++ 2 files changed, 106 insertions(+) create mode 100644 docs/source/zh/gguf.md diff --git a/docs/source/zh/_toctree.yml b/docs/source/zh/_toctree.yml index fe966bdbfcf943..07c97e51550cb7 100644 --- a/docs/source/zh/_toctree.yml +++ b/docs/source/zh/_toctree.yml @@ -50,6 +50,8 @@ title: 导出为 TFLite - local: torchscript title: 导出为 TorchScript + - local: gguf + title: 与 GGUF 格式的互操作性 title: 开发者指南 - sections: - local: performance diff --git a/docs/source/zh/gguf.md b/docs/source/zh/gguf.md new file mode 100644 index 00000000000000..3da64a5d9956a1 --- /dev/null +++ b/docs/source/zh/gguf.md @@ -0,0 +1,104 @@ + + +# GGUF 和 Transformers 的交互 + +GGUF文件格式用于存储模型,以便通过[GGML](https://github.com/ggerganov/ggml)和其他依赖它的库进行推理,例如非常流行的[llama.cpp](https://github.com/ggerganov/llama.cpp)或[whisper.cpp](https://github.com/ggerganov/whisper.cpp)。 + +该文件格式[由抱抱脸支持](https://huggingface.co/docs/hub/en/gguf),可用于快速检查文件中张量和元数据。 + +该文件格式是一种“单文件格式”,通常单个文件就包含了配置属性、分词器词汇表和其他属性,同时还有模型中要加载的所有张量。这些文件根据文件的量化类型有不同的格式。我们在[这里](https://huggingface.co/docs/hub/en/gguf#quantization-types)进行了简要介绍。 + +## 在 Transformers 中的支持 + +我们在 transformers 中添加了加载 gguf 文件的功能,这样可以对 GGUF 模型进行进一步的训练或微调,然后再将模型转换回 GGUF 格式,以便在 ggml 生态系统中使用。加载模型时,我们首先将其反量化为 FP32,然后再加载权重以在 PyTorch 中使用。 + +> [!注意] +> 目前这个功能还处于探索阶段,欢迎大家贡献力量,以便在不同量化类型和模型架构之间更好地完善这一功能。 + +目前,支持的模型架构和量化类型如下: + +### 支持的量化类型 + +根据分享在 Hub 上的较为热门的量化文件,初步支持以下量化类型: + +- F32 +- F16 +- BF16 +- Q4_0 +- Q4_1 +- Q5_0 +- Q5_1 +- Q8_0 +- Q2_K +- Q3_K +- Q4_K +- Q5_K +- Q6_K +- IQ1_S +- IQ1_M +- IQ2_XXS +- IQ2_XS +- IQ2_S +- IQ3_XXS +- IQ3_S +- IQ4_XS +- IQ4_NL + +> [!注意] +> 为了支持 gguf 反量化,需要安装 `gguf>=0.10.0`。 + +### 支持的模型架构 + +目前支持以下在 Hub 上非常热门的模型架构: + +- LLaMa +- Mistral +- Qwen2 +- Qwen2Moe +- Phi3 +- Bloom +- Falcon +- StableLM +- GPT2 +- Starcoder2 + +## 使用示例 + +为了在`transformers`中加载`gguf`文件,你需要在 `from_pretrained`方法中为分词器和模型指定 `gguf_file`参数。下面是从同一个文件中加载分词器和模型的示例: + +```py +from transformers import AutoTokenizer, AutoModelForCausalLM + +model_id = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF" +filename = "tinyllama-1.1b-chat-v1.0.Q6_K.gguf" + +tokenizer = AutoTokenizer.from_pretrained(model_id, gguf_file=filename) +model = AutoModelForCausalLM.from_pretrained(model_id, gguf_file=filename) +``` + +现在,你就已经可以结合 PyTorch 生态系统中的一系列其他工具,来使用完整的、未量化的模型了。 + +为了将模型转换回`gguf`文件,我们建议使用`llama.cpp`中的[`convert-hf-to-gguf.py`文件](https://github.com/ggerganov/llama.cpp/blob/master/convert_hf_to_gguf.py)。 + +以下是如何补充上面的脚本,以保存模型并将其导出回 `gguf`的示例: + +```py +tokenizer.save_pretrained('directory') +model.save_pretrained('directory') + +!python ${path_to_llama_cpp}/convert-hf-to-gguf.py ${directory} +``` \ No newline at end of file From b29c24ff1ed130d717c59b58091cfedb652872d0 Mon Sep 17 00:00:00 2001 From: Raushan Turganbay Date: Thu, 24 Oct 2024 13:44:53 +0200 Subject: [PATCH 093/385] CI: fix failures (#34371) fix --- src/transformers/models/video_llava/modeling_video_llava.py | 1 + tests/models/instructblip/test_modeling_instructblip.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/video_llava/modeling_video_llava.py b/src/transformers/models/video_llava/modeling_video_llava.py index c4ec1b5196929a..0fe89676b92d63 100644 --- a/src/transformers/models/video_llava/modeling_video_llava.py +++ 
b/src/transformers/models/video_llava/modeling_video_llava.py @@ -561,6 +561,7 @@ def forward( ) video_features = None + num_frames = 0 if pixel_values_videos is not None: video_features, num_frames = self.get_video_features( pixel_values_videos=pixel_values_videos, vision_feature_layer=vision_feature_layer diff --git a/tests/models/instructblip/test_modeling_instructblip.py b/tests/models/instructblip/test_modeling_instructblip.py index 5182ac20cd993e..a33be021353f72 100644 --- a/tests/models/instructblip/test_modeling_instructblip.py +++ b/tests/models/instructblip/test_modeling_instructblip.py @@ -621,7 +621,7 @@ def test_inference_vicuna_7b(self): logits = model(**inputs).logits expected_slice = torch.tensor( - [[-3.3926, -12.2969, 8.4922], [-5.0195, -11.9531, 8.1406], [-4.0039, -13.3594, 9.2578]], + [[-3.3047, -12.0625, 8.4922], [-4.9258, -11.7578, 8.1406], [-3.9297, -13.5000, 9.2500]], device=torch_device, ) From 2112027d0cb8ae83ea9343176d77cb8a642c4556 Mon Sep 17 00:00:00 2001 From: Lysandre Debut Date: Thu, 24 Oct 2024 14:29:33 +0200 Subject: [PATCH 094/385] Zamba is an LM (#34342) * Zamba is an LM * Addition --- docs/source/en/_toctree.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index aa975fc9d9fe6b..a7806059afaa59 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -606,6 +606,8 @@ title: XLNet - local: model_doc/yoso title: YOSO + - local: model_doc/zamba + title: Zamba title: Text models - isExpanded: false sections: @@ -715,8 +717,6 @@ title: ViTMSN - local: model_doc/yolos title: YOLOS - - local: model_doc/zamba - title: Zamba - local: model_doc/zoedepth title: ZoeDepth title: Vision models @@ -973,4 +973,4 @@ - local: internal/time_series_utils title: Utilities for Time Series title: Internal Helpers - title: API \ No newline at end of file + title: API From 30c76d5b2836b9ba6b0e417aafb5ba77b4129ffe Mon Sep 17 00:00:00 2001 From: Thomas Furtner Date: Thu, 24 Oct 2024 14:42:47 +0200 Subject: [PATCH 095/385] add code generation to natural language processing section (#34333) --- docs/source/en/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/index.md b/docs/source/en/index.md index ce0ffc7db0512f..aaff45ab65dfb6 100644 --- a/docs/source/en/index.md +++ b/docs/source/en/index.md @@ -19,7 +19,7 @@ State-of-the-art Machine Learning for [PyTorch](https://pytorch.org/), [TensorFl 🤗 Transformers provides APIs and tools to easily download and train state-of-the-art pretrained models. Using pretrained models can reduce your compute costs, carbon footprint, and save you the time and resources required to train a model from scratch. These models support common tasks in different modalities, such as: -📝 **Natural Language Processing**: text classification, named entity recognition, question answering, language modeling, summarization, translation, multiple choice, and text generation.
+📝 **Natural Language Processing**: text classification, named entity recognition, question answering, language modeling, code generation, summarization, translation, multiple choice, and text generation.
🖼️ **Computer Vision**: image classification, object detection, and segmentation.
🗣️ **Audio**: automatic speech recognition and audio classification.
🐙 **Multimodal**: table question answering, optical character recognition, information extraction from scanned documents, video classification, and visual question answering. From dd267fca729621cec18b6199b31671ed9513a82c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B9=80=EC=A4=80=EC=9E=AC?= <55151385+junejae@users.noreply.github.com> Date: Thu, 24 Oct 2024 22:10:59 +0900 Subject: [PATCH 096/385] Add T5 GGUF loading support (#33389) * add: GGUFT5Converter * add: tensormapping for t5 * add: test code for t5 * fix: Remove whitespace from blank line * add: t5 fp16 tests * fix: whitespace formatting * fix: minor formatting * fix: testing every weights --- docs/source/en/gguf.md | 1 + src/transformers/integrations/ggml.py | 128 +++++++++++++++++- .../modeling_gguf_pytorch_utils.py | 17 ++- .../models/t5/tokenization_t5_fast.py | 2 +- tests/quantization/ggml/test_ggml.py | 56 +++++++- 5 files changed, 197 insertions(+), 7 deletions(-) diff --git a/docs/source/en/gguf.md b/docs/source/en/gguf.md index 01583cedbf4110..20531b990bc341 100644 --- a/docs/source/en/gguf.md +++ b/docs/source/en/gguf.md @@ -85,6 +85,7 @@ For now the supported model architectures are the architectures that have been v - StableLM - GPT2 - Starcoder2 +- T5 ## Example usage diff --git a/src/transformers/integrations/ggml.py b/src/transformers/integrations/ggml.py index 7b5828176ffcf4..4a2740fcb30e1c 100644 --- a/src/transformers/integrations/ggml.py +++ b/src/transformers/integrations/ggml.py @@ -21,11 +21,11 @@ from array import array import numpy as np -from tokenizers import Tokenizer, decoders, normalizers, pre_tokenizers -from tokenizers.models import BPE +from tokenizers import Tokenizer, decoders, normalizers, pre_tokenizers, processors +from tokenizers.models import BPE, Unigram from .. 
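Code generation, now listed among the NLP tasks above, runs through the same `pipeline("text-generation", ...)` entry point as the other causal-LM text tasks; with the pipeline defaults changed earlier in this series, the snippet should also pick up an available accelerator automatically and otherwise fall back to CPU. A small usage sketch, where the checkpoint name is only an example (any code-capable causal language model from the Hub works the same way, and weights are downloaded on first use):

```python
from transformers import pipeline

# "bigcode/starcoder2-3b" is just an example checkpoint; substitute any
# code-capable causal language model available on the Hugging Face Hub.
generator = pipeline("text-generation", model="bigcode/starcoder2-3b")

prompt = "def fibonacci(n):"
completion = generator(prompt, max_new_tokens=40, do_sample=False)
print(completion[0]["generated_text"])
```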
import AddedToken -from ..convert_slow_tokenizer import GPT2Converter, LlamaConverter, Qwen2Converter +from ..convert_slow_tokenizer import GPT2Converter, LlamaConverter, Qwen2Converter, T5Converter from ..utils import logging from ..utils.logging import tqdm @@ -148,6 +148,51 @@ ".output.": ".lm_head.", "output_norm": "ln_f", }, + "t5": { + "token_embd": "shared", + "dec.blk.{bid}.attn_q": "decoder.block.{bid}.layer.0.SelfAttention.q", + "dec.blk.{bid}.attn_k": "decoder.block.{bid}.layer.0.SelfAttention.k", + "dec.blk.{bid}.attn_v": "decoder.block.{bid}.layer.0.SelfAttention.v", + "dec.blk.{bid}.attn_o": "decoder.block.{bid}.layer.0.SelfAttention.o", + "dec.blk.{bid}.attn_rel_b": "decoder.block.{bid}.layer.0.SelfAttention.relative_attention_bias", + "dec.blk.{bid}.attn_norm": "decoder.block.{bid}.layer.0.layer_norm", + "dec.blk.{bid}.cross_attn_q": "decoder.block.{bid}.layer.1.EncDecAttention.q", + "dec.blk.{bid}.cross_attn_k": "decoder.block.{bid}.layer.1.EncDecAttention.k", + "dec.blk.{bid}.cross_attn_v": "decoder.block.{bid}.layer.1.EncDecAttention.v", + "dec.blk.{bid}.cross_attn_o": "decoder.block.{bid}.layer.1.EncDecAttention.o", + "dec.blk.{bid}.cross_attn_norm": "decoder.block.{bid}.layer.1.layer_norm", + "dec.blk.{bid}.ffn_gate": "decoder.block.{bid}.layer.2.DenseReluDense.wi_0", + "dec.blk.{bid}.ffn_up": "decoder.block.{bid}.layer.2.DenseReluDense.wi_1", + "dec.blk.{bid}.ffn_down": "decoder.block.{bid}.layer.2.DenseReluDense.wo", + "dec.blk.{bid}.ffn_norm": "decoder.block.{bid}.layer.2.layer_norm", + "dec.output_norm": "decoder.final_layer_norm", + "enc.blk.{bid}.attn_q": "encoder.block.{bid}.layer.0.SelfAttention.q", + "enc.blk.{bid}.attn_k": "encoder.block.{bid}.layer.0.SelfAttention.k", + "enc.blk.{bid}.attn_v": "encoder.block.{bid}.layer.0.SelfAttention.v", + "enc.blk.{bid}.attn_o": "encoder.block.{bid}.layer.0.SelfAttention.o", + "enc.blk.{bid}.attn_rel_b": "encoder.block.{bid}.layer.0.SelfAttention.relative_attention_bias", + "enc.blk.{bid}.attn_norm": "encoder.block.{bid}.layer.0.layer_norm", + "enc.blk.{bid}.ffn_gate": "encoder.block.{bid}.layer.1.DenseReluDense.wi_0", + "enc.blk.{bid}.ffn_up": "encoder.block.{bid}.layer.1.DenseReluDense.wi_1", + "enc.blk.{bid}.ffn_down": "encoder.block.{bid}.layer.1.DenseReluDense.wo", + "enc.blk.{bid}.ffn_norm": "encoder.block.{bid}.layer.1.layer_norm", + "enc.output_norm": "encoder.final_layer_norm", + "output.weight": "lm_head.weight", + }, + "t5encoder": { + "token_embd": "shared", + "enc.blk.{bid}.attn_q": "encoder.block.{bid}.layer.0.SelfAttention.q", + "enc.blk.{bid}.attn_k": "encoder.block.{bid}.layer.0.SelfAttention.k", + "enc.blk.{bid}.attn_v": "encoder.block.{bid}.layer.0.SelfAttention.v", + "enc.blk.{bid}.attn_o": "encoder.block.{bid}.layer.0.SelfAttention.o", + "enc.blk.{bid}.attn_rel_b": "encoder.block.{bid}.layer.0.SelfAttention.relative_attention_bias", + "enc.blk.{bid}.attn_norm": "encoder.block.{bid}.layer.0.layer_norm", + "enc.blk.{bid}.ffn_gate": "encoder.block.{bid}.layer.1.DenseReluDense.wi_0", + "enc.blk.{bid}.ffn_up": "encoder.block.{bid}.layer.1.DenseReluDense.wi_1", + "enc.blk.{bid}.ffn_down": "encoder.block.{bid}.layer.1.DenseReluDense.wo", + "enc.blk.{bid}.ffn_norm": "encoder.block.{bid}.layer.1.layer_norm", + "enc.output_norm": "encoder.final_layer_norm", + }, "stablelm": { "token_embd": "model.embed_tokens", "blk": "model.layers", @@ -287,6 +332,19 @@ "vocab_size": "vocab_size", "attention.layer_norm_epsilon": "layer_norm_epsilon", }, + "t5": { + "context_length": "n_positions", + "block_count": 
"num_layers", + "feed_forward_length": "d_ff", + "embedding_length": "d_model", + "attention.key_length": "d_kv", + "attention.head_count": "num_heads", + "attention.head_count_kv": "num_key_value_heads", + "attention.layer_norm_epsilon": "layer_norm_epsilon", + "attention.relative_buckets_count": "relative_attention_num_buckets", + "decoder_start_token_id": "decoder_start_token_id", + "vocab_size": "vocab_size", + }, "stablelm": { "context_length": "max_position_embeddings", "block_count": "num_hidden_layers", @@ -636,6 +694,69 @@ def converted(self) -> Tokenizer: return tokenizer +class GGUFT5Converter(T5Converter): + def __init__(self, tokenizer_dict): + # set dummy data to avoid unnecessary merges calculation + tokenizer_dict["merges"] = ["dummy text"] + + self.proto = GGUFTokenizerSkeleton(tokenizer_dict) + self.token2id = {k: v for v, k in enumerate(self.proto.tokens)} + self.original_tokenizer = self.proto + self.additional_kwargs = {} + + def vocab(self, proto): + return list(zip(proto.tokens, proto.scores)) + + def normalizer(self, proto): + if getattr(self.original_tokenizer, "legacy", True): + sequence = [] + if getattr(self.original_tokenizer, "add_prefix_space", True): + sequence += [normalizers.Prepend(prepend="▁")] + sequence += [normalizers.Replace(pattern=" ", content="▁")] + return normalizers.Sequence(sequence) + return None # non-legacy, no normalizer + + def post_processor(self): + return processors.TemplateProcessing( + single=["$A", ""], + pair=["$A", "", "$B", ""], + special_tokens=[ + ("", self.token2id[""]), + ], + ) + + def converted(self) -> Tokenizer: + vocab_scores = self.vocab(self.proto) + tokenizer = Tokenizer( + Unigram( + vocab_scores, + unk_id=self.proto.unk_token_id, + byte_fallback=False, + ) + ) + + # Tokenizer assemble + normalizer = self.normalizer(self.proto) + if normalizer is not None: + tokenizer.normalizer = normalizer + + replacement = "▁" + add_prefix_space = True + if hasattr(self.original_tokenizer, "add_prefix_space"): + add_prefix_space = self.original_tokenizer.add_prefix_space + + pre_tokenizer = self.pre_tokenizer(replacement, add_prefix_space) + if pre_tokenizer is not None: + tokenizer.pre_tokenizer = pre_tokenizer + + tokenizer.decoder = self.decoder(replacement, add_prefix_space) + post_processor = self.post_processor() + if post_processor: + tokenizer.post_processor = post_processor + + return tokenizer + + GGUF_TO_FAST_CONVERTERS = { "llama": GGUFLlamaConverter, "qwen2": GGUFQwen2Converter, @@ -646,6 +767,7 @@ def converted(self) -> Tokenizer: "stablelm": GGUFGPTConverter, "gpt2": GGUFGPTConverter, "starcoder2": GGUFGPTConverter, + "t5": GGUFT5Converter, } diff --git a/src/transformers/modeling_gguf_pytorch_utils.py b/src/transformers/modeling_gguf_pytorch_utils.py index b1d7b896085476..171b2f4d15b122 100644 --- a/src/transformers/modeling_gguf_pytorch_utils.py +++ b/src/transformers/modeling_gguf_pytorch_utils.py @@ -94,6 +94,12 @@ def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False): # to add this patch to ensure things work correctly on our side. if "llama" in architecture and "mistral" in model_name: updated_architecture = "mistral" + # FIXME: Currnetly this implementation is only for flan-t5 architecture. + # It needs to be developed for supporting legacy t5. 
+ elif "t5" in architecture or "t5encoder" in architecture: + parsed_parameters["config"]["tie_word_embeddings"] = False + parsed_parameters["config"]["is_gated_act"] = True + updated_architecture = "t5" else: updated_architecture = architecture @@ -191,6 +197,13 @@ def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False): else: weights = reverse_reshape_bias(weights, num_heads, n_embed) + bid = None + if architecture in ("t5", "t5encoder"): + for chunk in name.split("."): + if chunk.isdigit(): + bid = int(chunk) + break + if architecture == "gpt2": if ( "attn_qkv.weight" in name @@ -209,8 +222,8 @@ def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False): continue for tensor_name in tensor_key_mapping: - if tensor_name in name: - name = name.replace(tensor_name, tensor_key_mapping[tensor_name]) + if tensor_name.format(bid=bid) in name: + name = name.replace(tensor_name.format(bid=bid), tensor_key_mapping[tensor_name].format(bid=bid)) # Use copy to avoid errors with numpy and pytorch parsed_parameters["tensors"][name] = torch.from_numpy(np.copy(weights)) diff --git a/src/transformers/models/t5/tokenization_t5_fast.py b/src/transformers/models/t5/tokenization_t5_fast.py index 0a92803f165846..4c3fa950559637 100644 --- a/src/transformers/models/t5/tokenization_t5_fast.py +++ b/src/transformers/models/t5/tokenization_t5_fast.py @@ -117,7 +117,7 @@ def __init__( kwargs["from_slow"] = True super().__init__( - vocab_file, + vocab_file=vocab_file, tokenizer_file=tokenizer_file, eos_token=eos_token, unk_token=unk_token, diff --git a/tests/quantization/ggml/test_ggml.py b/tests/quantization/ggml/test_ggml.py index 6e47d46f07c47e..ddc791e96a6489 100644 --- a/tests/quantization/ggml/test_ggml.py +++ b/tests/quantization/ggml/test_ggml.py @@ -15,7 +15,7 @@ import tempfile import unittest -from transformers import AddedToken, AutoModelForCausalLM, AutoTokenizer +from transformers import AddedToken, AutoModelForCausalLM, AutoModelForSeq2SeqLM, AutoTokenizer from transformers.testing_utils import ( require_gguf, require_torch_gpu, @@ -48,6 +48,8 @@ class GgufIntegrationTests(unittest.TestCase): falcon7b_model_id = "xaviviro/falcon-7b-quantized-gguf" falcon40b_model_id = "maddes8cht/tiiuae-falcon-40b-gguf" original_flacon7b_model_id = "tiiuae/falcon-7b" + t5_model_id = "repetitio/flan-t5-small" + original_t5_model_id = "google/flan-t5-small" stablelm_model_id = "afrideva/stablelm-3b-4e1t-GGUF" stablelm2_model_id = "afrideva/stablelm-2-1_6b-GGUF" original_stablelm2_model_id = "stabilityai/stablelm-2-1_6b" @@ -92,6 +94,8 @@ class GgufIntegrationTests(unittest.TestCase): q2_k_falcon7b_model_id = "falcon-7b-q2_k.gguf" fp16_falcon7b_model_id = "falcon-7b-fp16.gguf" q2_k_falcon40b_model_id = "tiiuae-falcon-40b-Q2_K.gguf" + fp16_t5_model_id = "flan-t5-small-f16.gguf" + q8_0_t5_model_id = "flan-t5-small-q8_0.gguf" fp16_qwen2moe_model_id = "Qwen1.5-MoE-A2.7B.gguf" fp16_gpt2_model_id = "gpt2.f16.gguf" q8_gpt2_model_id = "gpt2.Q8_0.gguf" @@ -487,6 +491,56 @@ def test_bloom_weights_conversion_fp16(self): self.assertTrue(quantized_param.shape == original_param.shape) torch.testing.assert_close(quantized_param, original_param) + def test_t5_f16(self): + tokenizer = AutoTokenizer.from_pretrained(self.t5_model_id, gguf_file=self.fp16_t5_model_id) + model = AutoModelForSeq2SeqLM.from_pretrained( + self.t5_model_id, gguf_file=self.fp16_t5_model_id, device_map="auto", torch_dtype=torch.float16 + ) + + T5_EXAMPLE_TEXT = "translate English to German: How old are you?" 
+ + text = tokenizer(T5_EXAMPLE_TEXT, return_tensors="pt").to(torch_device) + out = model.generate(**text, max_new_tokens=10) + + EXPECTED_TEXT = "Wie ich er?" + self.assertEqual(tokenizer.decode(out[0], skip_special_tokens=True), EXPECTED_TEXT) + + def test_t5_q8_0(self): + tokenizer = AutoTokenizer.from_pretrained(self.t5_model_id, gguf_file=self.q8_0_t5_model_id) + model = AutoModelForSeq2SeqLM.from_pretrained( + self.t5_model_id, gguf_file=self.q8_0_t5_model_id, device_map="auto", torch_dtype=torch.float16 + ) + + T5_EXAMPLE_TEXT = "translate English to German: How old are you?" + + text = tokenizer(T5_EXAMPLE_TEXT, return_tensors="pt").to(torch_device) + out = model.generate(**text, max_new_tokens=10) + + EXPECTED_TEXT = "Wie ich er?" + self.assertEqual(tokenizer.decode(out[0], skip_special_tokens=True), EXPECTED_TEXT) + + def test_t5_weights_conversion_fp16(self): + quantized_model = AutoModelForSeq2SeqLM.from_pretrained( + self.t5_model_id, + gguf_file=self.fp16_t5_model_id, + device_map="auto", + torch_dtype=torch.float16, + ) + original_model = AutoModelForSeq2SeqLM.from_pretrained( + self.original_t5_model_id, + device_map="auto", + torch_dtype=torch.float16, + ) + + quantized_state_dict = quantized_model.state_dict() + original_state_dict = original_model.state_dict() + + for (quantized_name, quantized_param), (original_name, original_param) in zip( + quantized_state_dict.items(), original_state_dict.items() + ): + self.assertTrue(quantized_param.shape == original_param.shape) + torch.testing.assert_close(quantized_param, original_param, rtol=5e-04, atol=5e-04) + def test_gpt2_q8(self): tokenizer = AutoTokenizer.from_pretrained(self.gpt2_model_id, gguf_file=self.q8_gpt2_model_id) model = AutoModelForCausalLM.from_pretrained( From 6432ad8bb5dec9c7ece1041767c9e208ff6b4cbb Mon Sep 17 00:00:00 2001 From: Yoni Gozlan <74535834+yonigozlan@users.noreply.github.com> Date: Thu, 24 Oct 2024 09:22:50 -0400 Subject: [PATCH 097/385] Fix pil_torch_interpolation_mapping import in image_processing_detr_fast (#34375) fix pil_torch_interpolation_mapping import --- src/transformers/models/detr/image_processing_detr_fast.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/detr/image_processing_detr_fast.py b/src/transformers/models/detr/image_processing_detr_fast.py index 97940ab3132dda..0fa1d0ffd9dba9 100644 --- a/src/transformers/models/detr/image_processing_detr_fast.py +++ b/src/transformers/models/detr/image_processing_detr_fast.py @@ -40,7 +40,6 @@ get_image_type, infer_channel_dimension_format, make_list_of_images, - pil_torch_interpolation_mapping, validate_annotations, validate_kwargs, ) @@ -72,7 +71,8 @@ if is_torchvision_available(): from torchvision.io import read_image - from ...image_utils import pil_torch_interpolation_mapping + if is_vision_available(): + from ...image_utils import pil_torch_interpolation_mapping if is_torchvision_v2_available(): from torchvision.transforms.v2 import functional as F From 450b9cbfacc5b5aaf18ecc25217ab80b6fc8cf99 Mon Sep 17 00:00:00 2001 From: Vijay Date: Thu, 24 Oct 2024 20:58:51 +0530 Subject: [PATCH 098/385] Add code sample docstrings and checkpoint reference for GLM models (#34360) * Add code sample docstrings and checkpoint reference for GLM models * Update modular_glm.py * Update modeling_glm.py --- src/transformers/models/glm/modeling_glm.py | 3 +++ src/transformers/models/glm/modular_glm.py | 2 ++ 2 files changed, 5 insertions(+) diff --git a/src/transformers/models/glm/modeling_glm.py 
b/src/transformers/models/glm/modeling_glm.py index 6354e20e33fe8c..1e7c6eae0ee719 100644 --- a/src/transformers/models/glm/modeling_glm.py +++ b/src/transformers/models/glm/modeling_glm.py @@ -59,6 +59,9 @@ _CHECKPOINT_FOR_DOC = "dummy" +_CHECKPOINT_FOR_DOC = "THUDM/glm-4-9b" + + class GlmRMSNorm(nn.Module): def __init__(self, hidden_size, eps=1e-6): """ diff --git a/src/transformers/models/glm/modular_glm.py b/src/transformers/models/glm/modular_glm.py index c26477fdc173b1..9cfd617eeb2353 100644 --- a/src/transformers/models/glm/modular_glm.py +++ b/src/transformers/models/glm/modular_glm.py @@ -44,6 +44,8 @@ from .configuration_glm import GlmConfig +_CHECKPOINT_FOR_DOC = "THUDM/glm-4-9b" + logger = logging.get_logger(__name__) _CHECKPOINT_FOR_DOC = "dummy" From e2886166065db25029afb58c699d6272baf22965 Mon Sep 17 00:00:00 2001 From: "Winston H." <56998716+winstxnhdw@users.noreply.github.com> Date: Thu, 24 Oct 2024 16:40:26 +0100 Subject: [PATCH 099/385] refactor: remove redundant if-condition and improve type correctness for `convert_tokens_to_ids` (#34030) * chore: remove redundant if-condition * fix: import `Iterable` --- src/transformers/tokenization_utils_fast.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/src/transformers/tokenization_utils_fast.py b/src/transformers/tokenization_utils_fast.py index cec91e038dd054..fabc1a1d5ed81c 100644 --- a/src/transformers/tokenization_utils_fast.py +++ b/src/transformers/tokenization_utils_fast.py @@ -21,7 +21,7 @@ import json import os from collections import defaultdict -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import Any, Dict, Iterable, List, Optional, Tuple, Union import tokenizers.pre_tokenizers as pre_tokenizers_fast from tokenizers import Encoding as EncodingFast @@ -326,20 +326,17 @@ def _convert_encoding( return encoding_dict, encodings - def convert_tokens_to_ids(self, tokens: Union[str, List[str]]) -> Union[int, List[int]]: + def convert_tokens_to_ids(self, tokens: Union[str, Iterable[str]]) -> Union[int, List[int]]: """ - Converts a token string (or a sequence of tokens) in a single integer id (or a sequence of ids), using the + Converts a token string (or a sequence of tokens) in a single integer id (or a Iterable of ids), using the vocabulary. Args: - tokens (`str` or `List[str]`): One or several token(s) to convert to token id(s). + tokens (`str` or `Iterable[str]`): One or several token(s) to convert to token id(s). Returns: `int` or `List[int]`: The token id or list of token ids. 
""" - if tokens is None: - return None - if isinstance(tokens, str): return self._convert_token_to_id_with_added_voc(tokens) From fe3507331998e7154a206055b34e3ba338290d3d Mon Sep 17 00:00:00 2001 From: Yoni Gozlan <74535834+yonigozlan@users.noreply.github.com> Date: Thu, 24 Oct 2024 11:46:39 -0400 Subject: [PATCH 100/385] Ignore unsupported kwarg in ProcessorMixin call (#34285) Fix accept any common kwargs --- src/transformers/processing_utils.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index cb2327e5c46b0d..b363f8c72cc481 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -874,7 +874,11 @@ class MyProcessingKwargs(ProcessingKwargs, CommonKwargs, TextKwargs, ImagesKwarg else: # kwargs is a flat dictionary for key in kwargs: - if key not in used_keys: + if key not in ModelProcessorKwargs.__annotations__["common_kwargs"].__annotations__.keys(): + logger.warning_once( + f"Keyword argument `{key}` is not a valid argument for this processor and will be ignored." + ) + elif key not in used_keys: output_kwargs["common_kwargs"][key] = kwargs[key] # all modality-specific kwargs are updated with common kwargs From d9989e0b9a5633db923f12e61cb8b6e72cf71a7c Mon Sep 17 00:00:00 2001 From: Benjamin Bossan Date: Thu, 24 Oct 2024 17:56:40 +0200 Subject: [PATCH 101/385] [PEFT] Add warning for missing key in LoRA adapter (#34068) When loading a LoRA adapter, so far, there was only a warning when there were unexpected keys in the checkpoint. Now, there is also a warning when there are missing keys. This change is consistent with https://github.com/huggingface/peft/pull/2118 in PEFT and the planned PR https://github.com/huggingface/diffusers/pull/9622 in diffusers. Apart from this change, the error message for unexpected keys was slightly altered for consistency (it should be more readable now). Also, besides adding a test for the missing keys warning, a test for unexpected keys warning was also added, as it was missing so far. --- src/transformers/integrations/peft.py | 24 +++++- .../peft_integration/test_peft_integration.py | 78 ++++++++++++++++++- 2 files changed, 96 insertions(+), 6 deletions(-) diff --git a/src/transformers/integrations/peft.py b/src/transformers/integrations/peft.py index bd0ca16f865f4c..8afff36eb08625 100644 --- a/src/transformers/integrations/peft.py +++ b/src/transformers/integrations/peft.py @@ -235,13 +235,29 @@ def load_adapter( ) if incompatible_keys is not None: - # check only for unexpected keys + err_msg = "" + origin_name = peft_model_id if peft_model_id is not None else "state_dict" + # Check for unexpected keys. if hasattr(incompatible_keys, "unexpected_keys") and len(incompatible_keys.unexpected_keys) > 0: - logger.warning( - f"Loading adapter weights from {peft_model_id} led to unexpected keys not found in the model: " - f" {incompatible_keys.unexpected_keys}. " + err_msg = ( + f"Loading adapter weights from {origin_name} led to unexpected keys not found in the model: " + f"{', '.join(incompatible_keys.unexpected_keys)}. " ) + # Check for missing keys. + missing_keys = getattr(incompatible_keys, "missing_keys", None) + if missing_keys: + # Filter missing keys specific to the current adapter, as missing base model keys are expected. 
+ lora_missing_keys = [k for k in missing_keys if "lora_" in k and adapter_name in k] + if lora_missing_keys: + err_msg += ( + f"Loading adapter weights from {origin_name} led to missing keys in the model: " + f"{', '.join(lora_missing_keys)}" + ) + + if err_msg: + logger.warning(err_msg) + # Re-dispatch model and hooks in case the model is offloaded to CPU / Disk. if ( (getattr(self, "hf_device_map", None) is not None) diff --git a/tests/peft_integration/test_peft_integration.py b/tests/peft_integration/test_peft_integration.py index a80919dc61cf3f..aebf2b295267c4 100644 --- a/tests/peft_integration/test_peft_integration.py +++ b/tests/peft_integration/test_peft_integration.py @@ -20,8 +20,9 @@ from huggingface_hub import hf_hub_download from packaging import version -from transformers import AutoModelForCausalLM, OPTForCausalLM +from transformers import AutoModelForCausalLM, OPTForCausalLM, logging from transformers.testing_utils import ( + CaptureLogger, require_bitsandbytes, require_peft, require_torch, @@ -72,9 +73,15 @@ def test_peft_from_pretrained(self): This checks if we pass a remote folder that contains an adapter config and adapter weights, it should correctly load a model that has adapters injected on it. """ + logger = logging.get_logger("transformers.integrations.peft") + for model_id in self.peft_test_model_ids: for transformers_class in self.transformers_test_model_classes: - peft_model = transformers_class.from_pretrained(model_id).to(torch_device) + with CaptureLogger(logger) as cl: + peft_model = transformers_class.from_pretrained(model_id).to(torch_device) + # ensure that under normal circumstances, there are no warnings about keys + self.assertNotIn("unexpected keys", cl.out) + self.assertNotIn("missing keys", cl.out) self.assertTrue(self._check_lora_correctly_converted(peft_model)) self.assertTrue(peft_model._hf_peft_config_loaded) @@ -548,3 +555,70 @@ def test_peft_from_pretrained_hub_kwargs(self): model = OPTForCausalLM.from_pretrained(peft_model_id, adapter_kwargs=adapter_kwargs) self.assertTrue(self._check_lora_correctly_converted(model)) + + def test_peft_from_pretrained_unexpected_keys_warning(self): + """ + Test for warning when loading a PEFT checkpoint with unexpected keys. + """ + from peft import LoraConfig + + logger = logging.get_logger("transformers.integrations.peft") + + for model_id, peft_model_id in zip(self.transformers_test_model_ids, self.peft_test_model_ids): + for transformers_class in self.transformers_test_model_classes: + model = transformers_class.from_pretrained(model_id).to(torch_device) + + peft_config = LoraConfig() + state_dict_path = hf_hub_download(peft_model_id, "adapter_model.bin") + dummy_state_dict = torch.load(state_dict_path) + + # add unexpected key + dummy_state_dict["foobar"] = next(iter(dummy_state_dict.values())) + + with CaptureLogger(logger) as cl: + model.load_adapter( + adapter_state_dict=dummy_state_dict, peft_config=peft_config, low_cpu_mem_usage=False + ) + + msg = "Loading adapter weights from state_dict led to unexpected keys not found in the model: foobar" + self.assertIn(msg, cl.out) + + def test_peft_from_pretrained_missing_keys_warning(self): + """ + Test for warning when loading a PEFT checkpoint with missing keys. 
+ """ + from peft import LoraConfig + + logger = logging.get_logger("transformers.integrations.peft") + + for model_id, peft_model_id in zip(self.transformers_test_model_ids, self.peft_test_model_ids): + for transformers_class in self.transformers_test_model_classes: + model = transformers_class.from_pretrained(model_id).to(torch_device) + + peft_config = LoraConfig() + state_dict_path = hf_hub_download(peft_model_id, "adapter_model.bin") + dummy_state_dict = torch.load(state_dict_path) + + # remove a key so that we have missing keys + key = next(iter(dummy_state_dict.keys())) + del dummy_state_dict[key] + + with CaptureLogger(logger) as cl: + model.load_adapter( + adapter_state_dict=dummy_state_dict, + peft_config=peft_config, + low_cpu_mem_usage=False, + adapter_name="other", + ) + + # Here we need to adjust the key name a bit to account for PEFT-specific naming. + # 1. Remove PEFT-specific prefix + # If merged after dropping Python 3.8, we can use: key = key.removeprefix(peft_prefix) + peft_prefix = "base_model.model." + key = key[len(peft_prefix) :] + # 2. Insert adapter name + prefix, _, suffix = key.rpartition(".") + key = f"{prefix}.other.{suffix}" + + msg = f"Loading adapter weights from state_dict led to missing keys in the model: {key}" + self.assertIn(msg, cl.out) From 1c5918d9106ba530c700f08ad7847b09a8b68457 Mon Sep 17 00:00:00 2001 From: Michael Benayoun Date: Thu, 24 Oct 2024 18:34:28 +0200 Subject: [PATCH 102/385] Fix `torch.fx` issue related to the new `loss_kwargs` keyword argument (#34380) * Fix FX * Unskip tests --- src/transformers/utils/fx.py | 2 +- tests/models/cohere/test_modeling_cohere.py | 1 - tests/models/mistral/test_modeling_mistral.py | 1 - tests/models/mixtral/test_modeling_mixtral.py | 1 - tests/models/qwen2/test_modeling_qwen2.py | 1 - tests/models/qwen2_moe/test_modeling_qwen2_moe.py | 1 - 6 files changed, 1 insertion(+), 6 deletions(-) diff --git a/src/transformers/utils/fx.py b/src/transformers/utils/fx.py index c78b4c34c331f0..3764f1ee4cef76 100755 --- a/src/transformers/utils/fx.py +++ b/src/transformers/utils/fx.py @@ -1416,7 +1416,7 @@ def keys(self, obj: "Proxy") -> Any: your custom tracer. 
""" attribute = HFAttribute(obj, "keys")() - if obj.node.target == "**kwargs": + if obj.node.target.startswith("**"): return attribute._metadata return attribute diff --git a/tests/models/cohere/test_modeling_cohere.py b/tests/models/cohere/test_modeling_cohere.py index b8a5aec9d4153a..3a05867dfdfc8c 100644 --- a/tests/models/cohere/test_modeling_cohere.py +++ b/tests/models/cohere/test_modeling_cohere.py @@ -304,7 +304,6 @@ def test_model_various_embeddings(self): config_and_inputs[0].position_embedding_type = type self.model_tester.create_and_check_model(*config_and_inputs) - @unittest.skip(reason="PR #34283 made changes to the forward function.") def test_torch_fx_output_loss(self): super().test_torch_fx_output_loss() diff --git a/tests/models/mistral/test_modeling_mistral.py b/tests/models/mistral/test_modeling_mistral.py index 13e5e3d1f609e9..600c4ffa14b0d0 100644 --- a/tests/models/mistral/test_modeling_mistral.py +++ b/tests/models/mistral/test_modeling_mistral.py @@ -356,7 +356,6 @@ def test_model_various_embeddings(self): config_and_inputs[0].position_embedding_type = type self.model_tester.create_and_check_model(*config_and_inputs) - @unittest.skip(reason="PR #34283 made changes to the forward function.") def test_torch_fx_output_loss(self): super().test_torch_fx_output_loss() diff --git a/tests/models/mixtral/test_modeling_mixtral.py b/tests/models/mixtral/test_modeling_mixtral.py index 0bfb5126ebd1ca..0688435e81423c 100644 --- a/tests/models/mixtral/test_modeling_mixtral.py +++ b/tests/models/mixtral/test_modeling_mixtral.py @@ -356,7 +356,6 @@ def test_model_various_embeddings(self): config_and_inputs[0].position_embedding_type = type self.model_tester.create_and_check_model(*config_and_inputs) - @unittest.skip(reason="PR #34283 made changes to the forward function.") def test_torch_fx_output_loss(self): super().test_torch_fx_output_loss() diff --git a/tests/models/qwen2/test_modeling_qwen2.py b/tests/models/qwen2/test_modeling_qwen2.py index 769d6caabd92f4..301937079ae694 100644 --- a/tests/models/qwen2/test_modeling_qwen2.py +++ b/tests/models/qwen2/test_modeling_qwen2.py @@ -368,7 +368,6 @@ def test_model_various_embeddings(self): config_and_inputs[0].position_embedding_type = type self.model_tester.create_and_check_model(*config_and_inputs) - @unittest.skip(reason="PR #34283 made changes to the forward function.") def test_torch_fx_output_loss(self): super().test_torch_fx_output_loss() diff --git a/tests/models/qwen2_moe/test_modeling_qwen2_moe.py b/tests/models/qwen2_moe/test_modeling_qwen2_moe.py index 374d9472ca2793..30d7996d7e7b09 100644 --- a/tests/models/qwen2_moe/test_modeling_qwen2_moe.py +++ b/tests/models/qwen2_moe/test_modeling_qwen2_moe.py @@ -391,7 +391,6 @@ def test_model_various_embeddings(self): config_and_inputs[0].position_embedding_type = type self.model_tester.create_and_check_model(*config_and_inputs) - @unittest.skip(reason="PR #34283 made changes to the forward function.") def test_torch_fx_output_loss(self): super().test_torch_fx_output_loss() From 4c6e0c92527f54c51fc20c1781ab42aeb946f25e Mon Sep 17 00:00:00 2001 From: Cyril Vallez Date: Thu, 24 Oct 2024 18:42:03 +0200 Subject: [PATCH 103/385] Correct the new defaults (#34377) * Correct the new defaults * CIs * add check * Update utils.py * Update utils.py * Add the max_length in generate test checking shape without passing length * style * CIs * fix fx CI issue --- src/transformers/generation/utils.py | 5 ++++- .../encoder_decoder/test_modeling_encoder_decoder.py | 4 +++- 
.../test_modeling_speech_encoder_decoder.py | 4 +++- .../test_modeling_vision_encoder_decoder.py | 7 ++++++- 4 files changed, 16 insertions(+), 4 deletions(-) diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py index 3938457155d83f..efe953db051cb3 100644 --- a/src/transformers/generation/utils.py +++ b/src/transformers/generation/utils.py @@ -1440,8 +1440,11 @@ def _prepare_generated_length( and not self.config.is_encoder_decoder ): generation_config.max_length -= inputs_tensor.shape[1] - else: # by default let's always generate 10 new tokens + elif has_default_max_length: # by default let's always generate 20 new tokens generation_config.max_length = generation_config.max_length + input_ids_length + max_position_embeddings = getattr(self.config, "max_position_embeddings", None) + if max_position_embeddings is not None: + generation_config.max_length = min(generation_config.max_length, max_position_embeddings) # same for min length if generation_config.min_new_tokens is not None: diff --git a/tests/models/encoder_decoder/test_modeling_encoder_decoder.py b/tests/models/encoder_decoder/test_modeling_encoder_decoder.py index 0ee4b75ed803e3..64ebedcb45984b 100644 --- a/tests/models/encoder_decoder/test_modeling_encoder_decoder.py +++ b/tests/models/encoder_decoder/test_modeling_encoder_decoder.py @@ -488,7 +488,9 @@ def check_encoder_decoder_model_generate(self, input_ids, config, decoder_config # Bert does not have a bos token id, so use pad_token_id instead generated_output = enc_dec_model.generate( - input_ids, decoder_start_token_id=enc_dec_model.config.decoder.pad_token_id + input_ids, + decoder_start_token_id=enc_dec_model.config.decoder.pad_token_id, + max_length=decoder_config.max_length, ) self.assertEqual(generated_output.shape, (input_ids.shape[0],) + (decoder_config.max_length,)) diff --git a/tests/models/speech_encoder_decoder/test_modeling_speech_encoder_decoder.py b/tests/models/speech_encoder_decoder/test_modeling_speech_encoder_decoder.py index 6e0b7fa9782fbc..7dcb7c406ae287 100644 --- a/tests/models/speech_encoder_decoder/test_modeling_speech_encoder_decoder.py +++ b/tests/models/speech_encoder_decoder/test_modeling_speech_encoder_decoder.py @@ -362,7 +362,9 @@ def check_encoder_decoder_model_generate( # Bert does not have a bos token id, so use pad_token_id instead generated_output = enc_dec_model.generate( - inputs, decoder_start_token_id=enc_dec_model.config.decoder.pad_token_id + inputs, + decoder_start_token_id=enc_dec_model.config.decoder.pad_token_id, + max_length=decoder_config.max_length, ) self.assertEqual(generated_output.shape, (inputs.shape[0],) + (decoder_config.max_length,)) diff --git a/tests/models/vision_encoder_decoder/test_modeling_vision_encoder_decoder.py b/tests/models/vision_encoder_decoder/test_modeling_vision_encoder_decoder.py index 7def8a9ac96507..77e2a19fea4861 100644 --- a/tests/models/vision_encoder_decoder/test_modeling_vision_encoder_decoder.py +++ b/tests/models/vision_encoder_decoder/test_modeling_vision_encoder_decoder.py @@ -306,7 +306,9 @@ def check_encoder_decoder_model_generate(self, config, decoder_config, pixel_val # Bert does not have a bos token id, so use pad_token_id instead generated_output = enc_dec_model.generate( - inputs, decoder_start_token_id=enc_dec_model.config.decoder.pad_token_id + inputs, + decoder_start_token_id=enc_dec_model.config.decoder.pad_token_id, + max_length=decoder_config.max_length, ) self.assertEqual(generated_output.shape, (inputs.shape[0],) + 
(decoder_config.max_length,)) @@ -873,6 +875,7 @@ def check_encoder_decoder_model_generate(self, config, decoder_config, pixel_val generated_output = enc_dec_model.generate( pixel_values=pixel_values, decoder_start_token_id=enc_dec_model.config.decoder.bos_token_id, + max_length=decoder_config.max_length, **kwargs, ) self.assertEqual(generated_output.shape, (pixel_values.shape[0],) + (decoder_config.max_length,)) @@ -990,6 +993,7 @@ def check_encoder_decoder_model_generate(self, config, decoder_config, pixel_val generated_output = enc_dec_model.generate( pixel_values=pixel_values, decoder_start_token_id=enc_dec_model.config.decoder.bos_token_id, + max_length=decoder_config.max_length, **kwargs, ) self.assertEqual(generated_output.shape, (pixel_values.shape[0],) + (decoder_config.max_length,)) @@ -1107,6 +1111,7 @@ def check_encoder_decoder_model_generate(self, config, decoder_config, pixel_val generated_output = enc_dec_model.generate( pixel_values=pixel_values, decoder_start_token_id=enc_dec_model.config.decoder.bos_token_id, + max_length=decoder_config.max_length, **kwargs, ) self.assertEqual(generated_output.shape, (pixel_values.shape[0],) + (decoder_config.max_length,)) From a308d28d397af77c6a6b6d3b397991b555677007 Mon Sep 17 00:00:00 2001 From: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Date: Thu, 24 Oct 2024 19:07:23 +0200 Subject: [PATCH 104/385] [auto. ping] Avoid sending empty info + add more team members (#34383) * update * update --------- Co-authored-by: ydshieh --- .github/workflows/check_failed_model_tests.yml | 2 +- utils/check_bad_commit.py | 10 +++++++++- utils/process_bad_commit_report.py | 13 ++++++++++++- 3 files changed, 22 insertions(+), 3 deletions(-) diff --git a/.github/workflows/check_failed_model_tests.yml b/.github/workflows/check_failed_model_tests.yml index f229765994d585..f3ea8646900ad2 100644 --- a/.github/workflows/check_failed_model_tests.yml +++ b/.github/workflows/check_failed_model_tests.yml @@ -106,7 +106,7 @@ jobs: } >> "$GITHUB_ENV" - name: Send processed report - if: ${{ env.REPORT_TEXT != '' }} + if: ${{ !endsWith(env.REPORT_TEXT, '{}') }} uses: slackapi/slack-github-action@6c661ce58804a1a20f6dc5fbee7f0381b469e001 with: # Slack channel id, channel name, or user id to post message. 
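For intuition, the report filtering added in `utils/check_bad_commit.py` below boils down to the following behaviour (toy data; the model names and test ids here are made up, not taken from a real CI run):

```python
import json

# Shape of the report check_bad_commit.py builds: model -> device -> traced failures.
reports = {
    "model_a": {"single-gpu": []},  # nothing could be traced back to a bad commit
    "model_b": {"single-gpu": [{"test": "tests/...::test_foo", "commit": "abc123"}]},
}

# Equivalent of the new filtering: drop empty device entries, then drop empty models.
for per_device in reports.values():
    if not per_device.get("single-gpu"):
        per_device.pop("single-gpu", None)
reports = {k: v for k, v in reports.items() if len(v) > 0}

print(json.dumps(reports))
# -> {"model_b": {"single-gpu": [{"test": "tests/...::test_foo", "commit": "abc123"}]}}
# With no traceable failures at all this collapses to "{}", which is the case the
# workflow's new `!endsWith(env.REPORT_TEXT, '{}')` condition above is meant to skip.
```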
diff --git a/utils/check_bad_commit.py b/utils/check_bad_commit.py index 091ed5c4a427f9..adb25f11264b12 100644 --- a/utils/check_bad_commit.py +++ b/utils/check_bad_commit.py @@ -182,7 +182,15 @@ def get_commit_info(commit): info = {"test": test, "commit": commit} info.update(get_commit_info(commit)) failed_tests_with_bad_commits.append(info) - reports[model]["single-gpu"] = failed_tests_with_bad_commits + + # If no single-gpu test failures, remove the key + if len(failed_tests_with_bad_commits) > 0: + reports[model]["single-gpu"] = failed_tests_with_bad_commits + else: + reports[model].pop("single-gpu", None) + + # remove the models without any test failure + reports = {k: v for k, v in reports.items() if len(v) > 0} with open(args.output_file, "w", encoding="UTF-8") as fp: json.dump(reports, fp, ensure_ascii=False, indent=4) diff --git a/utils/process_bad_commit_report.py b/utils/process_bad_commit_report.py index f61f1b106644aa..513dc8df3a3b3c 100644 --- a/utils/process_bad_commit_report.py +++ b/utils/process_bad_commit_report.py @@ -28,7 +28,18 @@ data = json.load(fp) # TODO: extend - team_members = ["ydshieh", "zucchini-nlp", "ArthurZucker", "gante", "LysandreJik", "molbap", "qubvel"] + team_members = [ + "ydshieh", + "zucchini-nlp", + "ArthurZucker", + "gante", + "LysandreJik", + "molbap", + "qubvel", + "Rocketknight1", + "muellerzr", + "SunMarc", + ] # Counting the number of failures grouped by authors new_data = {} From 3d99f1746e0d667cbec9e69b4ec11289c4752630 Mon Sep 17 00:00:00 2001 From: Cyril Vallez Date: Thu, 24 Oct 2024 19:17:52 +0200 Subject: [PATCH 105/385] Fix glm (#34388) * Fix duplicated * fix import --- src/transformers/models/glm/modeling_glm.py | 13 ++----------- src/transformers/models/glm/modular_glm.py | 4 +--- src/transformers/models/phi3/modeling_phi3.py | 5 +---- 3 files changed, 4 insertions(+), 18 deletions(-) diff --git a/src/transformers/models/glm/modeling_glm.py b/src/transformers/models/glm/modeling_glm.py index 1e7c6eae0ee719..5f8eaf89ed9353 100644 --- a/src/transformers/models/glm/modeling_glm.py +++ b/src/transformers/models/glm/modeling_glm.py @@ -30,6 +30,7 @@ from ...cache_utils import Cache, DynamicCache, StaticCache from ...generation import GenerationMixin from ...modeling_attn_mask_utils import AttentionMaskConverter +from ...modeling_flash_attention_utils import FlashAttentionKwargs, _flash_attention_forward from ...modeling_outputs import ( BaseModelOutputWithPast, CausalLMOutputWithPast, @@ -37,11 +38,11 @@ TokenClassifierOutput, ) from ...modeling_utils import PreTrainedModel +from ...processing_utils import Unpack from ...utils import ( add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, - is_flash_attn_2_available, is_flash_attn_greater_or_equal_2_10, logging, replace_return_docstrings, @@ -49,16 +50,6 @@ from .configuration_glm import GlmConfig -if is_flash_attn_2_available(): - from ...modeling_flash_attention_utils import _flash_attention_forward - -from ...modeling_flash_attention_utils import FlashAttentionKwargs, _flash_attention_forward -from ...processing_utils import Unpack - - -_CHECKPOINT_FOR_DOC = "dummy" - - _CHECKPOINT_FOR_DOC = "THUDM/glm-4-9b" diff --git a/src/transformers/models/glm/modular_glm.py b/src/transformers/models/glm/modular_glm.py index 9cfd617eeb2353..39ee4a2ad5803e 100644 --- a/src/transformers/models/glm/modular_glm.py +++ b/src/transformers/models/glm/modular_glm.py @@ -44,11 +44,9 @@ from .configuration_glm import GlmConfig -_CHECKPOINT_FOR_DOC = "THUDM/glm-4-9b" - 
logger = logging.get_logger(__name__) -_CHECKPOINT_FOR_DOC = "dummy" +_CHECKPOINT_FOR_DOC = "THUDM/glm-4-9b" class GlmRMSNorm(Phi3RMSNorm): diff --git a/src/transformers/models/phi3/modeling_phi3.py b/src/transformers/models/phi3/modeling_phi3.py index 9e638c27afa41d..a1a86e3672d5fc 100644 --- a/src/transformers/models/phi3/modeling_phi3.py +++ b/src/transformers/models/phi3/modeling_phi3.py @@ -28,6 +28,7 @@ from ...cache_utils import Cache, DynamicCache, SlidingWindowCache, StaticCache from ...generation import GenerationMixin from ...modeling_attn_mask_utils import AttentionMaskConverter +from ...modeling_flash_attention_utils import _flash_attention_forward from ...modeling_outputs import ( BaseModelOutputWithPast, CausalLMOutputWithPast, @@ -39,7 +40,6 @@ add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, - is_flash_attn_2_available, is_flash_attn_greater_or_equal_2_10, logging, replace_return_docstrings, @@ -47,9 +47,6 @@ from .configuration_phi3 import Phi3Config -if is_flash_attn_2_available(): - from ...modeling_flash_attention_utils import _flash_attention_forward - logger = logging.get_logger(__name__) _CHECKPOINT_FOR_DOC = "microsoft/Phi-3-mini-4k-instruct" From 940a6bd343cfd2ff4f4425b4cbc548d1e1d316da Mon Sep 17 00:00:00 2001 From: Yoni Gozlan <74535834+yonigozlan@users.noreply.github.com> Date: Thu, 24 Oct 2024 20:00:13 -0400 Subject: [PATCH 106/385] Use non nested images and batched text Idefics2/3 (#34222) * add support for non nested images and add tests * add tests error scenario * fix style * added single and no image to error tests --- .../idefics2/image_processing_idefics2.py | 1 + .../models/idefics2/processing_idefics2.py | 17 +++- .../idefics3/image_processing_idefics3.py | 3 + .../models/idefics3/processing_idefics3.py | 38 ++++++--- .../pixtral/image_processing_pixtral.py | 1 + .../idefics2/test_processor_idefics2.py | 77 +++++++++++++++--- .../idefics3/test_processor_idefics3.py | 79 ++++++++++++++++--- 7 files changed, 183 insertions(+), 33 deletions(-) diff --git a/src/transformers/models/idefics2/image_processing_idefics2.py b/src/transformers/models/idefics2/image_processing_idefics2.py index ac9df68871eee2..ce0032f80c5ece 100644 --- a/src/transformers/models/idefics2/image_processing_idefics2.py +++ b/src/transformers/models/idefics2/image_processing_idefics2.py @@ -99,6 +99,7 @@ def make_list_of_images(images: ImageInput) -> List[List[np.ndarray]]: isinstance(images, (list, tuple)) and len(images) > 0 and isinstance(images[0], (list, tuple)) + and len(images[0]) > 0 and is_valid_image(images[0][0]) ): pass diff --git a/src/transformers/models/idefics2/processing_idefics2.py b/src/transformers/models/idefics2/processing_idefics2.py index 68566d182678c2..9a041257c36b5b 100644 --- a/src/transformers/models/idefics2/processing_idefics2.py +++ b/src/transformers/models/idefics2/processing_idefics2.py @@ -16,6 +16,7 @@ Processor class for IDEFICS2. """ +from itertools import accumulate from typing import TYPE_CHECKING, List, Optional, Union from ...feature_extraction_utils import BatchFeature @@ -218,7 +219,21 @@ def __call__( if is_image_or_image_url(images): images = [[images]] elif isinstance(images, list) and is_image_or_image_url(images[0]): - images = [images] + if text is not None: + if sum(n_images_in_text) != len(images): + raise ValueError( + f"The total number of {image_token} tokens in the prompts should be the same as the number of images passed." 
+ f" Found {sum(n_images_in_text)} {image_token} tokens and {len(images)} images." + ) + # Reorganize the images to match the prompts + cumsum_images_in_text = [0] + list(accumulate(n_images_in_text)) + images = [ + images[cumsum_images_in_text[i] : cumsum_images_in_text[i + 1]] + for i in range(len(n_images_in_text)) + ] + else: + images = [images] + elif ( not isinstance(images, list) and not isinstance(images[0], list) diff --git a/src/transformers/models/idefics3/image_processing_idefics3.py b/src/transformers/models/idefics3/image_processing_idefics3.py index 495ac04595fbc6..05a1a396dc72d3 100644 --- a/src/transformers/models/idefics3/image_processing_idefics3.py +++ b/src/transformers/models/idefics3/image_processing_idefics3.py @@ -151,9 +151,11 @@ def get_resize_output_image_size( def make_list_of_images(images: ImageInput) -> List[List[np.ndarray]]: """ Convert a single image or a list of images to a list of numpy arrays. + Args: images (`ImageInput`): A single image or a list of images. + Returns: A list of numpy arrays. """ @@ -168,6 +170,7 @@ def make_list_of_images(images: ImageInput) -> List[List[np.ndarray]]: isinstance(images, (list, tuple)) and len(images) > 0 and isinstance(images[0], (list, tuple)) + and len(images[0]) > 0 and is_valid_image(images[0][0]) ): pass diff --git a/src/transformers/models/idefics3/processing_idefics3.py b/src/transformers/models/idefics3/processing_idefics3.py index ceafa26a8b1187..872f5206f20175 100644 --- a/src/transformers/models/idefics3/processing_idefics3.py +++ b/src/transformers/models/idefics3/processing_idefics3.py @@ -17,6 +17,7 @@ """ import re +from itertools import accumulate from typing import TYPE_CHECKING, Dict, List, Optional, Union from ...feature_extraction_utils import BatchFeature @@ -241,11 +242,31 @@ def __call__( n_images_in_images = [] inputs = BatchFeature() + if text is not None: + if isinstance(text, str): + text = [text] + elif not isinstance(text, list) and not isinstance(text[0], str): + raise ValueError("Invalid input text. Please provide a string, or a list of strings") + n_images_in_text = [sample.count(self.image_token.content) for sample in text] + if images is not None: if is_image_or_image_url(images): images = [[images]] elif isinstance(images, list) and is_image_or_image_url(images[0]): - images = [images] + if text is not None: + if sum(n_images_in_text) != len(images): + raise ValueError( + f"The total number of {self.image_token.content} tokens in the prompts should be the same as the number of images passed." + f" Found {sum(n_images_in_text)} {self.image_token.content} tokens and {len(images)} images." + ) + # Reorganize the images to match the prompts + cumsum_images_in_text = [0] + list(accumulate(n_images_in_text)) + images = [ + images[cumsum_images_in_text[i] : cumsum_images_in_text[i + 1]] + for i in range(len(n_images_in_text)) + ] + else: + images = [images] elif ( not isinstance(images, list) and not isinstance(images[0], list) @@ -263,10 +284,10 @@ def __call__( inputs.update(image_inputs) if text is not None: - if isinstance(text, str): - text = [text] - elif not isinstance(text, list) and not isinstance(text[0], str): - raise ValueError("Invalid input text. Please provide a string, or a list of strings") + if n_images_in_images != n_images_in_text: + raise ValueError( + f"The number of images in the text {n_images_in_text} and images {n_images_in_images} should be the same." 
+ ) image_rows = inputs.pop("rows", [[0] * len(text)]) image_cols = inputs.pop("cols", [[0] * len(text)]) @@ -277,8 +298,6 @@ def __call__( prompt_strings = [] for sample, sample_rows, sample_cols in zip(text, image_rows, image_cols): - n_images_in_text.append(sample.count(image_token)) - # Replace the image token with fake tokens around the expanded image token sequence of length `image_seq_len` image_prompt_strings = [] for n_rows, n_cols in zip(sample_rows, sample_cols): @@ -305,11 +324,6 @@ def __call__( text_inputs = self.tokenizer(text=prompt_strings, **output_kwargs["text_kwargs"]) inputs.update(text_inputs) - if n_images_in_images != n_images_in_text: - raise ValueError( - f"The number of images in the text {n_images_in_text} and images {n_images_in_images} should be the same." - ) - return inputs def batch_decode(self, *args, **kwargs): diff --git a/src/transformers/models/pixtral/image_processing_pixtral.py b/src/transformers/models/pixtral/image_processing_pixtral.py index a75704fc3dbac8..b4ec0e50c9ccc3 100644 --- a/src/transformers/models/pixtral/image_processing_pixtral.py +++ b/src/transformers/models/pixtral/image_processing_pixtral.py @@ -120,6 +120,7 @@ def make_list_of_images(images: ImageInput) -> List[List[np.ndarray]]: isinstance(images, (list, tuple)) and len(images) > 0 and isinstance(images[0], (list, tuple)) + and len(images[0]) > 0 and is_valid_image(images[0][0]) ): pass diff --git a/tests/models/idefics2/test_processor_idefics2.py b/tests/models/idefics2/test_processor_idefics2.py index bf713c6fb8cfbb..d89004679aef0f 100644 --- a/tests/models/idefics2/test_processor_idefics2.py +++ b/tests/models/idefics2/test_processor_idefics2.py @@ -226,6 +226,73 @@ def test_add_special_tokens_processor(self): self.assertEqual(inputs["input_ids"], expected_input_ids) # fmt: on + def test_non_nested_images_with_batched_text(self): + processor = self.get_processor() + processor.image_processor.do_image_splitting = False + + image_str = "" + text_str_1 = "In this image, we see" + text_str_2 = "bla, bla" + + text = [ + image_str + text_str_1, + text_str_2 + image_str + image_str, + ] + images = [self.image1, self.image2, self.image3] + + inputs = processor(text=text, images=images, padding=True) + + self.assertEqual(inputs["pixel_values"].shape, (2, 2, 3, 767, 980)) + self.assertEqual(inputs["pixel_attention_mask"].shape, (2, 2, 767, 980)) + + def test_process_interleaved_images_prompts_image_error(self): + processor = self.get_processor() + + text = [ + "This is a test sentence.", + "In this other sentence we try some good things", + ] + images = [[self.image1], [self.image2]] + with self.assertRaises(ValueError): + processor(text=text, images=images, padding=True) + images = [[self.image1], []] + with self.assertRaises(ValueError): + processor(text=text, images=images, padding=True) + + text = [ + "This is a test sentence.", + "In this other sentence we try some good things", + ] + images = [[self.image1], [self.image2, self.image3]] + with self.assertRaises(ValueError): + processor(text=text, images=images, padding=True) + images = [[], [self.image2]] + with self.assertRaises(ValueError): + processor(text=text, images=images, padding=True) + images = [self.image1, self.image2, self.image3] + with self.assertRaises(ValueError): + processor(text=text, images=images, padding=True) + images = [self.image1] + with self.assertRaises(ValueError): + processor(text=text, images=images, padding=True) + + text = [ + "This is a test sentence.", + "In this other sentence we try some good 
things", + ] + images = [[self.image1], []] + with self.assertRaises(ValueError): + processor(text=text, images=images, padding=True) + images = [[], [self.image2]] + with self.assertRaises(ValueError): + processor(text=text, images=images, padding=True) + images = [self.image1, self.image2] + with self.assertRaises(ValueError): + processor(text=text, images=images, padding=True) + images = [self.image1] + with self.assertRaises(ValueError): + processor(text=text, images=images, padding=True) + def test_apply_chat_template(self): # Message contains content which a mix of lists with images and image urls and string messages = [ @@ -275,13 +342,3 @@ def prepare_text_inputs(self, batch_size: Optional[int] = None): return ["lower newer ", " upper older longer string"] + [" lower newer"] * ( batch_size - 2 ) - - # Override as PixtralProcessor needs nested images to work properly with batched inputs - @require_vision - def prepare_image_inputs(self, batch_size: Optional[int] = None): - """This function prepares a list of PIL images for testing""" - if batch_size is None: - return super().prepare_image_inputs() - if batch_size < 1: - raise ValueError("batch_size must be greater than 0") - return [[super().prepare_image_inputs()]] * batch_size diff --git a/tests/models/idefics3/test_processor_idefics3.py b/tests/models/idefics3/test_processor_idefics3.py index a53109b02b6951..52d2f1539a4867 100644 --- a/tests/models/idefics3/test_processor_idefics3.py +++ b/tests/models/idefics3/test_processor_idefics3.py @@ -250,6 +250,74 @@ def test_add_special_tokens_processor(self): self.assertEqual(inputs["input_ids"], expected_input_ids) # fmt: on + def test_non_nested_images_with_batched_text(self): + processor = self.get_processor() + processor.image_processor.do_image_splitting = False + + image_str = "" + text_str_1 = "In this image, we see" + text_str_2 = "In this image, we see" + + text = [ + image_str + text_str_1, + image_str + image_str + text_str_2, + ] + images = [self.image1, self.image2, self.image3] + + inputs = processor(text=text, images=images, padding=True) + + self.assertEqual(np.array(inputs["pixel_values"]).shape, (2, 2, 3, 364, 364)) + self.assertEqual(np.array(inputs["pixel_attention_mask"]).shape, (2, 2, 364, 364)) + + # Copied from tests.models.idefics2.test_processor_idefics2.Idefics2ProcessorTest.test_process_interleaved_images_prompts_image_error + def test_process_interleaved_images_prompts_image_error(self): + processor = self.get_processor() + + text = [ + "This is a test sentence.", + "In this other sentence we try some good things", + ] + images = [[self.image1], [self.image2]] + with self.assertRaises(ValueError): + processor(text=text, images=images, padding=True) + images = [[self.image1], []] + with self.assertRaises(ValueError): + processor(text=text, images=images, padding=True) + + text = [ + "This is a test sentence.", + "In this other sentence we try some good things", + ] + images = [[self.image1], [self.image2, self.image3]] + with self.assertRaises(ValueError): + processor(text=text, images=images, padding=True) + images = [[], [self.image2]] + with self.assertRaises(ValueError): + processor(text=text, images=images, padding=True) + images = [self.image1, self.image2, self.image3] + with self.assertRaises(ValueError): + processor(text=text, images=images, padding=True) + images = [self.image1] + with self.assertRaises(ValueError): + processor(text=text, images=images, padding=True) + + text = [ + "This is a test sentence.", + "In this other sentence we try some 
good things", + ] + images = [[self.image1], []] + with self.assertRaises(ValueError): + processor(text=text, images=images, padding=True) + images = [[], [self.image2]] + with self.assertRaises(ValueError): + processor(text=text, images=images, padding=True) + images = [self.image1, self.image2] + with self.assertRaises(ValueError): + processor(text=text, images=images, padding=True) + images = [self.image1] + with self.assertRaises(ValueError): + processor(text=text, images=images, padding=True) + def test_apply_chat_template(self): # Message contains content which a mix of lists with images and image urls and string messages = [ @@ -299,16 +367,7 @@ def prepare_text_inputs(self, batch_size: Optional[int] = None): batch_size - 2 ) - # Override as Idefics3Processor needs nested images to work properly with batched inputs - @require_vision - def prepare_image_inputs(self, batch_size: Optional[int] = None): - """This function prepares a list of PIL images for testing""" - if batch_size is None: - return super().prepare_image_inputs() - if batch_size < 1: - raise ValueError("batch_size must be greater than 0") - return [[super().prepare_image_inputs()]] * batch_size - + # Override tests as inputs_ids padded dimension is the second one but not the last one @require_vision @require_torch def test_kwargs_overrides_default_tokenizer_kwargs(self): From 5779bac4c45b2c881603cafd20663892869d5860 Mon Sep 17 00:00:00 2001 From: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> Date: Fri, 25 Oct 2024 09:44:09 +0200 Subject: [PATCH 107/385] Fix onnx non-expotable inplace aten op (#34376) * fix onnx non-expotable inplace op * mistral, qwen2, qwen2_vl, starcoder2 * fixup copies --- src/transformers/models/mimi/modeling_mimi.py | 2 +- src/transformers/models/mistral/modeling_mistral.py | 2 +- src/transformers/models/mixtral/modeling_mixtral.py | 2 +- src/transformers/models/moshi/modeling_moshi.py | 4 ++-- src/transformers/models/phi3/modeling_phi3.py | 2 +- src/transformers/models/phimoe/modeling_phimoe.py | 2 +- src/transformers/models/qwen2/modeling_qwen2.py | 2 +- src/transformers/models/qwen2_moe/modeling_qwen2_moe.py | 2 +- src/transformers/models/qwen2_vl/modeling_qwen2_vl.py | 2 +- src/transformers/models/starcoder2/modeling_starcoder2.py | 2 +- 10 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/transformers/models/mimi/modeling_mimi.py b/src/transformers/models/mimi/modeling_mimi.py index 514f9de706ec63..cbdd2c663c5844 100644 --- a/src/transformers/models/mimi/modeling_mimi.py +++ b/src/transformers/models/mimi/modeling_mimi.py @@ -1156,7 +1156,7 @@ def _prepare_4d_causal_attention_mask_with_cache_position( sliding_attend_mask = torch.arange(target_length, device=device) <= ( cache_position.reshape(-1, 1) - config.sliding_window ) - diagonal_attend_mask |= sliding_attend_mask + diagonal_attend_mask.bitwise_or_(sliding_attend_mask) causal_mask *= diagonal_attend_mask causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1) if attention_mask is not None: diff --git a/src/transformers/models/mistral/modeling_mistral.py b/src/transformers/models/mistral/modeling_mistral.py index f198e4abc85511..321d3dc0daf378 100644 --- a/src/transformers/models/mistral/modeling_mistral.py +++ b/src/transformers/models/mistral/modeling_mistral.py @@ -961,7 +961,7 @@ def _prepare_4d_causal_attention_mask_with_cache_position( sliding_attend_mask = torch.arange(target_length, device=device) <= ( cache_position.reshape(-1, 1) - config.sliding_window ) - 
diagonal_attend_mask |= sliding_attend_mask + diagonal_attend_mask.bitwise_or_(sliding_attend_mask) causal_mask *= diagonal_attend_mask causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1) if attention_mask is not None: diff --git a/src/transformers/models/mixtral/modeling_mixtral.py b/src/transformers/models/mixtral/modeling_mixtral.py index 192b7801af0575..78a17178ecdda8 100644 --- a/src/transformers/models/mixtral/modeling_mixtral.py +++ b/src/transformers/models/mixtral/modeling_mixtral.py @@ -1174,7 +1174,7 @@ def _prepare_4d_causal_attention_mask_with_cache_position( sliding_attend_mask = torch.arange(target_length, device=device) <= ( cache_position.reshape(-1, 1) - config.sliding_window ) - diagonal_attend_mask |= sliding_attend_mask + diagonal_attend_mask.bitwise_or_(sliding_attend_mask) causal_mask *= diagonal_attend_mask causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1) if attention_mask is not None: diff --git a/src/transformers/models/moshi/modeling_moshi.py b/src/transformers/models/moshi/modeling_moshi.py index 97200b7d042e61..9975996d21d144 100644 --- a/src/transformers/models/moshi/modeling_moshi.py +++ b/src/transformers/models/moshi/modeling_moshi.py @@ -1385,7 +1385,7 @@ def _prepare_4d_causal_attention_mask_with_cache_position( sliding_attend_mask = torch.arange(target_length, device=device) <= ( cache_position.reshape(-1, 1) - config.sliding_window ) - diagonal_attend_mask |= sliding_attend_mask + diagonal_attend_mask.bitwise_or_(sliding_attend_mask) causal_mask *= diagonal_attend_mask causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1) if attention_mask is not None: @@ -1689,7 +1689,7 @@ def _prepare_4d_causal_attention_mask_with_cache_position( sliding_attend_mask = torch.arange(target_length, device=device) <= ( cache_position.reshape(-1, 1) - config.sliding_window ) - diagonal_attend_mask |= sliding_attend_mask + diagonal_attend_mask.bitwise_or_(sliding_attend_mask) causal_mask *= diagonal_attend_mask causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1) if attention_mask is not None: diff --git a/src/transformers/models/phi3/modeling_phi3.py b/src/transformers/models/phi3/modeling_phi3.py index a1a86e3672d5fc..bae3f6d4cdaeaa 100644 --- a/src/transformers/models/phi3/modeling_phi3.py +++ b/src/transformers/models/phi3/modeling_phi3.py @@ -1136,7 +1136,7 @@ def _prepare_4d_causal_attention_mask_with_cache_position( sliding_attend_mask = torch.arange(target_length, device=device) <= ( cache_position.reshape(-1, 1) - config.sliding_window ) - diagonal_attend_mask |= sliding_attend_mask + diagonal_attend_mask.bitwise_or_(sliding_attend_mask) causal_mask *= diagonal_attend_mask causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1) if attention_mask is not None: diff --git a/src/transformers/models/phimoe/modeling_phimoe.py b/src/transformers/models/phimoe/modeling_phimoe.py index 791f6df50bb40f..f3690e5f686fbb 100644 --- a/src/transformers/models/phimoe/modeling_phimoe.py +++ b/src/transformers/models/phimoe/modeling_phimoe.py @@ -1305,7 +1305,7 @@ def _prepare_4d_causal_attention_mask_with_cache_position( sliding_attend_mask = torch.arange(target_length, device=device) <= ( cache_position.reshape(-1, 1) - config.sliding_window ) - diagonal_attend_mask |= sliding_attend_mask + diagonal_attend_mask.bitwise_or_(sliding_attend_mask) causal_mask *= diagonal_attend_mask causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1) if attention_mask is not 
None: diff --git a/src/transformers/models/qwen2/modeling_qwen2.py b/src/transformers/models/qwen2/modeling_qwen2.py index 0d97f2ffb724a0..0883fac1aebafc 100644 --- a/src/transformers/models/qwen2/modeling_qwen2.py +++ b/src/transformers/models/qwen2/modeling_qwen2.py @@ -1059,7 +1059,7 @@ def _prepare_4d_causal_attention_mask_with_cache_position( sliding_attend_mask = torch.arange(target_length, device=device) <= ( cache_position.reshape(-1, 1) - config.sliding_window ) - diagonal_attend_mask |= sliding_attend_mask + diagonal_attend_mask.bitwise_or_(sliding_attend_mask) causal_mask *= diagonal_attend_mask causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1) if attention_mask is not None: diff --git a/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py b/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py index 36de586265ce60..7f4f19aba1f3eb 100644 --- a/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py +++ b/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py @@ -1239,7 +1239,7 @@ def _prepare_4d_causal_attention_mask_with_cache_position( sliding_attend_mask = torch.arange(target_length, device=device) <= ( cache_position.reshape(-1, 1) - config.sliding_window ) - diagonal_attend_mask |= sliding_attend_mask + diagonal_attend_mask.bitwise_or_(sliding_attend_mask) causal_mask *= diagonal_attend_mask causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1) if attention_mask is not None: diff --git a/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py b/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py index 4e9401c77e4d7d..90bf29c8b5d66a 100644 --- a/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py +++ b/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py @@ -1321,7 +1321,7 @@ def _prepare_4d_causal_attention_mask_with_cache_position( sliding_attend_mask = torch.arange(target_length, device=device) <= ( cache_position.reshape(-1, 1) - config.sliding_window ) - diagonal_attend_mask |= sliding_attend_mask + diagonal_attend_mask.bitwise_or_(sliding_attend_mask) causal_mask *= diagonal_attend_mask causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1) if attention_mask is not None: diff --git a/src/transformers/models/starcoder2/modeling_starcoder2.py b/src/transformers/models/starcoder2/modeling_starcoder2.py index c8f22dee43fe2c..1a8b6412e738e1 100644 --- a/src/transformers/models/starcoder2/modeling_starcoder2.py +++ b/src/transformers/models/starcoder2/modeling_starcoder2.py @@ -1033,7 +1033,7 @@ def _prepare_4d_causal_attention_mask_with_cache_position( sliding_attend_mask = torch.arange(target_length, device=device) <= ( cache_position.reshape(-1, 1) - config.sliding_window ) - diagonal_attend_mask |= sliding_attend_mask + diagonal_attend_mask.bitwise_or_(sliding_attend_mask) causal_mask *= diagonal_attend_mask causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1) if attention_mask is not None: From 9f365fe0ac7fda3aa8adac6707f9368ac981cdd3 Mon Sep 17 00:00:00 2001 From: Raushan Turganbay Date: Fri, 25 Oct 2024 11:02:07 +0200 Subject: [PATCH 108/385] Fix right padding in LLaVA models (#34305) * fix right pad llavas * device mismatch --- src/transformers/models/llava/modeling_llava.py | 7 ++++++- .../models/video_llava/modeling_video_llava.py | 7 ++++++- src/transformers/models/vipllava/modeling_vipllava.py | 7 ++++++- 3 files changed, 18 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/llava/modeling_llava.py b/src/transformers/models/llava/modeling_llava.py index 
50b3d4c6a89533..0b2492fc711206 100644 --- a/src/transformers/models/llava/modeling_llava.py +++ b/src/transformers/models/llava/modeling_llava.py @@ -354,7 +354,12 @@ def _merge_input_ids_with_image_features(self, image_features, inputs_embeds, in (batch_size, max_embed_dim), True, dtype=torch.bool, device=inputs_embeds.device ) image_to_overwrite[batch_indices, text_to_overwrite] = False - image_to_overwrite &= image_to_overwrite.cumsum(-1) - 1 >= nb_image_pad[:, None].to(target_device) + if left_padding: + image_to_overwrite &= image_to_overwrite.cumsum(-1) - 1 >= nb_image_pad[:, None].to(target_device) + else: + mask = torch.ones_like(image_to_overwrite, dtype=torch.bool).cumsum(-1) - 1 + padding_mask = mask <= new_token_positions[:, -1:].to(target_device) + image_to_overwrite &= padding_mask if image_to_overwrite.sum() != image_features.shape[:-1].numel(): raise ValueError( diff --git a/src/transformers/models/video_llava/modeling_video_llava.py b/src/transformers/models/video_llava/modeling_video_llava.py index 0fe89676b92d63..a9bd8b745a6f68 100644 --- a/src/transformers/models/video_llava/modeling_video_llava.py +++ b/src/transformers/models/video_llava/modeling_video_llava.py @@ -339,7 +339,12 @@ def _merge_input_ids_with_visual_features( # 5. Fill the embeddings corresponding to the images. Anything that is still zeros needs filling image_to_overwrite = torch.full((batch_size, max_seq_len), True, dtype=torch.bool, device=inputs_embeds.device) image_to_overwrite[batch_indices, text_to_overwrite] = False - image_to_overwrite &= image_to_overwrite.cumsum(-1) - 1 >= nb_image_pad[:, None].to(target_device) + if left_padding: + image_to_overwrite &= image_to_overwrite.cumsum(-1) - 1 >= nb_image_pad[:, None].to(target_device) + else: + mask = torch.ones_like(image_to_overwrite, dtype=torch.bool).cumsum(-1) - 1 + padding_mask = mask <= new_token_positions[:, -1:].to(target_device) + image_to_overwrite &= padding_mask if image_to_overwrite.sum() != visual_features.shape[:-1].numel(): visual_type = "videos" if num_frames == 8 else "images" diff --git a/src/transformers/models/vipllava/modeling_vipllava.py b/src/transformers/models/vipllava/modeling_vipllava.py index dd7baa34406fb0..987ae0ad0c61fe 100644 --- a/src/transformers/models/vipllava/modeling_vipllava.py +++ b/src/transformers/models/vipllava/modeling_vipllava.py @@ -350,7 +350,12 @@ def _merge_input_ids_with_image_features(self, image_features, inputs_embeds, in (batch_size, max_embed_dim), True, dtype=torch.bool, device=inputs_embeds.device ) image_to_overwrite[batch_indices, text_to_overwrite] = False - image_to_overwrite &= image_to_overwrite.cumsum(-1) - 1 >= nb_image_pad[:, None].to(target_device) + if left_padding: + image_to_overwrite &= image_to_overwrite.cumsum(-1) - 1 >= nb_image_pad[:, None].to(target_device) + else: + mask = torch.ones_like(image_to_overwrite, dtype=torch.bool).cumsum(-1) - 1 + padding_mask = mask <= new_token_positions[:, -1:].to(target_device) + image_to_overwrite &= padding_mask if image_to_overwrite.sum() != image_features.shape[:-1].numel(): raise ValueError( From 223855314f879f99ace727cb11d748a2f5f1d48d Mon Sep 17 00:00:00 2001 From: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Date: Fri, 25 Oct 2024 12:32:39 +0200 Subject: [PATCH 109/385] no filter (#34391) * no filter * no filter * no filter --------- Co-authored-by: ydshieh --- utils/tests_fetcher.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/utils/tests_fetcher.py b/utils/tests_fetcher.py index 
9e15f2e115ec61..906e85e1de61a5 100644 --- a/utils/tests_fetcher.py +++ b/utils/tests_fetcher.py @@ -997,7 +997,7 @@ def _print_list(l) -> str: def infer_tests_to_run( output_file: str, diff_with_last_commit: bool = False, - filter_models: bool = True, + filter_models: bool = False, ): """ The main function called by the test fetcher. Determines the tests to run from the diff. @@ -1229,6 +1229,6 @@ def create_test_list_from_filter(full_test_list, out_path): infer_tests_to_run( args.output_file, diff_with_last_commit=diff_with_last_commit, - filter_models=(not (commit_flags["no_filter"] or is_main_branch)), + filter_models=False, ) filter_tests(args.output_file, ["repo_utils"]) From 8814043c8c62034277b04e73a44e25231ab020ad Mon Sep 17 00:00:00 2001 From: Joao Gante Date: Fri, 25 Oct 2024 11:46:46 +0100 Subject: [PATCH 110/385] SynthID: better example (#34372) * better example * Update src/transformers/generation/configuration_utils.py * Update src/transformers/generation/logits_process.py * nits --- docs/source/en/internal/generation_utils.md | 4 +--- src/transformers/generation/configuration_utils.py | 10 +++++----- src/transformers/generation/logits_process.py | 10 +++++----- 3 files changed, 11 insertions(+), 13 deletions(-) diff --git a/docs/source/en/internal/generation_utils.md b/docs/source/en/internal/generation_utils.md index 946940cb019481..eb25ddb6329755 100644 --- a/docs/source/en/internal/generation_utils.md +++ b/docs/source/en/internal/generation_utils.md @@ -428,13 +428,11 @@ A [`Constraint`] can be used to force the generation to include specific tokens - __call__ [[autodoc]] BayesianDetectorConfig - - __call__ [[autodoc]] BayesianDetectorModel - - __call__ + - forward [[autodoc]] SynthIDTextWatermarkingConfig - - __call__ [[autodoc]] SynthIDTextWatermarkDetector - __call__ diff --git a/src/transformers/generation/configuration_utils.py b/src/transformers/generation/configuration_utils.py index c460a19885afc5..3c204481b04296 100644 --- a/src/transformers/generation/configuration_utils.py +++ b/src/transformers/generation/configuration_utils.py @@ -1471,8 +1471,8 @@ class SynthIDTextWatermarkingConfig(BaseWatermarkingConfig): ```python >>> from transformers import AutoModelForCausalLM, AutoTokenizer, SynthIDTextWatermarkingConfig - >>> tokenizer = AutoTokenizer.from_pretrained('google/gemma-2-2b-it') - >>> model = AutoModelForCausalLM.from_pretrained('google/gemma-2-2b-it') + >>> tokenizer = AutoTokenizer.from_pretrained('google/gemma-2-2b', padding_side="left") + >>> model = AutoModelForCausalLM.from_pretrained('google/gemma-2-2b') >>> # SynthID Text configuration >>> watermarking_config = SynthIDTextWatermarkingConfig( @@ -1481,11 +1481,11 @@ class SynthIDTextWatermarkingConfig(BaseWatermarkingConfig): ... ) >>> # Generation with watermarking - >>> tokenized_prompts = tokenizer(["your prompts here"]) + >>> tokenized_prompts = tokenizer(["Once upon a time, "], return_tensors="pt", padding=True) >>> output_sequences = model.generate( - ... **tokenized_prompts, watermarking_config=watermarking_config, do_sample=True, + ... **tokenized_prompts, watermarking_config=watermarking_config, do_sample=True, max_new_tokens=10 ... 
) - >>> watermarked_text = tokenizer.batch_decode(output_sequences) + >>> watermarked_text = tokenizer.batch_decode(output_sequences, skip_special_tokens=True) ``` """ diff --git a/src/transformers/generation/logits_process.py b/src/transformers/generation/logits_process.py index fde95c7a85652f..9d244191da811c 100644 --- a/src/transformers/generation/logits_process.py +++ b/src/transformers/generation/logits_process.py @@ -2565,8 +2565,8 @@ class SynthIDTextWatermarkLogitsProcessor(LogitsProcessor): ```python >>> from transformers import AutoModelForCausalLM, AutoTokenizer, SynthIDTextWatermarkingConfig - >>> tokenizer = AutoTokenizer.from_pretrained('google/gemma-2-2b-it') - >>> model = AutoModelForCausalLM.from_pretrained('google/gemma-2-2b-it') + >>> tokenizer = AutoTokenizer.from_pretrained('google/gemma-2-2b', padding_side="left") + >>> model = AutoModelForCausalLM.from_pretrained('google/gemma-2-2b') >>> # SynthID Text configuration >>> watermarking_config = SynthIDTextWatermarkingConfig( @@ -2575,11 +2575,11 @@ class SynthIDTextWatermarkLogitsProcessor(LogitsProcessor): ... ) >>> # Generation with watermarking - >>> tokenized_prompts = tokenizer(["your prompts here"]) + >>> tokenized_prompts = tokenizer(["Once upon a time, "], return_tensors="pt", padding=True) >>> output_sequences = model.generate( - ... **tokenized_prompts, watermarking_config=watermarking_config, do_sample=True, + ... **tokenized_prompts, watermarking_config=watermarking_config, do_sample=True, max_new_tokens=10 ... ) - >>> watermarked_text = tokenizer.batch_decode(output_sequences) + >>> watermarked_text = tokenizer.batch_decode(output_sequences, skip_special_tokens=True) ``` """ From 186b8dc190481032892d0a5d68b3db64f4ad4543 Mon Sep 17 00:00:00 2001 From: Joao Gante Date: Fri, 25 Oct 2024 11:55:07 +0100 Subject: [PATCH 111/385] Tests: upgrade `test_eager_matches_sdpa_generate` (#34386) --- tests/generation/test_utils.py | 82 +++++++++++ tests/models/bert/test_modeling_bert.py | 74 ---------- tests/models/cohere/test_modeling_cohere.py | 58 -------- tests/models/falcon/test_modeling_falcon.py | 74 ---------- tests/models/glm/test_modeling_glm.py | 71 --------- .../models/gpt_neox/test_modeling_gpt_neox.py | 64 +-------- tests/models/jetmoe/test_modeling_jetmoe.py | 9 -- tests/models/llama/test_modeling_llama.py | 62 -------- tests/models/mistral/test_modeling_mistral.py | 8 -- tests/models/mixtral/test_modeling_mixtral.py | 9 -- tests/models/mllama/test_modeling_mllama.py | 12 -- tests/models/moshi/test_modeling_moshi.py | 6 +- .../models/musicgen/test_modeling_musicgen.py | 136 ------------------ .../test_modeling_musicgen_melody.py | 68 --------- tests/models/olmo/test_modeling_olmo.py | 9 -- tests/models/olmoe/test_modeling_olmoe.py | 9 -- tests/models/opt/test_modeling_opt.py | 63 -------- tests/models/qwen2/test_modeling_qwen2.py | 8 -- .../qwen2_moe/test_modeling_qwen2_moe.py | 6 - .../models/stablelm/test_modeling_stablelm.py | 66 --------- .../test_modeling_xlm_roberta_xl.py | 81 +---------- tests/test_modeling_common.py | 56 -------- 22 files changed, 85 insertions(+), 946 deletions(-) diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py index 4e5d8f30265995..6f2eaf734df14f 100644 --- a/tests/generation/test_utils.py +++ b/tests/generation/test_utils.py @@ -15,6 +15,7 @@ import copy +import gc import inspect import tempfile import unittest @@ -33,6 +34,7 @@ require_torch_gpu, require_torch_multi_accelerator, require_torch_multi_gpu, + require_torch_sdpa, slow, torch_device, ) @@ 
-2046,6 +2048,86 @@ def test_inherits_generation_mixin(self): for model_class in self.all_generative_model_classes: self.assertTrue("GenerationMixin" in str(model_class.__bases__)) + @require_torch_sdpa + @slow + def test_eager_matches_sdpa_generate(self): + max_new_tokens = 30 + + for model_class in self.all_generative_model_classes: + if not model_class._supports_sdpa: + self.skipTest(f"{model_class.__name__} does not support SDPA") + + config, original_inputs_dict = self.prepare_config_and_inputs_for_generate() + inputs_dict = {} + for input_name, input_data in original_inputs_dict.items(): + if isinstance(input_data, torch.Tensor) and input_data.dtype in [torch.float32, torch.bfloat16]: + inputs_dict[input_name] = input_data.to(torch.float16) + else: + inputs_dict[input_name] = input_data + main_input = inputs_dict[model_class.main_input_name] + + # make sure that all models have enough positions for generation + if hasattr(config, "max_position_embeddings"): + config.max_position_embeddings = max_new_tokens + main_input.shape[1] + 1 + + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + del model + gc.collect() + + generate_kwargs = { + "max_new_tokens": max_new_tokens, + "do_sample": False, + "return_dict_in_generate": True, + "output_scores": True, + } + + model_sdpa = model_class.from_pretrained( + tmpdirname, + torch_dtype=torch.float16, + low_cpu_mem_usage=True, + ).to(torch_device) + res_sdpa = model_sdpa.generate(**inputs_dict, **generate_kwargs) + del model_sdpa + gc.collect() + + model_eager = model_class.from_pretrained( + tmpdirname, + torch_dtype=torch.float16, + low_cpu_mem_usage=True, + attn_implementation="eager", + ).to(torch_device) + res_eager = model_eager.generate(**inputs_dict, **generate_kwargs) + del model_eager + gc.collect() + + # Eager and SDPA are very similar, but not exactly the same. Because we are using random models, this + # test would be flaky if we only checked the sequences. Two situations in which this test passes: + # 1. The sequences are the same + # 2. 
The sequences are different, but the scores up until the first mismatch are nearly identical + output_matches = res_eager.sequences == res_sdpa.sequences + has_matching_outputs = output_matches.all() + has_matching_scores = None + if not has_matching_outputs: + input_length = main_input.shape[1] + for batch_idx in range(res_eager.sequences.shape[0]): + batch_matches = output_matches[batch_idx] + if batch_matches.all(): + continue + first_mismatch_idx = batch_matches.int().argmin() # gets the index of the first False + first_mismatch_idx -= input_length # scores doesn't include data regarding input tokens + sdpa_first_mismatch_scores = res_sdpa.scores[first_mismatch_idx][batch_idx] + eager_first_mismatch_scores = res_eager.scores[first_mismatch_idx][batch_idx] + has_matching_scores = torch.allclose( + sdpa_first_mismatch_scores, eager_first_mismatch_scores, rtol=1e-3, atol=1e-3 + ) + if not has_matching_scores: + break + + self.assertTrue(has_matching_outputs or has_matching_scores) + def _check_outputs(self, output, main_input, config, use_cache=False, num_return_sequences=1): # we can be sure what is batch size from main input but seq length depends on model type and whether input is text/audio/image # so we infer actual text seq length from model_tester, same was as it is done in `test_modeling_common.py` tests` diff --git a/tests/models/bert/test_modeling_bert.py b/tests/models/bert/test_modeling_bert.py index 5c87fbea8ee795..8ac1c3d2b409d0 100644 --- a/tests/models/bert/test_modeling_bert.py +++ b/tests/models/bert/test_modeling_bert.py @@ -22,7 +22,6 @@ CaptureLogger, require_torch, require_torch_accelerator, - require_torch_sdpa, slow, torch_device, ) @@ -672,79 +671,6 @@ def test_torchscript_device_change(self): loaded = torch.jit.load(os.path.join(tmp, "bert.pt"), map_location=torch_device) loaded(inputs_dict["input_ids"].to(torch_device), inputs_dict["attention_mask"].to(torch_device)) - # This test was copied from the common test_eager_matches_sdpa_generate(), but without low_cpu_mem_usage=True. - # TODO: Remove this and use the parent method (in common tests) once BERT supports low_cpu_mem_usage=True. 
- @require_torch_sdpa - @slow - def test_eager_matches_sdpa_generate(self): - max_new_tokens = 30 - - if len(self.all_generative_model_classes) == 0: - self.skipTest(f"{self.__class__.__name__} tests a model that does support generate: skipping this test") - - for model_class in self.all_generative_model_classes: - if not model_class._supports_sdpa: - self.skipTest(f"{model_class.__name__} does not support SDPA") - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - dummy_input = inputs_dict[model_class.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - # make sure that all models have enough positions for generation - if hasattr(config, "max_position_embeddings"): - config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1 - - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - - dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) - - model_sdpa = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - # low_cpu_mem_usage=True, - ).to(torch_device) - - self.assertTrue(model_sdpa.config._attn_implementation == "sdpa") - - model_eager = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - # low_cpu_mem_usage=True, - attn_implementation="eager", - ).to(torch_device) - - self.assertTrue(model_eager.config._attn_implementation == "eager") - - for name, submodule in model_eager.named_modules(): - class_name = submodule.__class__.__name__ - if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name: - raise ValueError("The eager model should not have SDPA attention layers") - - has_sdpa = False - for name, submodule in model_sdpa.named_modules(): - class_name = submodule.__class__.__name__ - if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name: - has_sdpa = True - break - if not has_sdpa: - raise ValueError("The SDPA model should have SDPA attention layers") - - # Just test that a large cache works as expected - res_eager = model_eager.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=max_new_tokens, do_sample=False - ) - - res_sdpa = model_sdpa.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=max_new_tokens, do_sample=False - ) - - self.assertTrue(torch.allclose(res_eager, res_sdpa)) - @require_torch class BertModelIntegrationTest(unittest.TestCase): diff --git a/tests/models/cohere/test_modeling_cohere.py b/tests/models/cohere/test_modeling_cohere.py index 3a05867dfdfc8c..cd3b2f978e7ab7 100644 --- a/tests/models/cohere/test_modeling_cohere.py +++ b/tests/models/cohere/test_modeling_cohere.py @@ -307,64 +307,6 @@ def test_model_various_embeddings(self): def test_torch_fx_output_loss(self): super().test_torch_fx_output_loss() - @require_bitsandbytes - @require_torch_sdpa - @require_torch_multi_gpu - @slow - def test_eager_matches_sdpa_generate(self): - """ - Overwritting the common test as the test is flaky on tiny models - """ - max_new_tokens = 30 - - model_id = "CohereForAI/c4ai-command-r-v01-4bit" - tokenizer = AutoTokenizer.from_pretrained(model_id) - - model_sdpa = CohereForCausalLM.from_pretrained( - model_id, torch_dtype=torch.float16, low_cpu_mem_usage=True, device_map="auto" - ) - self.assertTrue(model_sdpa.config._attn_implementation == "sdpa") - - model_eager = CohereForCausalLM.from_pretrained( - model_id, torch_dtype=torch.float16, 
attn_implementation="eager", device_map="auto" - ) - - self.assertTrue(model_eager.config._attn_implementation == "eager") - - for name, submodule in model_eager.named_modules(): - if "SdpaAttention" in submodule.__class__.__name__: - raise ValueError("The eager model should not have SDPA attention layers") - - has_sdpa = False - for name, submodule in model_sdpa.named_modules(): - if "SdpaAttention" in submodule.__class__.__name__: - has_sdpa = True - break - if not has_sdpa: - raise ValueError("The SDPA model should have SDPA attention layers") - - texts = [ - "hi here's a longer context, getting longer and", - "Hello this is a very long sentence my friend, very long for real", - "Today I am in Paris and", - ] - - for padding_side in ["left", "right"]: - tokenizer.padding_side = padding_side - tokenizer.pad_token = tokenizer.eos_token - - inputs = tokenizer(texts, return_tensors="pt", padding=True).to(torch_device) - - res_eager = model_eager.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False) - res_sdpa = model_sdpa.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False) - - with self.subTest(f"{padding_side}"): - torch.testing.assert_close( - res_eager, - res_sdpa, - msg=f"\n{tokenizer.batch_decode(res_eager)} \nvs\n{tokenizer.batch_decode(res_sdpa)}", - ) - @require_torch @slow diff --git a/tests/models/falcon/test_modeling_falcon.py b/tests/models/falcon/test_modeling_falcon.py index a1a2b0155cb738..ce04fae94ea904 100644 --- a/tests/models/falcon/test_modeling_falcon.py +++ b/tests/models/falcon/test_modeling_falcon.py @@ -14,7 +14,6 @@ # limitations under the License. """Testing suite for the PyTorch Falcon model.""" -import tempfile import unittest from parameterized import parameterized @@ -27,7 +26,6 @@ set_seed, ) from transformers.testing_utils import ( - is_flaky, require_bitsandbytes, require_torch, require_torch_sdpa, @@ -520,78 +518,6 @@ def test_model_rope_scaling(self): torch.testing.assert_close(ntk_sin_long, original_sin_long) self.assertTrue((ntk_scaling_rope.inv_freq <= original_rope.inv_freq).all()) - # TODO: @Fxmarty - @is_flaky(max_attempts=3, description="flaky on some models.") - @require_torch_sdpa - @slow - def test_eager_matches_sdpa_generate(self): - max_new_tokens = 30 - - if len(self.all_generative_model_classes) == 0: - self.skipTest(f"{self.__class__.__name__} tests a model that does support generate: skipping this test") - - for model_class in self.all_generative_model_classes: - if not model_class._supports_sdpa: - self.skipTest(f"{model_class.__name__} does not support SDPA") - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - dummy_input = inputs_dict[model_class.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - # make sure that all models have enough positions for generation - if hasattr(config, "max_position_embeddings"): - config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1 - - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - - dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) - - model_sdpa = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - low_cpu_mem_usage=True, - ).to(torch_device) - - self.assertTrue(model_sdpa.config._attn_implementation == "sdpa") - - model_eager = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - 
low_cpu_mem_usage=True, - attn_implementation="eager", - ).to(torch_device) - - self.assertTrue(model_eager.config._attn_implementation == "eager") - - # NOTE: This check is disabled for Falcon as the non-SDPA/SDPA implementation is in the same class (legacy reason). - # for name, submodule in model_eager.named_modules(): - # if "SdpaAttention" in submodule.__class__.__name__: - # raise ValueError("The eager model should not have SDPA attention layers") - - # has_sdpa = False - # for name, submodule in model_sdpa.named_modules(): - # if "SdpaAttention" in submodule.__class__.__name__: - # has_sdpa = True - # break - # if not has_sdpa: - # raise ValueError("The SDPA model should have SDPA attention layers") - - # Just test that a large cache works as expected - res_eager = model_eager.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=max_new_tokens, do_sample=False - ) - - res_sdpa = model_sdpa.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=max_new_tokens, do_sample=False - ) - - self.assertTrue(torch.allclose(res_eager, res_sdpa)) - @require_torch class FalconLanguageGenerationTest(unittest.TestCase): diff --git a/tests/models/glm/test_modeling_glm.py b/tests/models/glm/test_modeling_glm.py index f703ccd5096d41..32bce7cbfa615e 100644 --- a/tests/models/glm/test_modeling_glm.py +++ b/tests/models/glm/test_modeling_glm.py @@ -758,77 +758,6 @@ def get_mean_reldiff(failcase, x, ref, atol, rtol): self.assertTrue(len(fail_cases) == 0, "\n".join(fail_cases)) - @require_torch_sdpa - @slow - @is_flaky() - def test_eager_matches_sdpa_generate(self): - """Overwrite to add flakyness: outputs sometimes start to diverge after some tokens""" - - max_new_tokens = 30 - - for model_class in self.all_generative_model_classes: - if not model_class._supports_sdpa: - self.skipTest(f"{model_class.__name__} does not support SDPA") - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - dummy_input = inputs_dict[model_class.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - # make sure that all models have enough positions for generation - if hasattr(config, "max_position_embeddings"): - config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1 - - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - - dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) - - model_sdpa = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - low_cpu_mem_usage=True, - ).to(torch_device) - - self.assertTrue(model_sdpa.config._attn_implementation == "sdpa") - - model_eager = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - low_cpu_mem_usage=True, - attn_implementation="eager", - ).to(torch_device) - - self.assertTrue(model_eager.config._attn_implementation == "eager") - - for name, submodule in model_eager.named_modules(): - class_name = submodule.__class__.__name__ - if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name: - raise ValueError("The eager model should not have SDPA attention layers") - - has_sdpa = False - for name, submodule in model_sdpa.named_modules(): - class_name = submodule.__class__.__name__ - if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name: - has_sdpa = True - break - if not has_sdpa: - raise ValueError("The SDPA model should have SDPA attention layers") - 
- # Just test that a large cache works as expected - res_eager = model_eager.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=max_new_tokens, do_sample=False - ) - - res_sdpa = model_sdpa.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=max_new_tokens, do_sample=False - ) - - self.assertTrue(torch.allclose(res_eager, res_sdpa)) - @slow @require_torch_accelerator diff --git a/tests/models/gpt_neox/test_modeling_gpt_neox.py b/tests/models/gpt_neox/test_modeling_gpt_neox.py index 196f873696eb70..2c3319f02475cc 100644 --- a/tests/models/gpt_neox/test_modeling_gpt_neox.py +++ b/tests/models/gpt_neox/test_modeling_gpt_neox.py @@ -19,7 +19,7 @@ from parameterized import parameterized from transformers import AutoTokenizer, GPTNeoXConfig, is_torch_available, set_seed -from transformers.testing_utils import require_torch, require_torch_sdpa, slow, torch_device +from transformers.testing_utils import require_torch, slow, torch_device from ...generation.test_utils import GenerationTesterMixin from ...test_configuration_common import ConfigTester @@ -434,68 +434,6 @@ def test_model_rope_scaling(self): torch.testing.assert_close(ntk_sin_long, original_sin_long) self.assertTrue((ntk_scaling_rope.inv_freq <= original_rope.inv_freq).all()) - @require_torch_sdpa - @slow - def test_eager_matches_sdpa_generate(self): - """ - Based on tests.models.llama.test_modeling_llama.LlamaModelTest.test_eager_matches_sdpa_generate - which also overwrites the common test as the test is flaky on tiny models. - """ - max_new_tokens = 30 - - tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-1b") - - model_sdpa = GPTNeoXForCausalLM.from_pretrained( - "EleutherAI/pythia-1b", - torch_dtype=torch.float16, - low_cpu_mem_usage=True, - ).to(torch_device) - - self.assertTrue(model_sdpa.config._attn_implementation == "sdpa") - - model_eager = GPTNeoXForCausalLM.from_pretrained( - "EleutherAI/pythia-1b", - torch_dtype=torch.float16, - low_cpu_mem_usage=True, - attn_implementation="eager", - ).to(torch_device) - - self.assertTrue(model_eager.config._attn_implementation == "eager") - - for name, submodule in model_eager.named_modules(): - if "SdpaAttention" in submodule.__class__.__name__: - raise ValueError("The eager model should not have SDPA attention layers") - - has_sdpa = False - for name, submodule in model_sdpa.named_modules(): - if "SdpaAttention" in submodule.__class__.__name__: - has_sdpa = True - break - if not has_sdpa: - raise ValueError("The SDPA model should have SDPA attention layers") - - texts = [ - "hi here's a longer context, getting longer and", - "Hello this is a very long sentence my friend, very long for real", - "Today I am in Paris and", - ] - - for padding_side in ["left", "right"]: - tokenizer.padding_side = padding_side - tokenizer.pad_token = tokenizer.eos_token - - inputs = tokenizer(texts, return_tensors="pt", padding=True).to(torch_device) - - res_eager = model_eager.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False) - res_sdpa = model_sdpa.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False) - - with self.subTest(f"{padding_side}"): - torch.testing.assert_close( - res_eager, - res_sdpa, - msg=f"\n{tokenizer.batch_decode(res_eager)} \nvs\n{tokenizer.batch_decode(res_sdpa)}", - ) - @require_torch class GPTNeoXLanguageGenerationTest(unittest.TestCase): diff --git a/tests/models/jetmoe/test_modeling_jetmoe.py b/tests/models/jetmoe/test_modeling_jetmoe.py index 867f97c48a68ab..a04d8bba741a23 100644 --- 
a/tests/models/jetmoe/test_modeling_jetmoe.py +++ b/tests/models/jetmoe/test_modeling_jetmoe.py @@ -24,11 +24,9 @@ from transformers import AutoTokenizer, JetMoeConfig, is_torch_available from transformers.testing_utils import ( backend_empty_cache, - is_flaky, require_flash_attn, require_torch, require_torch_gpu, - require_torch_sdpa, slow, torch_device, ) @@ -302,13 +300,6 @@ class JetMoeModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMix test_disk_offload_bin = False test_disk_offload_safetensors = False - # TODO: @Fxmarty - @is_flaky(max_attempts=3, description="flaky on some models.") - @require_torch_sdpa - @slow - def test_eager_matches_sdpa_generate(self): - super().test_eager_matches_sdpa_generate() - @parameterized.expand([(1, False), (1, True), (4, False)]) def test_new_cache_format(self, num_beams, do_sample): pass diff --git a/tests/models/llama/test_modeling_llama.py b/tests/models/llama/test_modeling_llama.py index bf7ca7848951c8..824337d8bdda01 100644 --- a/tests/models/llama/test_modeling_llama.py +++ b/tests/models/llama/test_modeling_llama.py @@ -32,7 +32,6 @@ require_torch, require_torch_accelerator, require_torch_gpu, - require_torch_sdpa, slow, torch_device, ) @@ -651,67 +650,6 @@ def test_use_flash_attention_2_true(self): if not has_flash: raise ValueError("The flash model should have flash attention layers") - @require_torch_sdpa - @slow - def test_eager_matches_sdpa_generate(self): - """ - Overwritting the common test as the test is flaky on tiny models - """ - max_new_tokens = 30 - - tokenizer = LlamaTokenizer.from_pretrained("saibo/llama-1B") - - model_sdpa = LlamaForCausalLM.from_pretrained( - "saibo/llama-1B", - torch_dtype=torch.float16, - low_cpu_mem_usage=True, - ).to(torch_device) - - self.assertTrue(model_sdpa.config._attn_implementation == "sdpa") - - model_eager = LlamaForCausalLM.from_pretrained( - "saibo/llama-1B", - torch_dtype=torch.float16, - low_cpu_mem_usage=True, - attn_implementation="eager", - ).to(torch_device) - - self.assertTrue(model_eager.config._attn_implementation == "eager") - - for name, submodule in model_eager.named_modules(): - if "SdpaAttention" in submodule.__class__.__name__: - raise ValueError("The eager model should not have SDPA attention layers") - - has_sdpa = False - for name, submodule in model_sdpa.named_modules(): - if "SdpaAttention" in submodule.__class__.__name__: - has_sdpa = True - break - if not has_sdpa: - raise ValueError("The SDPA model should have SDPA attention layers") - - texts = [ - "hi here's a longer context, getting longer and", - "Hello this is a very long sentence my friend, very long for real", - "Today I am in Paris and", - ] - - for padding_side in ["left", "right"]: - tokenizer.padding_side = padding_side - tokenizer.pad_token = tokenizer.eos_token - - inputs = tokenizer(texts, return_tensors="pt", padding=True).to(torch_device) - - res_eager = model_eager.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False) - res_sdpa = model_sdpa.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False) - - with self.subTest(f"{padding_side}"): - torch.testing.assert_close( - res_eager, - res_sdpa, - msg=f"\n{tokenizer.batch_decode(res_eager)} \nvs\n{tokenizer.batch_decode(res_sdpa)}", - ) - @unittest.skip("Broken by the loss update will fix soon @ArthurZucker") def test_torch_fx_output_loss(self, *args, **kwargs): pass diff --git a/tests/models/mistral/test_modeling_mistral.py b/tests/models/mistral/test_modeling_mistral.py index 600c4ffa14b0d0..f2ee714bcdbafc 100644 --- 
a/tests/models/mistral/test_modeling_mistral.py +++ b/tests/models/mistral/test_modeling_mistral.py @@ -24,7 +24,6 @@ from transformers import AutoTokenizer, MistralConfig, is_torch_available, set_seed from transformers.testing_utils import ( backend_empty_cache, - is_flaky, require_bitsandbytes, require_flash_attn, require_read_token, @@ -332,13 +331,6 @@ def is_pipeline_test_to_skip( ): return True - # TODO: @Fxmarty - @is_flaky(max_attempts=3, description="flaky on some models.") - @require_torch_sdpa - @slow - def test_eager_matches_sdpa_generate(self): - super().test_eager_matches_sdpa_generate() - def setUp(self): self.model_tester = MistralModelTester(self) self.config_tester = ConfigTester(self, config_class=MistralConfig, hidden_size=37) diff --git a/tests/models/mixtral/test_modeling_mixtral.py b/tests/models/mixtral/test_modeling_mixtral.py index 0688435e81423c..b9b5faed851fe4 100644 --- a/tests/models/mixtral/test_modeling_mixtral.py +++ b/tests/models/mixtral/test_modeling_mixtral.py @@ -21,11 +21,9 @@ from transformers import MixtralConfig, is_torch_available from transformers.testing_utils import ( - is_flaky, require_flash_attn, require_torch, require_torch_gpu, - require_torch_sdpa, slow, torch_device, ) @@ -332,13 +330,6 @@ def is_pipeline_test_to_skip( ): return True - # TODO: @Fxmarty - @is_flaky(max_attempts=3, description="flaky on some models.") - @require_torch_sdpa - @slow - def test_eager_matches_sdpa_generate(self): - super().test_eager_matches_sdpa_generate() - def setUp(self): self.model_tester = MixtralModelTester(self) self.config_tester = ConfigTester(self, config_class=MixtralConfig, hidden_size=37) diff --git a/tests/models/mllama/test_modeling_mllama.py b/tests/models/mllama/test_modeling_mllama.py index fafa2f71331ba3..3efa7b778fb75c 100644 --- a/tests/models/mllama/test_modeling_mllama.py +++ b/tests/models/mllama/test_modeling_mllama.py @@ -132,12 +132,6 @@ def setUp(self): self.model_tester = MllamaText2TextModelTester(self) self.config_tester = ConfigTester(self, config_class=MllamaTextConfig, has_text_modality=True) - @require_torch_sdpa - @slow - @is_flaky() - def test_eager_matches_sdpa_generate(self): - super().test_eager_matches_sdpa_generate() - class MllamaVisionText2TextModelTester: def __init__( @@ -360,12 +354,6 @@ def _check_attentions_for_generate( self.assertListEqual([layer_attention.shape for layer_attention in iter_attentions], expected_shapes) - @require_torch_sdpa - @slow - @is_flaky() - def test_eager_matches_sdpa_generate(self): - super().test_eager_matches_sdpa_generate() - @require_torch_sdpa @slow @is_flaky() diff --git a/tests/models/moshi/test_modeling_moshi.py b/tests/models/moshi/test_modeling_moshi.py index b299b414d609b1..dd9302ee2c55ba 100644 --- a/tests/models/moshi/test_modeling_moshi.py +++ b/tests/models/moshi/test_modeling_moshi.py @@ -788,14 +788,10 @@ def test_left_padding_compatibility(self): @slow @is_flaky(max_attempts=5, description="flaky on some models.") def test_eager_matches_sdpa_generate(self): - if not self.has_attentions: - self.skipTest(reason="Model architecture does not support attentions") + """Overwritten -- moshi has custom inputs and custom output checks""" max_new_tokens = 5 - if len(self.all_generative_model_classes) == 0: - self.skipTest(f"{self.__class__.__name__} tests a model that does support generate: skipping this test") - for model_class in self.all_generative_model_classes: if not model_class._supports_sdpa: self.skipTest(f"{model_class.__name__} does not support SDPA") diff --git 
a/tests/models/musicgen/test_modeling_musicgen.py b/tests/models/musicgen/test_modeling_musicgen.py index 438178bfc6faa2..346ad60debe23f 100644 --- a/tests/models/musicgen/test_modeling_musicgen.py +++ b/tests/models/musicgen/test_modeling_musicgen.py @@ -819,74 +819,6 @@ def get_mean_reldiff(failcase, x, ref, atol, rtol): self.assertTrue(len(fail_cases) == 0, "\n".join(fail_cases)) - @require_torch_sdpa - @slow - # Copied from tests.test_modeling_common.ModelTesterMixin.test_eager_matches_sdpa_generate - def test_eager_matches_sdpa_generate(self): - max_new_tokens = 30 - - # Ignore copy - for model_class in self.greedy_sample_model_classes: - if not model_class._supports_sdpa: - self.skipTest(f"{model_class.__name__} does not support SDPA") - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - dummy_input = inputs_dict[model_class.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - # make sure that all models have enough positions for generation - if hasattr(config, "max_position_embeddings"): - config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1 - - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - - dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) - - model_sdpa = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - low_cpu_mem_usage=True, - ).to(torch_device) - - self.assertTrue(model_sdpa.config._attn_implementation == "sdpa") - - model_eager = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - low_cpu_mem_usage=True, - attn_implementation="eager", - ).to(torch_device) - - self.assertTrue(model_eager.config._attn_implementation == "eager") - - for name, submodule in model_eager.named_modules(): - if "SdpaAttention" in submodule.__class__.__name__: - raise ValueError("The eager model should not have SDPA attention layers") - - has_sdpa = False - for name, submodule in model_sdpa.named_modules(): - if "SdpaAttention" in submodule.__class__.__name__: - has_sdpa = True - break - if not has_sdpa: - raise ValueError("The SDPA model should have SDPA attention layers") - - # Just test that a large cache works as expected - res_eager = model_eager.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=max_new_tokens, do_sample=False - ) - - res_sdpa = model_sdpa.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=max_new_tokens, do_sample=False - ) - - self.assertTrue(torch.allclose(res_eager, res_sdpa)) - def prepare_musicgen_inputs_dict( config, @@ -2085,74 +2017,6 @@ def get_mean_reldiff(failcase, x, ref, atol, rtol): self.assertTrue(len(fail_cases) == 0, "\n".join(fail_cases)) - @require_torch_sdpa - @slow - # Copied from tests.test_modeling_common.ModelTesterMixin.test_eager_matches_sdpa_generate - def test_eager_matches_sdpa_generate(self): - max_new_tokens = 30 - - # Ignore copy - for model_class in self.greedy_sample_model_classes: - if not model_class._supports_sdpa: - self.skipTest(f"{model_class.__name__} does not support SDPA") - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - dummy_input = inputs_dict[model_class.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - # make sure that all models have enough positions for generation - if hasattr(config, 
"max_position_embeddings"): - config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1 - - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - - dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) - - model_sdpa = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - low_cpu_mem_usage=True, - ).to(torch_device) - - self.assertTrue(model_sdpa.config._attn_implementation == "sdpa") - - model_eager = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - low_cpu_mem_usage=True, - attn_implementation="eager", - ).to(torch_device) - - self.assertTrue(model_eager.config._attn_implementation == "eager") - - for name, submodule in model_eager.named_modules(): - if "SdpaAttention" in submodule.__class__.__name__: - raise ValueError("The eager model should not have SDPA attention layers") - - has_sdpa = False - for name, submodule in model_sdpa.named_modules(): - if "SdpaAttention" in submodule.__class__.__name__: - has_sdpa = True - break - if not has_sdpa: - raise ValueError("The SDPA model should have SDPA attention layers") - - # Just test that a large cache works as expected - res_eager = model_eager.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=max_new_tokens, do_sample=False - ) - - res_sdpa = model_sdpa.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=max_new_tokens, do_sample=False - ) - - self.assertTrue(torch.allclose(res_eager, res_sdpa)) - def test_requires_grad_with_frozen_encoders(self): config = self.model_tester.get_config() for model_class in self.all_model_classes: diff --git a/tests/models/musicgen_melody/test_modeling_musicgen_melody.py b/tests/models/musicgen_melody/test_modeling_musicgen_melody.py index f53fc21ba80c09..f3b6be0ac652eb 100644 --- a/tests/models/musicgen_melody/test_modeling_musicgen_melody.py +++ b/tests/models/musicgen_melody/test_modeling_musicgen_melody.py @@ -1866,74 +1866,6 @@ def get_mean_reldiff(failcase, x, ref, atol, rtol): self.assertTrue(len(fail_cases) == 0, "\n".join(fail_cases)) - @require_torch_sdpa - @slow - # Copied from tests.test_modeling_common.ModelTesterMixin.test_eager_matches_sdpa_generate - def test_eager_matches_sdpa_generate(self): - max_new_tokens = 30 - - # Ignore copy - for model_class in self.greedy_sample_model_classes: - if not model_class._supports_sdpa: - self.skipTest(f"{model_class.__name__} does not support SDPA") - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - dummy_input = inputs_dict[model_class.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - # make sure that all models have enough positions for generation - if hasattr(config, "max_position_embeddings"): - config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1 - - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - - dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) - - model_sdpa = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - low_cpu_mem_usage=True, - ).to(torch_device) - - self.assertTrue(model_sdpa.config._attn_implementation == "sdpa") - - model_eager = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - low_cpu_mem_usage=True, - attn_implementation="eager", - 
).to(torch_device) - - self.assertTrue(model_eager.config._attn_implementation == "eager") - - for name, submodule in model_eager.named_modules(): - if "SdpaAttention" in submodule.__class__.__name__: - raise ValueError("The eager model should not have SDPA attention layers") - - has_sdpa = False - for name, submodule in model_sdpa.named_modules(): - if "SdpaAttention" in submodule.__class__.__name__: - has_sdpa = True - break - if not has_sdpa: - raise ValueError("The SDPA model should have SDPA attention layers") - - # Just test that a large cache works as expected - res_eager = model_eager.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=max_new_tokens, do_sample=False - ) - - res_sdpa = model_sdpa.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=max_new_tokens, do_sample=False - ) - - self.assertTrue(torch.allclose(res_eager, res_sdpa)) - def test_requires_grad_with_frozen_encoders(self): config = self.model_tester.get_config() for model_class in self.all_model_classes: diff --git a/tests/models/olmo/test_modeling_olmo.py b/tests/models/olmo/test_modeling_olmo.py index fbe73248d00b7c..a85e9db34586f9 100644 --- a/tests/models/olmo/test_modeling_olmo.py +++ b/tests/models/olmo/test_modeling_olmo.py @@ -24,10 +24,8 @@ from transformers.models.auto.tokenization_auto import AutoTokenizer from transformers.models.gpt_neox.tokenization_gpt_neox_fast import GPTNeoXTokenizerFast from transformers.testing_utils import ( - is_flaky, require_tokenizers, require_torch, - require_torch_sdpa, slow, torch_device, ) @@ -317,13 +315,6 @@ def test_model_various_embeddings(self): def test_save_load_fast_init_from_base(self): pass - # TODO: @Fxmarty - @is_flaky(max_attempts=3, description="flaky on some models.") - @require_torch_sdpa - @slow - def test_eager_matches_sdpa_generate(self): - super().test_eager_matches_sdpa_generate() - @parameterized.expand([("linear",), ("dynamic",)]) def test_model_rope_scaling(self, scaling_type): config, _ = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/tests/models/olmoe/test_modeling_olmoe.py b/tests/models/olmoe/test_modeling_olmoe.py index 08ec1458efe146..9efadb06eb416b 100644 --- a/tests/models/olmoe/test_modeling_olmoe.py +++ b/tests/models/olmoe/test_modeling_olmoe.py @@ -22,10 +22,8 @@ from transformers.models.auto.tokenization_auto import AutoTokenizer from transformers.models.gpt_neox.tokenization_gpt_neox_fast import GPTNeoXTokenizerFast from transformers.testing_utils import ( - is_flaky, require_tokenizers, require_torch, - require_torch_sdpa, slow, torch_device, ) @@ -330,13 +328,6 @@ def test_model_various_embeddings(self): def test_save_load_fast_init_from_base(self): pass - # TODO: @Fxmarty - @is_flaky(max_attempts=3, description="flaky on some models.") - @require_torch_sdpa - @slow - def test_eager_matches_sdpa_generate(self): - super().test_eager_matches_sdpa_generate() - @parameterized.expand([("linear",), ("dynamic",)]) def test_model_rope_scaling(self, scaling_type): config, _ = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/tests/models/opt/test_modeling_opt.py b/tests/models/opt/test_modeling_opt.py index 2093dfe685b3ee..8bae2af804500b 100644 --- a/tests/models/opt/test_modeling_opt.py +++ b/tests/models/opt/test_modeling_opt.py @@ -25,7 +25,6 @@ require_torch, require_torch_accelerator, require_torch_fp16, - require_torch_sdpa, slow, torch_device, ) @@ -339,68 +338,6 @@ def test_opt_sequence_classification_model_for_multi_label(self): result = 
model(input_ids, attention_mask=attention_mask, labels=sequence_labels) self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels)) - @require_torch_sdpa - @slow - def test_eager_matches_sdpa_generate(self): - """ - Overwritting the common test as the test is flaky on tiny models - """ - max_new_tokens = 30 - - tokenizer = GPT2Tokenizer.from_pretrained("facebook/opt-350M") - - texts = [ - "hi here's a longer context, getting longer and", - "Hello this is a very long sentence my friend, very long for real", - "Today I am in Paris and", - ] - - model_sdpa = OPTForCausalLM.from_pretrained( - "facebook/opt-350M", - torch_dtype=torch.float16, - low_cpu_mem_usage=True, - attn_implementation="sdpa", - ).to(torch_device) - - self.assertTrue(model_sdpa.config._attn_implementation == "sdpa") - - model_eager = OPTForCausalLM.from_pretrained( - "facebook/opt-350M", - torch_dtype=torch.float16, - low_cpu_mem_usage=True, - attn_implementation="eager", - ).to(torch_device) - - self.assertTrue(model_eager.config._attn_implementation == "eager") - - for _, submodule in model_eager.named_modules(): - if "SdpaAttention" in submodule.__class__.__name__: - raise ValueError("The eager model should not have SDPA attention layers") - - has_sdpa = False - for _, submodule in model_sdpa.named_modules(): - if "SdpaAttention" in submodule.__class__.__name__: - has_sdpa = True - break - if not has_sdpa: - raise ValueError("The SDPA model should have SDPA attention layers") - - for padding_side in ["left", "right"]: - tokenizer.padding_side = padding_side - tokenizer.pad_token = tokenizer.eos_token - - inputs = tokenizer(texts, return_tensors="pt", padding=True).to(torch_device) - - res_eager = model_eager.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False) - res_sdpa = model_sdpa.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False) - - with self.subTest(f"{padding_side}"): - torch.testing.assert_close( - res_eager, - res_sdpa, - msg=f"\n{tokenizer.batch_decode(res_eager)} \nvs\n{tokenizer.batch_decode(res_sdpa)}", - ) - @unittest.skip(reason="Does not work on the tiny model as we keep hitting edge cases.") def test_model_parallelism(self): super().test_model_parallelism() diff --git a/tests/models/qwen2/test_modeling_qwen2.py b/tests/models/qwen2/test_modeling_qwen2.py index 301937079ae694..4e57f8e0f002fb 100644 --- a/tests/models/qwen2/test_modeling_qwen2.py +++ b/tests/models/qwen2/test_modeling_qwen2.py @@ -343,14 +343,6 @@ def is_pipeline_test_to_skip( ): return True - # Ignore copy - # TODO: @Fxmarty - @require_torch_sdpa - @slow - @unittest.skip(reason="Currently failing.") - def test_eager_matches_sdpa_generate(self): - super().test_eager_matches_sdpa_generate() - def setUp(self): self.model_tester = Qwen2ModelTester(self) self.config_tester = ConfigTester(self, config_class=Qwen2Config, hidden_size=37) diff --git a/tests/models/qwen2_moe/test_modeling_qwen2_moe.py b/tests/models/qwen2_moe/test_modeling_qwen2_moe.py index 30d7996d7e7b09..c545e882faeeb3 100644 --- a/tests/models/qwen2_moe/test_modeling_qwen2_moe.py +++ b/tests/models/qwen2_moe/test_modeling_qwen2_moe.py @@ -368,12 +368,6 @@ def is_pipeline_test_to_skip( ): return True - # Ignore copy - @require_torch_sdpa - @slow - def test_eager_matches_sdpa_generate(self): - super().test_eager_matches_sdpa_generate() - def setUp(self): self.model_tester = Qwen2MoeModelTester(self) self.config_tester = ConfigTester(self, config_class=Qwen2MoeConfig, hidden_size=37) diff --git 
a/tests/models/stablelm/test_modeling_stablelm.py b/tests/models/stablelm/test_modeling_stablelm.py index e1f9bc2b8e8f9f..91044a4eb750d1 100644 --- a/tests/models/stablelm/test_modeling_stablelm.py +++ b/tests/models/stablelm/test_modeling_stablelm.py @@ -21,11 +21,9 @@ from transformers import StableLmConfig, is_torch_available, set_seed from transformers.testing_utils import ( - is_flaky, require_bitsandbytes, require_flash_attn, require_torch, - require_torch_sdpa, slow, torch_device, ) @@ -558,67 +556,3 @@ def test_model_3b_long_prompt(self): input_ids = torch.tensor([input_ids]).to(model.model.embed_tokens.weight.device) generated_ids = model.generate(input_ids, max_new_tokens=4, temperature=0) self.assertEqual(EXPECTED_OUTPUT_TOKEN_IDS, generated_ids[0][-3:].tolist()) - - # Copied from transformers.tests.models.llama.test_modeling_llama.LlamaModelTest.test_eager_matches_sdpa_generate with Llama->StableLm,saibo/llama-1B->stabilityai/stablelm-3b-4e1t - # TODO: @Fxmarty - @is_flaky(max_attempts=3, description="flaky on some models.") - @require_torch_sdpa - @slow - def test_eager_matches_sdpa_generate(self): - """ - Overwritting the common test as the test is flaky on tiny models - """ - max_new_tokens = 30 - - tokenizer = AutoTokenizer.from_pretrained("stabilityai/stablelm-3b-4e1t") - - model_sdpa = StableLmForCausalLM.from_pretrained( - "stabilityai/stablelm-3b-4e1t", - torch_dtype=torch.float16, - low_cpu_mem_usage=True, - ).to(torch_device) - - self.assertTrue(model_sdpa.config._attn_implementation == "sdpa") - - model_eager = StableLmForCausalLM.from_pretrained( - "stabilityai/stablelm-3b-4e1t", - torch_dtype=torch.float16, - low_cpu_mem_usage=True, - attn_implementation="eager", - ).to(torch_device) - - self.assertTrue(model_eager.config._attn_implementation == "eager") - - for name, submodule in model_eager.named_modules(): - if "SdpaAttention" in submodule.__class__.__name__: - raise ValueError("The eager model should not have SDPA attention layers") - - has_sdpa = False - for name, submodule in model_sdpa.named_modules(): - if "SdpaAttention" in submodule.__class__.__name__: - has_sdpa = True - break - if not has_sdpa: - raise ValueError("The SDPA model should have SDPA attention layers") - - texts = [ - "hi here's a longer context, getting longer and", - "Hello this is a very long sentence my friend, very long for real", - "Today I am in Paris and", - ] - - for padding_side in ["left", "right"]: - tokenizer.padding_side = padding_side - tokenizer.pad_token = tokenizer.eos_token - - inputs = tokenizer(texts, return_tensors="pt", padding=True).to(torch_device) - - res_eager = model_eager.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False) - res_sdpa = model_sdpa.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False) - - with self.subTest(f"{padding_side}"): - torch.testing.assert_close( - res_eager, - res_sdpa, - msg=f"\n{tokenizer.batch_decode(res_eager)} \nvs\n{tokenizer.batch_decode(res_sdpa)}", - ) diff --git a/tests/models/xlm_roberta_xl/test_modeling_xlm_roberta_xl.py b/tests/models/xlm_roberta_xl/test_modeling_xlm_roberta_xl.py index 5b426d27799fbb..5d9abb238e793d 100644 --- a/tests/models/xlm_roberta_xl/test_modeling_xlm_roberta_xl.py +++ b/tests/models/xlm_roberta_xl/test_modeling_xlm_roberta_xl.py @@ -14,11 +14,10 @@ # limitations under the License. 
-import tempfile import unittest from transformers import XLMRobertaXLConfig, is_torch_available -from transformers.testing_utils import require_torch, require_torch_sdpa, slow, torch_device +from transformers.testing_utils import require_torch, slow, torch_device from ...generation.test_utils import GenerationTesterMixin from ...test_configuration_common import ConfigTester @@ -523,84 +522,6 @@ def test_create_position_ids_from_inputs_embeds(self): self.assertEqual(position_ids.shape, expected_positions.shape) self.assertTrue(torch.all(torch.eq(position_ids, expected_positions))) - # TODO: Remove this and use the parent method (in common tests) once XLM RoBERTa XL supports low_cpu_mem_usage=True. - @require_torch_sdpa - @slow - # Copied from tests.test_modeling_common.ModelTesterMixin.test_eager_matches_sdpa_generate - def test_eager_matches_sdpa_generate(self): - if not self.has_attentions: - self.skipTest(reason="Model architecture does not support attentions") - - max_new_tokens = 30 - - if len(self.all_generative_model_classes) == 0: - self.skipTest(f"{self.__class__.__name__} tests a model that does support generate: skipping this test") - - for model_class in self.all_generative_model_classes: - if not model_class._supports_sdpa: - self.skipTest(f"{model_class.__name__} does not support SDPA") - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - dummy_input = inputs_dict[model_class.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - # make sure that all models have enough positions for generation - if hasattr(config, "max_position_embeddings"): - config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1 - - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - - dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) - - # Ignore copy - model_sdpa = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - low_cpu_mem_usage=False, - ).to(torch_device) - - self.assertTrue(model_sdpa.config._attn_implementation == "sdpa") - - # Ignore copy - model_eager = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - low_cpu_mem_usage=False, - attn_implementation="eager", - ).to(torch_device) - - self.assertTrue(model_eager.config._attn_implementation == "eager") - - for name, submodule in model_eager.named_modules(): - class_name = submodule.__class__.__name__ - if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name: - raise ValueError("The eager model should not have SDPA attention layers") - - has_sdpa = False - for name, submodule in model_sdpa.named_modules(): - class_name = submodule.__class__.__name__ - if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name: - has_sdpa = True - break - if not has_sdpa: - raise ValueError("The SDPA model should have SDPA attention layers") - - # Just test that a large cache works as expected - res_eager = model_eager.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=max_new_tokens, do_sample=False - ) - - res_sdpa = model_sdpa.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=max_new_tokens, do_sample=False - ) - - self.assertTrue(torch.allclose(res_eager, res_sdpa)) - @require_torch class XLMRobertaModelXLIntegrationTest(unittest.TestCase): diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py 
index 964b7b912b4e0f..51d51dfcc2825c 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -4469,62 +4469,6 @@ def test_sdpa_can_compile_dynamic(self): with torch.no_grad(): _ = model(**inputs_dict) - @require_torch_sdpa - @slow - def test_eager_matches_sdpa_generate(self): - if not self.has_attentions: - self.skipTest(reason="Model architecture does not support attentions") - - max_new_tokens = 30 - - if len(self.all_generative_model_classes) == 0: - self.skipTest(f"{self.__class__.__name__} tests a model that does support generate: skipping this test") - - for model_class in self.all_generative_model_classes: - if not model_class._supports_sdpa: - self.skipTest(f"{model_class.__name__} does not support SDPA") - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - dummy_input = inputs_dict[model_class.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - # make sure that all models have enough positions for generation - if hasattr(config, "max_position_embeddings"): - config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1 - - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - - dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) - - model_sdpa = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - low_cpu_mem_usage=True, - ).to(torch_device) - - model_eager = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - low_cpu_mem_usage=True, - attn_implementation="eager", - ).to(torch_device) - - # Just test that a large cache works as expected - res_eager = model_eager.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=max_new_tokens, do_sample=False - ) - - res_sdpa = model_sdpa.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=max_new_tokens, do_sample=False - ) - - self.assertTrue(torch.allclose(res_eager, res_sdpa)) - @require_torch_sdpa def test_sdpa_matches_eager_sliding_window(self): if not self.has_attentions: From e447185b1f19df3032b11b586506225bfdf6d111 Mon Sep 17 00:00:00 2001 From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com> Date: Fri, 25 Oct 2024 10:23:20 -0400 Subject: [PATCH 112/385] Fix bnb training test failure (#34414) * Fix bnb training test: compatibility with OPTSdpaAttention --- tests/quantization/bnb/test_4bit.py | 3 ++- tests/quantization/bnb/test_mixed_int8.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/quantization/bnb/test_4bit.py b/tests/quantization/bnb/test_4bit.py index 0ac9b3d82fc7b0..3eae429abb206a 100644 --- a/tests/quantization/bnb/test_4bit.py +++ b/tests/quantization/bnb/test_4bit.py @@ -29,6 +29,7 @@ BitsAndBytesConfig, pipeline, ) +from transformers.models.opt.modeling_opt import OPTAttention from transformers.testing_utils import ( apply_skip_if_not_implemented, is_bitsandbytes_available, @@ -565,7 +566,7 @@ def test_training(self): # Step 2: add adapters for _, module in model.named_modules(): - if "OPTAttention" in repr(type(module)): + if isinstance(module, OPTAttention): module.q_proj = LoRALayer(module.q_proj, rank=16) module.k_proj = LoRALayer(module.k_proj, rank=16) module.v_proj = LoRALayer(module.v_proj, rank=16) diff --git a/tests/quantization/bnb/test_mixed_int8.py b/tests/quantization/bnb/test_mixed_int8.py index 5a99ab32e42b8c..567aa956271b70 
100644 --- a/tests/quantization/bnb/test_mixed_int8.py +++ b/tests/quantization/bnb/test_mixed_int8.py @@ -29,6 +29,7 @@ BitsAndBytesConfig, pipeline, ) +from transformers.models.opt.modeling_opt import OPTAttention from transformers.testing_utils import ( apply_skip_if_not_implemented, is_accelerate_available, @@ -868,7 +869,7 @@ def test_training(self): # Step 2: add adapters for _, module in model.named_modules(): - if "OPTAttention" in repr(type(module)): + if isinstance(module, OPTAttention): module.q_proj = LoRALayer(module.q_proj, rank=16) module.k_proj = LoRALayer(module.k_proj, rank=16) module.v_proj = LoRALayer(module.v_proj, rank=16) From f73f5e62e2383c1cb6975fca70082d6dc51ec6f2 Mon Sep 17 00:00:00 2001 From: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Date: Fri, 25 Oct 2024 17:14:07 +0200 Subject: [PATCH 113/385] Avoid check expected exception when it is on CUDA (#34408) * update * update --------- Co-authored-by: ydshieh --- .../pipelines/test_pipelines_summarization.py | 5 +++-- .../test_pipelines_text_generation.py | 18 ++++++++++-------- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/tests/pipelines/test_pipelines_summarization.py b/tests/pipelines/test_pipelines_summarization.py index 465dba9743c648..613b9dca8e1a71 100644 --- a/tests/pipelines/test_pipelines_summarization.py +++ b/tests/pipelines/test_pipelines_summarization.py @@ -85,8 +85,9 @@ def run_pipeline_test(self, summarizer, _): and len(summarizer.model.trainable_weights) > 0 and "GPU" in summarizer.model.trainable_weights[0].device ): - with self.assertRaises(Exception): - outputs = summarizer("This " * 1000) + if str(summarizer.device) == "cpu": + with self.assertRaises(Exception): + outputs = summarizer("This " * 1000) outputs = summarizer("This " * 1000, truncation=TruncationStrategy.ONLY_FIRST) @require_torch diff --git a/tests/pipelines/test_pipelines_text_generation.py b/tests/pipelines/test_pipelines_text_generation.py index 277c870b4d1074..51f3cae5e31235 100644 --- a/tests/pipelines/test_pipelines_text_generation.py +++ b/tests/pipelines/test_pipelines_text_generation.py @@ -493,17 +493,19 @@ def run_pipeline_test(self, text_generator, _): and text_generator.model.__class__.__name__ not in EXTRA_MODELS_CAN_HANDLE_LONG_INPUTS ): # Handling of large generations - with self.assertRaises((RuntimeError, IndexError, ValueError, AssertionError)): - text_generator("This is a test" * 500, max_new_tokens=20) + if str(text_generator.device) == "cpu": + with self.assertRaises((RuntimeError, IndexError, ValueError, AssertionError)): + text_generator("This is a test" * 500, max_new_tokens=20) outputs = text_generator("This is a test" * 500, handle_long_generation="hole", max_new_tokens=20) # Hole strategy cannot work - with self.assertRaises(ValueError): - text_generator( - "This is a test" * 500, - handle_long_generation="hole", - max_new_tokens=tokenizer.model_max_length + 10, - ) + if str(text_generator.device) == "cpu": + with self.assertRaises(ValueError): + text_generator( + "This is a test" * 500, + handle_long_generation="hole", + max_new_tokens=tokenizer.model_max_length + 10, + ) @require_torch @require_accelerate From 6a62a6d1b54123ede3a1e3bda57c924c64e78124 Mon Sep 17 00:00:00 2001 From: Rudy Delouya Date: Fri, 25 Oct 2024 17:52:29 +0200 Subject: [PATCH 114/385] Fix typos in agents_advanced.md (#34405) --- docs/source/en/agents_advanced.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/en/agents_advanced.md b/docs/source/en/agents_advanced.md 
index 2327357525d8d9..ddcc619b4f91f6 100644 --- a/docs/source/en/agents_advanced.md +++ b/docs/source/en/agents_advanced.md @@ -66,10 +66,10 @@ manager_agent.run("Who is the CEO of Hugging Face?") Let's take again the tool example from main documentation, for which we had implemented a `tool` decorator. -If you need to add variation, like custom attributes for your too, you can build your tool following the fine-grained method: building a class that inherits from the [`Tool`] superclass. +If you need to add variation, like custom attributes for your tool, you can build your tool following the fine-grained method: building a class that inherits from the [`Tool`] superclass. The custom tool needs: -- An attribute `name`, which corresponds to the name of the tool itself. The name usually describes what the tool does. Since the code returns the model with the most downloads for a task, let's name is `model_download_counter`. +- An attribute `name`, which corresponds to the name of the tool itself. The name usually describes what the tool does. Since the code returns the model with the most downloads for a task, let's name it `model_download_counter`. - An attribute `description` is used to populate the agent's system prompt. - An `inputs` attribute, which is a dictionary with keys `"type"` and `"description"`. It contains information that helps the Python interpreter make educated choices about the input. - An `output_type` attribute, which specifies the output type. @@ -240,4 +240,4 @@ with gr.Blocks() as demo: if __name__ == "__main__": demo.launch() -``` \ No newline at end of file +``` From 1d063793318b20654ebb850f48f43e0a247ab7bb Mon Sep 17 00:00:00 2001 From: Steven Liu <59462357+stevhliu@users.noreply.github.com> Date: Fri, 25 Oct 2024 08:52:45 -0700 Subject: [PATCH 115/385] [docs] Cache implementations (#34325) cache --- src/transformers/generation/configuration_utils.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/transformers/generation/configuration_utils.py b/src/transformers/generation/configuration_utils.py index 3c204481b04296..9b543f6c35711d 100644 --- a/src/transformers/generation/configuration_utils.py +++ b/src/transformers/generation/configuration_utils.py @@ -172,7 +172,15 @@ class GenerationConfig(PushToHubMixin): speed up decoding. cache_implementation (`str`, *optional*, default to `None`): Name of the cache class that will be instantiated in `generate`, for faster decoding. Possible values are: - {ALL_CACHE_IMPLEMENTATIONS}. We support other cache types, but they must be manually instantiated and + + - `"static"`: [`StaticCache`] + - `"offloaded_static"`: [`OffloadedStaticCache`] + - `"sliding_window"`: [`SlidingWindowCache`] + - `"hybrid"`: [`HybridCache`] + - `"mamba"`: [`MambaCache`] + - `"quantized"`: [`QuantizedCache`] + + We support other cache types, but they must be manually instantiated and passed to `generate` through the `past_key_values` argument. See our [cache documentation](https://huggingface.co/docs/transformers/en/kv_cache) for further information. 
cache_config (`CacheConfig` or `dict`, *optional*, default to `None`): From fddbd3c13cca7a51515a039c6f2497e94905acb4 Mon Sep 17 00:00:00 2001 From: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> Date: Mon, 28 Oct 2024 11:24:56 +0100 Subject: [PATCH 116/385] Fix pix2struct (#34374) * fix * fix and test use_cache test * style * remove atol --- .../models/pix2struct/modeling_pix2struct.py | 60 +++++++++++-------- .../pix2struct/test_modeling_pix2struct.py | 11 ++++ 2 files changed, 46 insertions(+), 25 deletions(-) diff --git a/src/transformers/models/pix2struct/modeling_pix2struct.py b/src/transformers/models/pix2struct/modeling_pix2struct.py index b1ac81bb1f21b6..176dadd5b883e1 100644 --- a/src/transformers/models/pix2struct/modeling_pix2struct.py +++ b/src/transformers/models/pix2struct/modeling_pix2struct.py @@ -762,11 +762,14 @@ def _relative_position_bucket(relative_position, bidirectional=True, num_buckets return relative_buckets # Adapted from transformers.models.t5.modeling_t5.T5Attention.compute_bias - def compute_bias(self, query_length, key_length, device=None): + def compute_bias(self, query_length, key_length, device=None, cache_position=None): """Compute binned relative position bias""" if device is None: device = self.relative_attention_bias.weight.device - context_position = torch.arange(query_length, dtype=torch.long, device=device)[:, None] + if cache_position is None: + context_position = torch.arange(query_length, dtype=torch.long, device=device)[:, None] + else: + context_position = cache_position[:, None].to(device) memory_position = torch.arange(key_length, dtype=torch.long, device=device)[None, :] relative_position = memory_position - context_position # shape (query_length, key_length) relative_position_bucket = self._relative_position_bucket( @@ -779,6 +782,7 @@ def compute_bias(self, query_length, key_length, device=None): values = values.permute([2, 0, 1]).unsqueeze(0) # shape (1, num_heads, query_length, key_length) return values + # Adapted from transformers.models.t5.modeling_t5.T5Attention.forward def forward( self, hidden_states, @@ -796,61 +800,66 @@ def forward( Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states). 
""" # Input is (batch_size, seq_length, dim) - # Mask is (batch_size, 1, 1, key_length) (non-causal) or (batch_size, 1, query_length, key_length) + # Mask is (batch_size, 1, 1, key_length) (non-causal) or (batch_size, 1, seq_length, key_length) (causal decoder) batch_size, seq_length = hidden_states.shape[:2] # if key_value_states are provided this layer is used as a cross-attention layer for the decoder is_cross_attention = key_value_states is not None - query_states = self.query(hidden_states).contiguous() + query_states = self.query(hidden_states) query_states = query_states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) if past_key_value is not None: is_updated = past_key_value.is_updated.get(self.layer_idx) if is_cross_attention: # after the first generated id, we can subsequently re-use all key/value_states from cache - past_key_value = past_key_value.cross_attention_cache + curr_past_key_value = past_key_value.cross_attention_cache else: - past_key_value = past_key_value.self_attention_cache + curr_past_key_value = past_key_value.self_attention_cache - # get key/value states current_states = key_value_states if is_cross_attention else hidden_states if is_cross_attention and past_key_value and is_updated: # reuse k,v, cross_attentions - key_states = past_key_value.key_cache[self.layer_idx] - value_states = past_key_value.value_cache[self.layer_idx] + key_states = curr_past_key_value.key_cache[self.layer_idx] + value_states = curr_past_key_value.value_cache[self.layer_idx] else: - key_states = self.key(current_states).contiguous() - value_states = self.value(current_states).contiguous() + key_states = self.key(current_states) + value_states = self.value(current_states) key_states = key_states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) value_states = value_states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) + if past_key_value is not None: # save all key/value_states to cache to be re-used for fast auto-regressive generation cache_position = cache_position if not is_cross_attention else None - key_states, value_states = past_key_value.update( + key_states, value_states = curr_past_key_value.update( key_states, value_states, self.layer_idx, {"cache_position": cache_position} ) # set flag that curr layer for cross-attn is already updated so we can re-use in subsequent calls if is_cross_attention: past_key_value.is_updated[self.layer_idx] = True - # compute scores + # compute scores, equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states), compatible with onnx op>9 scores = torch.matmul(query_states, key_states.transpose(3, 2)) if position_bias is None: - real_seq_length = cache_position[-1] + 1 if query_length is None else query_length - key_length = real_seq_length if key_value_states is None else key_value_states.shape[1] + key_length = key_states.shape[-2] + # cache position is 0-indexed so we add 1 to get the real length of queries (aka with past) + real_seq_length = query_length if query_length is not None else cache_position[-1] + 1 if not self.has_relative_attention_bias: position_bias = torch.zeros( - (1, self.n_heads, real_seq_length, key_length), device=scores.device, dtype=scores.dtype + (1, self.n_heads, seq_length, key_length), device=scores.device, dtype=scores.dtype ) if self.gradient_checkpointing and self.training: position_bias.requires_grad = True else: - position_bias = self.compute_bias(real_seq_length, key_length, device=scores.device) + position_bias = 
self.compute_bias( + real_seq_length, key_length, device=scores.device, cache_position=cache_position + ) + position_bias = position_bias[:, :, -seq_length:, :] if mask is not None: - position_bias = position_bias + mask # (batch_size, n_heads, seq_length, key_length) + causal_mask = mask[:, :, :, : key_states.shape[-2]] + position_bias = position_bias + causal_mask if self.pruned_heads: mask = torch.ones(position_bias.shape[1]) @@ -860,10 +869,9 @@ def forward( position_bias_masked = position_bias scores += position_bias_masked - # (batch_size, n_heads, seq_length, key_length) - attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as(scores) # (batch_size, n_heads, seq_length, key_length) + attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as(scores) attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) # Mask heads if we want to @@ -871,12 +879,12 @@ def forward( attn_weights = attn_weights * layer_head_mask attn_output = torch.matmul(attn_weights, value_states) - # (batch_size, seq_length, dim) - attn_output = attn_output.transpose(1, 2).contiguous().view(batch_size, -1, self.inner_dim) + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.view(batch_size, -1, self.inner_dim) attn_output = self.output(attn_output) - outputs = (attn_output,) + (past_key_value,) + (position_bias,) + outputs = (attn_output, past_key_value, position_bias) if output_attentions: outputs = outputs + (attn_weights,) @@ -969,7 +977,10 @@ def __init__(self, config, has_relative_attention_bias=False, layer_idx: Optiona layer_idx=layer_idx, ) - self.encoder_decoder_attention = Pix2StructTextLayerCrossAttention(config) + self.encoder_decoder_attention = Pix2StructTextLayerCrossAttention( + config, + layer_idx=layer_idx, + ) self.mlp = Pix2StructTextLayerFF(config) @@ -1019,7 +1030,6 @@ def forward( query_length=cache_position[-1] + 1, use_cache=use_cache, output_attentions=output_attentions, - cache_position=cache_position, ) hidden_states, past_key_value = cross_attention_outputs[:2] diff --git a/tests/models/pix2struct/test_modeling_pix2struct.py b/tests/models/pix2struct/test_modeling_pix2struct.py index 2d762008cbbc3d..18b79f3fbc9c04 100644 --- a/tests/models/pix2struct/test_modeling_pix2struct.py +++ b/tests/models/pix2struct/test_modeling_pix2struct.py @@ -419,6 +419,7 @@ def prepare_config_and_inputs_for_common(self): @require_torch class Pix2StructModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): all_model_classes = (Pix2StructForConditionalGeneration,) if is_torch_available() else () + all_generative_model_classes = (Pix2StructForConditionalGeneration,) if is_torch_available() else {} pipeline_model_mapping = {"image-to-text": Pix2StructForConditionalGeneration} if is_torch_available() else {} fx_compatible = False test_head_masking = False @@ -445,6 +446,16 @@ def test_model(self): ), ) + def test_generative_model(self): + config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() + for model_class in self.all_generative_model_classes: + model = model_class(config).eval().to(torch_device) + + output = model.generate(**input_dict, use_cache=False, min_new_tokens=10, max_new_tokens=10) + output_use_cache = model.generate(**input_dict, use_cache=True, min_new_tokens=10, max_new_tokens=10) + + torch.testing.assert_close(output, output_use_cache) + @unittest.skip(reason="Hidden_states is tested in individual model tests") def test_hidden_states_output(self): pass From 
fc465bb196c3f014b1be43aa599a6183e660cccc Mon Sep 17 00:00:00 2001 From: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Date: Mon, 28 Oct 2024 11:59:46 +0100 Subject: [PATCH 117/385] pin `tensorflow_probability<0.22` in docker files (#34381) 0.21 Co-authored-by: ydshieh --- docker/transformers-all-latest-gpu/Dockerfile | 2 +- docker/transformers-tensorflow-gpu/Dockerfile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/transformers-all-latest-gpu/Dockerfile b/docker/transformers-all-latest-gpu/Dockerfile index 93f9b6f6a170fd..7ad4e96d62cde7 100644 --- a/docker/transformers-all-latest-gpu/Dockerfile +++ b/docker/transformers-all-latest-gpu/Dockerfile @@ -26,7 +26,7 @@ RUN git clone https://github.com/huggingface/transformers && cd transformers && # 1. Put several commands in a single `RUN` to avoid image/layer exporting issue. Could be revised in the future. # 2. Regarding `torch` part, We might need to specify proper versions for `torchvision` and `torchaudio`. # Currently, let's not bother to specify their versions explicitly (so installed with their latest release versions). -RUN python3 -m pip install --no-cache-dir -U tensorflow==2.13 protobuf==3.20.3 tensorflow_text tensorflow_probability && python3 -m pip install --no-cache-dir -e ./transformers[dev,onnxruntime] && [ ${#PYTORCH} -gt 0 -a "$PYTORCH" != "pre" ] && VERSION='torch=='$PYTORCH'.*' || VERSION='torch'; echo "export VERSION='$VERSION'" >> ~/.profile && echo torch=$VERSION && [ "$PYTORCH" != "pre" ] && python3 -m pip install --no-cache-dir -U $VERSION torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/$CUDA || python3 -m pip install --no-cache-dir -U --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/$CUDA +RUN python3 -m pip install --no-cache-dir -U tensorflow==2.13 protobuf==3.20.3 "tensorflow_text<2.16" "tensorflow_probability<0.22" && python3 -m pip install --no-cache-dir -e ./transformers[dev,onnxruntime] && [ ${#PYTORCH} -gt 0 -a "$PYTORCH" != "pre" ] && VERSION='torch=='$PYTORCH'.*' || VERSION='torch'; echo "export VERSION='$VERSION'" >> ~/.profile && echo torch=$VERSION && [ "$PYTORCH" != "pre" ] && python3 -m pip install --no-cache-dir -U $VERSION torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/$CUDA || python3 -m pip install --no-cache-dir -U --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/$CUDA RUN python3 -m pip uninstall -y flax jax diff --git a/docker/transformers-tensorflow-gpu/Dockerfile b/docker/transformers-tensorflow-gpu/Dockerfile index d765767780f46c..378491a6c60007 100644 --- a/docker/transformers-tensorflow-gpu/Dockerfile +++ b/docker/transformers-tensorflow-gpu/Dockerfile @@ -18,7 +18,7 @@ RUN [ ${#TENSORFLOW} -gt 0 ] && VERSION='tensorflow=='$TENSORFLOW'.*' || VERSIO RUN python3 -m pip uninstall -y torch flax RUN python3 -m pip install -U "itsdangerous<2.1.0" -RUN python3 -m pip install --no-cache-dir -U tensorflow_probability +RUN python3 -m pip install --no-cache-dir -U "tensorflow_probability<0.22" # When installing in editable mode, `transformers` is not recognized as a package. # this line must be added in order for python to be aware of transformers. 
From 9360f1827d620c00d64755d40cd526dceabf5060 Mon Sep 17 00:00:00 2001 From: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Date: Mon, 28 Oct 2024 12:01:05 +0100 Subject: [PATCH 118/385] Tiny update after #34383 (#34404) * update * update * update --------- Co-authored-by: ydshieh --- utils/check_bad_commit.py | 3 +++ utils/notification_service.py | 3 ++- utils/process_bad_commit_report.py | 2 ++ 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/utils/check_bad_commit.py b/utils/check_bad_commit.py index adb25f11264b12..45b01537127fac 100644 --- a/utils/check_bad_commit.py +++ b/utils/check_bad_commit.py @@ -75,6 +75,9 @@ def find_bad_commit(target_test, start_commit, end_commit): `str`: The earliest commit at which `target_test` fails. """ + if start_commit == end_commit: + return start_commit + create_script(target_test=target_test) bash = f""" diff --git a/utils/notification_service.py b/utils/notification_service.py index 629b793337889a..039ee8b29a3781 100644 --- a/utils/notification_service.py +++ b/utils/notification_service.py @@ -547,7 +547,8 @@ def payload(self) -> str: items = re.findall(pattern, line) elif "tests/models/" in line: model = line.split("/")[2] - new_failed_tests[model] = {"single-gpu": [], "multi-gpu": []} + if model not in new_failed_tests: + new_failed_tests[model] = {"single-gpu": [], "multi-gpu": []} for url, device in items: new_failed_tests[model][f"{device}-gpu"].append(line) file_path = os.path.join(os.getcwd(), f"ci_results_{job_name}/new_model_failures.json") diff --git a/utils/process_bad_commit_report.py b/utils/process_bad_commit_report.py index 513dc8df3a3b3c..19812ff21f7d19 100644 --- a/utils/process_bad_commit_report.py +++ b/utils/process_bad_commit_report.py @@ -64,6 +64,8 @@ for device, failed_tests in model_result.items(): failed_tests = [x for x in failed_tests if x["author"] == author or x["merged_by"] == author] model_result[device] = failed_tests + _data[model] = {k: v for k, v in model_result.items() if len(v) > 0} + new_data_full[author] = {k: v for k, v in _data.items() if len(v) > 0} # Upload to Hub and get the url with open("new_model_failures_with_bad_commit_grouped_by_authors.json", "w") as fp: From 92bcdff2ef0932cf6dec4c3538389d7ccfd92f59 Mon Sep 17 00:00:00 2001 From: Nischay Date: Mon, 28 Oct 2024 17:53:52 +0530 Subject: [PATCH 119/385] Fix batch size handling in prediction_loop for DataLoaderShard (#34343) * Fix batch size handling in prediction_loop for DataLoaderShard Updated the prediction_loop method in the Trainer class to correctly handle batch size when using DataLoaderShard. This ensures that the batch size is retrieved from total_batch_size for distributed training scenarios, preventing TypeError related to NoneType during evaluation. 
* Update src/transformers/trainer.py Co-authored-by: Zach Mueller * Applied the fix to remove unused imports --------- Co-authored-by: Zach Mueller --- src/transformers/trainer.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 1b13787007e9c3..8fe25b74661995 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -4714,7 +4714,17 @@ def prediction_loop( elif args.bf16_full_eval: model = model.to(dtype=torch.bfloat16, device=args.device) - batch_size = dataloader.batch_size + batch_size = ( + dataloader.total_batch_size + if getattr(dataloader, "_is_accelerate_prepared", False) + else dataloader.batch_size + ) + + if batch_size is None: + raise ValueError( + "Batch size cannot be None. Ensure the dataloader has a valid batch_size or total_batch_size." + ) + num_examples = self.num_examples(dataloader) logger.info(f"\n***** Running {description} *****") logger.info(f" Num examples = {num_examples}") From 8b3b9b48fcd6bc06bd9c576f1b09266d577db257 Mon Sep 17 00:00:00 2001 From: AbdelKarim ELJANDOUBI <78537694+eljandoubi@users.noreply.github.com> Date: Mon, 28 Oct 2024 13:50:16 +0100 Subject: [PATCH 120/385] exclude fsdp from delay_optimizer_creation (#34140) * exclude fsdp from delay_optimizer_creation * add test case for trainer: FSDP mode and fp8 as mixed precision * rearrange imports * ruff formatted * adapt _init_fsdp to fp8 * use _init_fsdp only when resume_from_checkpoint * In case of FDP, self.layer will be CheckpointWrapper which has no len() method * delete _init_fsdp * solve conflict * fix conflict * make fixup --- src/transformers/testing_utils.py | 8 ++++++++ src/transformers/trainer.py | 7 +++++-- tests/trainer/test_trainer_fsdp.py | 32 ++++++++++++++++++++++++++++++ 3 files changed, 45 insertions(+), 2 deletions(-) diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py index 7bb2d5049dccf8..2781e9e102e050 100644 --- a/src/transformers/testing_utils.py +++ b/src/transformers/testing_utils.py @@ -144,6 +144,7 @@ if is_accelerate_available(): from accelerate.state import AcceleratorState, PartialState + from accelerate.utils.imports import is_fp8_available if is_pytest_available(): @@ -1000,6 +1001,13 @@ def require_torch_fp16(test_case): )(test_case) +def require_fp8(test_case): + """Decorator marking a test that requires supports for fp8""" + return unittest.skipUnless(is_accelerate_available() and is_fp8_available(), "test requires fp8 support")( + test_case + ) + + def require_torch_bf16(test_case): """Decorator marking a test that requires a device that supports bf16""" return unittest.skipUnless( diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 8fe25b74661995..64cb5c6bd4ddbb 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -2209,7 +2209,7 @@ def _inner_training_loop( else: debug_overflow = DebugUnderflowOverflow(self.model) # noqa - delay_optimizer_creation = is_sagemaker_mp_enabled() or self.is_fsdp_xla_enabled or self.is_fsdp_enabled + delay_optimizer_creation = is_sagemaker_mp_enabled() or self.is_fsdp_xla_enabled # We need to reset the scheduler, as its parameters may be different on subsequent calls if self._created_lr_scheduler: @@ -2258,9 +2258,12 @@ def _inner_training_loop( # FSDP-XLA, SageMaker MP/DP, DataParallel, IPEX use_accelerator_prepare = True if model is self.model else False + # configure fsdp plugin for qlora if any + if use_accelerator_prepare: + 
self._fsdp_qlora_plugin_updates() + if delay_optimizer_creation: if use_accelerator_prepare: - self._fsdp_qlora_plugin_updates() self.model = self.accelerator.prepare(self.model) self.create_optimizer_and_scheduler(num_training_steps=max_steps) diff --git a/tests/trainer/test_trainer_fsdp.py b/tests/trainer/test_trainer_fsdp.py index 994a82a8db0c44..4bcf5de04520e2 100644 --- a/tests/trainer/test_trainer_fsdp.py +++ b/tests/trainer/test_trainer_fsdp.py @@ -20,6 +20,8 @@ execute_subprocess_async, get_torch_dist_unique_port, require_accelerate, + require_fp8, + require_fsdp, require_torch_multi_gpu, ) @@ -64,6 +66,7 @@ def __getitem__(self, i: int) -> str: class TestFSDPTrainer(TestCasePlus): @require_accelerate @require_torch_multi_gpu + @require_fsdp def test_trainer(self): output_dir = self.get_auto_remove_tmp_dir() cmd = [ @@ -86,6 +89,35 @@ def test_trainer(self): # successful return here == success - any errors would have caused an error in the sub-call +class TestFSDPTrainerFP8(TestCasePlus): + @require_accelerate + @require_torch_multi_gpu + @require_fsdp + @require_fp8 + def test_trainer(self): + output_dir = self.get_auto_remove_tmp_dir() + cmd = [ + "accelerate", + "launch", + "--use_fsdp", + "--main_process_port", + f"{get_torch_dist_unique_port()}", + "--num_processes", + f"{torch.cuda.device_count()}", + "--mixed_precision", + "fp8", + "--fsdp_transformer_layer_cls_to_wrap", + "GPT2Block", + f"{self.test_file_dir}/test_trainer_fsdp.py", + "--output_dir", + f"{output_dir}", + "--report_to", + "none", + ] + execute_subprocess_async(cmd, env=self.get_env()) + # successful return here == success - any errors would have caused an error in the sub-call + + if __name__ == "__main__": parser = HfArgumentParser((Seq2SeqTrainingArguments,)) training_args = parser.parse_args_into_dataclasses()[0] From c1753436dbb8bcbcee183cdd6eba9f08a90d602a Mon Sep 17 00:00:00 2001 From: "Sean (Seok-Won) Yi" Date: Tue, 29 Oct 2024 00:02:22 +0900 Subject: [PATCH 121/385] New option called `"best"` for `args.save_strategy`. (#31817) * Add _determine_best_metric and new saving logic. 1. Logic to determine the best logic was separated out from `_save_checkpoint`. 2. In `_maybe_log_save_evaluate`, whether or not a new best metric was achieved is determined after each evaluation, and if the save strategy is "best' then the TrainerControl is updated accordingly. * Added SaveStrategy. Same as IntervalStrategy, but with a new attribute called BEST. * IntervalStrategy -> SaveStrategy * IntervalStratgy -> SaveStrategy for save_strat. * Interval -> Save in docstring. * Updated docstring for save_strategy. * Added SaveStrategy and made according changes. `save_strategy` previously followed `IntervalStrategy` but now follows `SaveStrategy`. Changes were made accordingly to the code and the docstring. * Changes from `make fixup`. * Removed redundant metrics argument. * Added new test_save_best_checkpoint test. 1. Checks for both cases where `metric_for_best_model` is explicitly provided and when it's not provided. 2. The first case should have two checkpoints saved, whereas the second should have three saved. * Changed should_training_end saving logic. The Trainer saves a checkpoints at the end of training by default as long as `save_strategy != SaveStrategy.NO`. This condition was modified to include `SaveStrategy.BEST` because it would be counterintuitive that we'd only want the best checkpoint to be saved but the last one is as well. * `args.metric_for_best_model` default to loss. * Undo metric_for_best_model update. 
* Remove checking metric_for_best_model. * Added test cases for loss and no metric. * Added error for metric and changed default best_metric. * Removed unused import. * `new_best_metric` -> `is_new_best_metric` Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> * Applied `is_new_best_metric` to all. Changes were made for consistency and also to fix a potential bug. --------- Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> Co-authored-by: Zach Mueller --- src/transformers/trainer.py | 84 ++++++++++++++++++---------- src/transformers/trainer_callback.py | 8 +-- src/transformers/trainer_utils.py | 7 +++ src/transformers/training_args.py | 14 +++-- src/transformers/training_args_tf.py | 2 +- tests/trainer/test_trainer.py | 83 +++++++++++++++++++++++++++ 6 files changed, 158 insertions(+), 40 deletions(-) diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 64cb5c6bd4ddbb..4315e54a42fc2e 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -117,9 +117,9 @@ EvalPrediction, HPSearchBackend, HubStrategy, - IntervalStrategy, PredictionOutput, RemoveColumnsCollator, + SaveStrategy, TrainerMemoryTracker, TrainOutput, check_target_module_exists, @@ -419,6 +419,12 @@ def __init__( raise ValueError( f"You have set `args.eval_strategy` to {args.eval_strategy} but you didn't pass an `eval_dataset` to `Trainer`. Either set `args.eval_strategy` to `no` or pass an `eval_dataset`. " ) + if args.save_strategy == SaveStrategy.BEST or args.load_best_model_at_end: + if args.metric_for_best_model is None: + raise ValueError( + "`args.metric_for_best_model` must be provided when using 'best' save_strategy or if `args.load_best_model_at_end` is set to `True`." + ) + self.args = args self.compute_loss_func = compute_loss_func # Seed must be set before instantiating the model when using model @@ -2998,9 +3004,13 @@ def _maybe_log_save_evaluate(self, tr_loss, grad_norm, model, trial, epoch, igno metrics = None if self.control.should_evaluate: metrics = self._evaluate(trial, ignore_keys_for_eval) + is_new_best_metric = self._determine_best_metric(metrics=metrics, trial=trial) + + if self.args.save_strategy == SaveStrategy.BEST: + self.control.should_save = is_new_best_metric if self.control.should_save: - self._save_checkpoint(model, trial, metrics=metrics) + self._save_checkpoint(model, trial) self.control = self.callback_handler.on_save(self.args, self.state, self.control) def _load_rng_state(self, checkpoint): @@ -3077,7 +3087,48 @@ def _load_rng_state(self, checkpoint): "\nThis won't yield the same results as if the training had not been interrupted." ) - def _save_checkpoint(self, model, trial, metrics=None): + def _determine_best_metric(self, metrics, trial): + """ + Determine if the model should be saved based on the evaluation metrics. + If args.metric_for_best_model is not set, the loss is used. + + Returns: + bool: True if a new best metric was found, else False + """ + is_new_best_metric = False + + if self.args.metric_for_best_model is not None: + metric_to_check = self.args.metric_for_best_model + + if not metric_to_check.startswith("eval_"): + metric_to_check = f"eval_{metric_to_check}" + + try: + metric_value = metrics[metric_to_check] + except KeyError as exc: + raise KeyError( + f"The `metric_for_best_model` training argument is set to '{metric_to_check}', which is not found in the evaluation metrics. " + f"The available evaluation metrics are: {list(metrics.keys())}. 
Consider changing the `metric_for_best_model` via the TrainingArguments." + ) from exc + + operator = np.greater if self.args.greater_is_better else np.less + + if self.state.best_metric is None: + self.state.best_metric = float("-inf") if self.args.greater_is_better else float("inf") + + if operator(metric_value, self.state.best_metric): + run_dir = self._get_output_dir(trial=trial) + checkpoint_folder = f"{PREFIX_CHECKPOINT_DIR}-{self.state.global_step}" + output_dir = os.path.join(run_dir, checkpoint_folder) + + self.state.best_metric = metric_value + self.state.best_model_checkpoint = output_dir + + is_new_best_metric = True + + return is_new_best_metric + + def _save_checkpoint(self, model, trial): # In all cases, including ddp/dp/deepspeed, self.model is always a reference to the model we # want to save except FullyShardedDDP. # assert unwrap_model(model) is self.model, "internal model should be a reference to self.model" @@ -3098,31 +3149,6 @@ def _save_checkpoint(self, model, trial, metrics=None): # Save RNG state self._save_rng_state(output_dir) - # Determine the new best metric / best model checkpoint - if metrics is not None and self.args.metric_for_best_model is not None: - metric_to_check = self.args.metric_for_best_model - if not metric_to_check.startswith("eval_"): - metric_to_check = f"eval_{metric_to_check}" - try: - metric_value = metrics[metric_to_check] - except KeyError as exc: - raise KeyError( - f"The `metric_for_best_model` training argument is set to '{metric_to_check}', " - f"which is not found in the evaluation metrics. " - f"The available evaluation metrics are: {list(metrics.keys())}. " - f"Please ensure that the `compute_metrics` function returns a dictionary that includes '{metric_to_check}' or " - f"consider changing the `metric_for_best_model` via the TrainingArguments." 
- ) from exc - - operator = np.greater if self.args.greater_is_better else np.less - if ( - self.state.best_metric is None - or self.state.best_model_checkpoint is None - or operator(metric_value, self.state.best_metric) - ): - self.state.best_metric = metric_value - self.state.best_model_checkpoint = output_dir - # Save the Trainer state if self.args.should_save: # Update `ExportableState` callbacks and `TrainerControl` state to where we are currently @@ -4543,7 +4569,7 @@ def _push_from_checkpoint(self, checkpoint_folder): # Same for the training arguments torch.save(self.args, os.path.join(output_dir, TRAINING_ARGS_NAME)) - if self.args.save_strategy == IntervalStrategy.STEPS: + if self.args.save_strategy == SaveStrategy.STEPS: commit_message = f"Training in progress, step {self.state.global_step}" else: commit_message = f"Training in progress, epoch {int(self.state.epoch)}" diff --git a/src/transformers/trainer_callback.py b/src/transformers/trainer_callback.py index 405874acf8f4c4..ce9f2a26732c2e 100644 --- a/src/transformers/trainer_callback.py +++ b/src/transformers/trainer_callback.py @@ -24,7 +24,7 @@ import numpy as np from tqdm.auto import tqdm -from .trainer_utils import IntervalStrategy, has_length +from .trainer_utils import IntervalStrategy, SaveStrategy, has_length from .training_args import TrainingArguments from .utils import logging @@ -555,7 +555,7 @@ def on_step_end(self, args: TrainingArguments, state: TrainerState, control: Tra # Save if ( - args.save_strategy == IntervalStrategy.STEPS + args.save_strategy == SaveStrategy.STEPS and state.save_steps > 0 and state.global_step % state.save_steps == 0 ): @@ -565,7 +565,7 @@ def on_step_end(self, args: TrainingArguments, state: TrainerState, control: Tra if state.global_step >= state.max_steps: control.should_training_stop = True # Save the model at the end if we have a save strategy - if args.save_strategy != IntervalStrategy.NO: + if args.save_strategy not in [SaveStrategy.NO, SaveStrategy.BEST]: control.should_save = True return control @@ -580,7 +580,7 @@ def on_epoch_end(self, args: TrainingArguments, state: TrainerState, control: Tr control.should_evaluate = True # Save - if args.save_strategy == IntervalStrategy.EPOCH: + if args.save_strategy == SaveStrategy.EPOCH: control.should_save = True return control diff --git a/src/transformers/trainer_utils.py b/src/transformers/trainer_utils.py index 02c298cf7d2e65..42088cd730628d 100644 --- a/src/transformers/trainer_utils.py +++ b/src/transformers/trainer_utils.py @@ -227,6 +227,13 @@ class IntervalStrategy(ExplicitEnum): EPOCH = "epoch" +class SaveStrategy(ExplicitEnum): + NO = "no" + STEPS = "steps" + EPOCH = "epoch" + BEST = "best" + + class EvaluationStrategy(ExplicitEnum): NO = "no" STEPS = "steps" diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py index 485610dd9baa28..c98e8bc41b924d 100644 --- a/src/transformers/training_args.py +++ b/src/transformers/training_args.py @@ -33,6 +33,7 @@ FSDPOption, HubStrategy, IntervalStrategy, + SaveStrategy, SchedulerType, ) from .utils import ( @@ -349,12 +350,13 @@ class TrainingArguments: - save_strategy (`str` or [`~trainer_utils.IntervalStrategy`], *optional*, defaults to `"steps"`): + save_strategy (`str` or [`~trainer_utils.SaveStrategy`], *optional*, defaults to `"steps"`): The checkpoint save strategy to adopt during training. Possible values are: - `"no"`: No save is done during training. - `"epoch"`: Save is done at the end of each epoch. - `"steps"`: Save is done every `save_steps`. 
+ - `"best"`: Save is done whenever a new `best_metric` is achieved. If `"epoch"` or `"steps"` is chosen, saving will also be performed at the very end of training, always. @@ -962,7 +964,7 @@ class TrainingArguments: }, ) logging_nan_inf_filter: bool = field(default=True, metadata={"help": "Filter nan and inf losses for logging."}) - save_strategy: Union[IntervalStrategy, str] = field( + save_strategy: Union[SaveStrategy, str] = field( default="steps", metadata={"help": "The checkpoint save strategy to use."}, ) @@ -1580,7 +1582,7 @@ def __post_init__(self): self.eval_strategy = IntervalStrategy(self.eval_strategy) self.logging_strategy = IntervalStrategy(self.logging_strategy) - self.save_strategy = IntervalStrategy(self.save_strategy) + self.save_strategy = SaveStrategy(self.save_strategy) self.hub_strategy = HubStrategy(self.hub_strategy) self.lr_scheduler_type = SchedulerType(self.lr_scheduler_type) @@ -1616,7 +1618,7 @@ def __post_init__(self): if self.eval_steps != int(self.eval_steps): raise ValueError(f"--eval_steps must be an integer if bigger than 1: {self.eval_steps}") self.eval_steps = int(self.eval_steps) - if self.save_strategy == IntervalStrategy.STEPS and self.save_steps > 1: + if self.save_strategy == SaveStrategy.STEPS and self.save_steps > 1: if self.save_steps != int(self.save_steps): raise ValueError(f"--save_steps must be an integer if bigger than 1: {self.save_steps}") self.save_steps = int(self.save_steps) @@ -2750,8 +2752,8 @@ def set_save( 100 ``` """ - self.save_strategy = IntervalStrategy(strategy) - if self.save_strategy == IntervalStrategy.STEPS and steps == 0: + self.save_strategy = SaveStrategy(strategy) + if self.save_strategy == SaveStrategy.STEPS and steps == 0: raise ValueError("Setting `strategy` as 'steps' requires a positive value for `steps`.") self.save_steps = steps self.save_total_limit = total_limit diff --git a/src/transformers/training_args_tf.py b/src/transformers/training_args_tf.py index 9df53c3f1d6161..3716a78879d501 100644 --- a/src/transformers/training_args_tf.py +++ b/src/transformers/training_args_tf.py @@ -114,7 +114,7 @@ class TFTrainingArguments(TrainingArguments): Whether to log and evaluate the first `global_step` or not. logging_steps (`int`, *optional*, defaults to 500): Number of update steps between two logs if `logging_strategy="steps"`. - save_strategy (`str` or [`~trainer_utils.IntervalStrategy`], *optional*, defaults to `"steps"`): + save_strategy (`str` or [`~trainer_utils.SaveStrategy`], *optional*, defaults to `"steps"`): The checkpoint save strategy to adopt during training. Possible values are: - `"no"`: No save is done during training. diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index 5c03355785d2b5..b6fe807fa4961a 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -4041,6 +4041,89 @@ def test_trainer_saves_processor(self): reloaded_tokenizer(test_sentence, padding="max_length").input_ids, ) + def test_save_best_checkpoint(self): + freq = int(64 / self.batch_size) + total = int(self.n_epochs * 64 / self.batch_size) + + # Case 1: args.metric_for_best_model == "accuracy". 
+ with tempfile.TemporaryDirectory() as tmpdir: + trainer = get_regression_trainer( + a=1.5, + b=2.5, + output_dir=tmpdir, + learning_rate=0.1, + eval_strategy="epoch", + save_strategy="best", + metric_for_best_model="accuracy", + compute_metrics=AlmostAccuracy(), + ) + self.assertTrue(trainer.args.metric_for_best_model == "accuracy") + + with patch.object( + trainer, + "_evaluate", + side_effect=[ + {"eval_loss": 0.03, "eval_accuracy": 0.60, "epoch": 1.0}, + {"eval_loss": 0.02, "eval_accuracy": 0.65, "epoch": 2.0}, + {"eval_loss": 0.01, "eval_accuracy": 0.64, "epoch": 3.0}, + ], + ): + trainer.train() + + self.assertEqual(len(os.listdir(tmpdir)), 2) + self.check_saved_checkpoints( + output_dir=tmpdir, + freq=freq, + total=total, + ) + + # Case 2: args.metric_for_best_model == "loss". + with tempfile.TemporaryDirectory() as tmpdir: + trainer = get_regression_trainer( + a=1.5, + b=2.5, + output_dir=tmpdir, + learning_rate=0.1, + eval_strategy="epoch", + save_strategy="best", + metric_for_best_model="loss", + compute_metrics=AlmostAccuracy(), + ) + self.assertTrue(trainer.args.metric_for_best_model == "loss") + + with patch.object( + trainer, + "_evaluate", + side_effect=[ + {"eval_loss": 0.03, "eval_accuracy": 0.60, "epoch": 1.0}, + {"eval_loss": 0.02, "eval_accuracy": 0.65, "epoch": 2.0}, + {"eval_loss": 0.03, "eval_accuracy": 0.66, "epoch": 3.0}, + ], + ): + trainer.train() + + self.assertEqual(len(os.listdir(tmpdir)), 2) + self.check_saved_checkpoints( + output_dir=tmpdir, + freq=freq, + total=total, + ) + + # Case 3: Metric name not provided; throw error. + with tempfile.TemporaryDirectory() as tmpdir: + with self.assertRaises(ValueError) as context: + trainer = get_regression_trainer( + a=1.5, + b=2.5, + output_dir=tmpdir, + learning_rate=0.1, + eval_strategy="epoch", + save_strategy="best", + compute_metrics=AlmostAccuracy(), + ) + + self.assertIn("`args.metric_for_best_model` must be provided", str(context.exception)) + @require_torch @is_staging_test From fc1ae7f30f1d16c7652c28dd8d91c5d8a8ed2f15 Mon Sep 17 00:00:00 2001 From: Vijay Date: Mon, 28 Oct 2024 21:44:07 +0530 Subject: [PATCH 122/385] [docs] update input documentation for MAMBA2 and MISTRAL models to include cache_position and attention_mask details (#34322) * [docs] update input documentation for MAMBA2 and MISTRAL models to include cache_position and attention_mask details * [docs] correct input documentation for MISTRAL model to reference `input_ids` instead of `decoder_input_ids` * [docs] clarify cache_position description in MISTRAL model documentation --- src/transformers/models/mamba2/modeling_mamba2.py | 10 ++++++++++ src/transformers/models/mistral/modeling_mistral.py | 6 +++++- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/mamba2/modeling_mamba2.py b/src/transformers/models/mamba2/modeling_mamba2.py index 110ae09a388704..c312b9b94351d2 100644 --- a/src/transformers/models/mamba2/modeling_mamba2.py +++ b/src/transformers/models/mamba2/modeling_mamba2.py @@ -805,6 +805,16 @@ class Mamba2CausalLMOutput(ModelOutput): more detail. return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + cache_position (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + The position of the current input in the cache. This is used to ensure that the cache is correctly updated. + If `cache_params` is passed, `cache_position` should also be passed. 
+ attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) """ diff --git a/src/transformers/models/mistral/modeling_mistral.py b/src/transformers/models/mistral/modeling_mistral.py index 321d3dc0daf378..3b0fb75a4cb3ba 100644 --- a/src/transformers/models/mistral/modeling_mistral.py +++ b/src/transformers/models/mistral/modeling_mistral.py @@ -619,7 +619,7 @@ def _init_weights(self, module): Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. - If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see + If `past_key_values` is used, optionally only the last `input_ids` have to be input (see `past_key_values`). If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] @@ -666,6 +666,10 @@ def _init_weights(self, module): more detail. return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): + Indices indicating the position of the input sequence tokens in the sequence. Unlike `position_ids`, + this tensor is not affected by padding. It is used to update the cache in the correct position and to infer + the complete sequence length. """ From 1f7539c829531810e96501156598ffeaee8cd7e7 Mon Sep 17 00:00:00 2001 From: wony617 <49024958+Jwaminju@users.noreply.github.com> Date: Tue, 29 Oct 2024 02:46:49 +0900 Subject: [PATCH 123/385] =?UTF-8?q?=F0=9F=8C=90=20[i18n-KO]=20Translated?= =?UTF-8?q?=20`model=5Fdoc/barthez.md`=20to=20Korean=20(#33980)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * docs: ko: model_doc/barthez.md * feat: nmt draft --------- Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --- docs/source/ko/_toctree.yml | 4 +- docs/source/ko/model_doc/barthez.md | 60 +++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+), 2 deletions(-) create mode 100644 docs/source/ko/model_doc/barthez.md diff --git a/docs/source/ko/_toctree.yml b/docs/source/ko/_toctree.yml index 27102f123dd871..51d54b697b2d82 100644 --- a/docs/source/ko/_toctree.yml +++ b/docs/source/ko/_toctree.yml @@ -322,8 +322,8 @@ title: (번역중) ALBERT - local: model_doc/bart title: BART - - local: in_translation - title: (번역중) BARThez + - local: model_doc/barthez + title: BARThez - local: model_doc/bartpho title: BARTpho - local: in_translation diff --git a/docs/source/ko/model_doc/barthez.md b/docs/source/ko/model_doc/barthez.md new file mode 100644 index 00000000000000..131db38856cc1b --- /dev/null +++ b/docs/source/ko/model_doc/barthez.md @@ -0,0 +1,60 @@ + + +# BARThez [[barthez]] + +## 개요 [[overview]] + +BARThez 모델은 2020년 10월 23일, Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis에 의해 [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321)에서 제안되었습니다. + +이 논문의 초록: + + +*자기지도 학습에 의해 가능해진 귀납적 전이 학습은 자연어 처리(NLP) 분야 전반에 걸쳐 큰 반향을 일으켰으며, +BERT와 BART와 같은 모델들은 수많은 자연어 이해 작업에서 새로운 최첨단 성과를 기록했습니다. 일부 주목할 만한 예외가 있지만, +대부분의 사용 가능한 모델과 연구는 영어에 집중되어 있었습니다. 본 연구에서는 BARThez를 소개합니다. +이는 (우리가 아는 한) 프랑스어를 위한 첫 번째 BART 모델입니다. 
+BARThez는 과거 연구에서 얻은 매우 큰 프랑스어 단일 언어 말뭉치로 사전훈련되었으며, +BART의 변형 방식에 맞게 조정되었습니다. +CamemBERT 및 FlauBERT와 같은 기존의 BERT 기반 프랑스어 모델과 달리, BARThez는 생성 작업에 특히 적합합니다. +이는 인코더뿐만 아니라 디코더도 사전훈련되었기 때문입니다. +우리는 FLUE 벤치마크에서의 판별 작업 외에도 이 논문과 함께 공개하는 새로운 요약 데이터셋인 OrangeSum에서 BARThez를 평가했습니다. +또한 이미 사전훈련된 다국어 BART의 사전훈련을 BARThez의 말뭉치로 계속 진행하였으며, +결과적으로 얻어진 모델인 mBARTHez가 기본 BARThez보다 유의미한 성능 향상을 보였고, +CamemBERT 및 FlauBERT와 동등하거나 이를 능가함을 보였습니다.* + +이 모델은 [moussakam](https://huggingface.co/moussakam)이 기여했습니다. 저자의 코드는 [여기](https://github.com/moussaKam/BARThez)에서 찾을 수 있습니다. + + + +BARThez 구현은 🤗 BART와 동일하나, 토큰화에서 차이가 있습니다. 구성 클래스와 그 매개변수에 대한 정보는 [BART 문서](bart)를 참조하십시오. +BARThez 전용 토크나이저는 아래에 문서화되어 있습니다. + + + +## 리소스 [[resources]] + +- BARThez는 🤗 BART와 유사한 방식으로 시퀀스-투-시퀀스 작업에 맞춰 미세 조정될 수 있습니다. 다음을 확인하세요: + [examples/pytorch/summarization/](https://github.com/huggingface/transformers/tree/main/examples/pytorch/summarization/README.md). + + +## BarthezTokenizer [[bartheztokenizer]] + +[[autodoc]] BarthezTokenizer + +## BarthezTokenizerFast [[bartheztokenizerfast]] + +[[autodoc]] BarthezTokenizerFast From 084e946cfdf4ecd37e8004db68018c042630c18e Mon Sep 17 00:00:00 2001 From: Shubham S Jagtap <63872951+ShubhamJagtap2000@users.noreply.github.com> Date: Mon, 28 Oct 2024 23:18:18 +0530 Subject: [PATCH 124/385] Apply linting to the important code blocks to make it readable (#34449) Enhance user experience using py-linting --- docs/README.md | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/docs/README.md b/docs/README.md index 7dbcefc0483c66..bb54d7004130f2 100644 --- a/docs/README.md +++ b/docs/README.md @@ -276,14 +276,14 @@ building the return. Here's an example of a single value return: -``` +```python Returns: `List[int]`: A list of integers in the range [0, 1] --- 1 for a special token, 0 for a sequence token. 
``` Here's an example of a tuple return, comprising several objects: -``` +```python Returns: `tuple(torch.FloatTensor)` comprising various elements depending on the configuration ([`BertConfig`]) and inputs: - ** loss** (*optional*, returned when `masked_lm_labels` is provided) `torch.FloatTensor` of shape `(1,)` -- @@ -322,10 +322,9 @@ includes an example of how to transcribe speech to text in the The syntax for Example docstrings can look as follows: -``` +```python Example: - ```python >>> from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC >>> from datasets import load_dataset >>> import torch @@ -347,7 +346,6 @@ The syntax for Example docstrings can look as follows: >>> transcription = processor.batch_decode(predicted_ids) >>> transcription[0] 'MISTER QUILTER IS THE APOSTLE OF THE MIDDLE CLASSES AND WE ARE GLAD TO WELCOME HIS GOSPEL' - ``` ``` The docstring should give a minimal, clear example of how the respective model From a17f287ac039f92835b5cd9bd8ee28b584c9f65e Mon Sep 17 00:00:00 2001 From: Ahmed Almaghz <53489256+AhmedAlmaghz@users.noreply.github.com> Date: Mon, 28 Oct 2024 20:54:37 +0300 Subject: [PATCH 125/385] [i18n-ar] Translated file : `docs/source/ar/fast_tokenizers.md` into Arabic (#33034) * Add docs/source/ar/fast_tokenizers.md to Add_docs_source_ar_fast_tokenizers.md * Update _toctree.yml * Update _toctree.yml * Update docs/source/ar/_toctree.yml Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/fast_tokenizers.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/fast_tokenizers.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/fast_tokenizers.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/fast_tokenizers.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/fast_tokenizers.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/fast_tokenizers.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/fast_tokenizers.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/fast_tokenizers.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/fast_tokenizers.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/fast_tokenizers.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> --------- Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> --- docs/source/ar/_toctree.yml | 8 ++--- docs/source/ar/fast_tokenizers.md | 51 +++++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+), 4 deletions(-) create mode 100644 docs/source/ar/fast_tokenizers.md diff --git a/docs/source/ar/_toctree.yml b/docs/source/ar/_toctree.yml index 6f7899b53b854e..bd45925c64cb0e 100644 --- a/docs/source/ar/_toctree.yml +++ b/docs/source/ar/_toctree.yml @@ -108,9 +108,9 @@ # title: دليل إرشادي لمحفزات النماذج اللغوية الكبيرة # title: الإرشاد # title: أدلة المهام -# - sections: -# - local: fast_tokenizers -# title: استخدم برامج التجزئة السريعة من 🤗 Tokenizers +- sections: + - local: fast_tokenizers + title: استخدم مجزئيات النصوص السريعة من 🤗 Tokenizers # - local: multilingual # title: تشغيل الاستنتاج باستخدام نماذج متعددة اللغات # - local: 
create_a_model @@ -139,7 +139,7 @@ # title: استكشاف الأخطاء وإصلاحها # - local: gguf # title: التوافق مع ملفات GGUF -# title: أدلة المطورين + title: أدلة المطورين # - sections: # - local: quantization/overview # title: نظرة عامة diff --git a/docs/source/ar/fast_tokenizers.md b/docs/source/ar/fast_tokenizers.md new file mode 100644 index 00000000000000..539712969e813f --- /dev/null +++ b/docs/source/ar/fast_tokenizers.md @@ -0,0 +1,51 @@ +# استخدام مجزئيات النصوص من 🤗 Tokenizers + +يعتمد [`PreTrainedTokenizerFast`] على مكتبة [🤗 Tokenizers](https://huggingface.co/docs/tokenizers). يمكن تحميل المجزئات اللغويين الذين تم الحصول عليهم من مكتبة 🤗 Tokenizers ببساطة شديدة في 🤗 Transformers. + +قبل الدخول في التفاصيل، دعونا نبدأ أولاً بإنشاء مُجزىء لغوي تجريبي في بضع سطور: + +```python +>>> from tokenizers import Tokenizer +>>> from tokenizers.models import BPE +>>> from tokenizers.trainers import BpeTrainer +>>> from tokenizers.pre_tokenizers import Whitespace + +>>> tokenizer = Tokenizer(BPE(unk_token="[UNK]")) +>>> trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]) + +>>> tokenizer.pre_tokenizer = Whitespace() +>>> files = [...] +>>> tokenizer.train(files, trainer) +``` + +الآن لدينا مُجزىء لغوي مدرب على الملفات التي حددناها. يمكننا إما الاستمرار في استخدامه في وقت التشغيل هذا، أو حفظه في ملف JSON لإعادة استخدامه لاحقًا. + +## تحميل مُجزئ النّصوص مُباشرةً + +دعونا نرى كيف يمكننا الاستفادة من كائن (مُجزئ النصوص) في مكتبة 🤗 Transformers. تسمح فئة [`PreTrainedTokenizerFast`] سهولة إنشاء *tokenizer*، من خلال قبول كائن *المُجزئ النصوص* مُهيّأ مُسبقًا كمعامل: + +```python +>>> from transformers import PreTrainedTokenizerFast + +>>> fast_tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer) +``` + +يمكن الآن استخدام هذا الكائن مع جميع الطرق المُشتركة بين مُجزّئي النّصوص لـ 🤗 Transformers! انتقل إلى [صفحة مُجزّئ النّصوص](main_classes/tokenizer) لمزيد من المعلومات. + +## التحميل من ملف JSON + +لتحميل مُجزّئ النص من ملف JSON، دعونا نبدأ أولاً بحفظ مُجزّئ النّصوص: + +```python +>>> tokenizer.save("tokenizer.json") +``` + +يمكن تمرير المسار الذي حفظنا به هذا الملف إلى طريقة تهيئة [`PreTrainedTokenizerFast`] باستخدام المُعامل `tokenizer_file`: + +```python +>>> from transformers import PreTrainedTokenizerFast + +>>> fast_tokenizer = PreTrainedTokenizerFast(tokenizer_file="tokenizer.json") +``` + +يمكن الآن استخدام هذا الكائن مع جميع الطرق التي تشترك فيها مُجزّئي النّصوص لـ 🤗 Transformers! انتقل إلى [صفحة مُجزّئ النص](main_classes/tokenizer) لمزيد من المعلومات. 
\ No newline at end of file From d21dbd1520937c993de1409215b1418bd6be74a1 Mon Sep 17 00:00:00 2001 From: kang sheng Date: Tue, 29 Oct 2024 01:59:38 +0800 Subject: [PATCH 126/385] enable average tokens across devices (#34373) * enable average tokens across devices * reduce earlier in case model needs it * simplify if statement * reformat code to make ruff happy * add doc for argument: average_tokens_across_devices * cannot find world size when pytorch is unavailable * format code --------- Co-authored-by: Zach Mueller Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> --- src/transformers/trainer.py | 10 +++++++++- src/transformers/training_args.py | 22 ++++++++++++++++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 4315e54a42fc2e..9176bd72a55032 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -3631,7 +3631,12 @@ def training_step( with amp.scale_loss(loss, self.optimizer) as scaled_loss: scaled_loss.backward() else: - loss *= self.args.gradient_accumulation_steps + if num_items_in_batch is not None: + if self.compute_loss_func or self.model_accepts_loss_kwargs: + loss *= self.args.gradient_accumulation_steps + # Average tokens across devices is orthogonal to gradient accumulation + if self.args.average_tokens_across_devices: + loss *= self.args.world_size self.accelerator.backward(loss, **kwargs) return loss.detach() / self.args.gradient_accumulation_steps @@ -3646,6 +3651,9 @@ def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=N labels = inputs.pop("labels") else: labels = None + if self.args.average_tokens_across_devices and num_items_in_batch is not None: + num_items_in_batch_tensor = torch.tensor(num_items_in_batch, device=self.args.device) + num_items_in_batch = int(self.accelerator.gather(num_items_in_batch_tensor).sum().cpu()) if self.model_accepts_loss_kwargs: loss_kwargs = {} if num_items_in_batch is not None: diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py index c98e8bc41b924d..3e5c6cc2f37428 100644 --- a/src/transformers/training_args.py +++ b/src/transformers/training_args.py @@ -1532,6 +1532,15 @@ class TrainingArguments: }, ) + average_tokens_across_devices: Optional[bool] = field( + default=False, + metadata={ + "help": "Whether or not to average tokens across devices. If enabled, will use all_reduce to " + "synchronize num_tokens_in_batch for precise loss calculation. Reference: " + "https://github.com/huggingface/transformers/issues/34242" + }, + ) + def __post_init__(self): # Parse in args that could be `dict` sent in from the CLI as a string for field in _VALID_DICT_FIELDS: @@ -1765,6 +1774,19 @@ def __post_init__(self): if self.framework == "pt" and is_torch_available(): self.device + # Disable average tokens when using single device + if self.average_tokens_across_devices: + try: + if self.world_size == 1: + logger.warning( + "average_tokens_across_devices is set to True but it is invalid when world size is" + "1. Turn it to False automatically." + ) + self.average_tokens_across_devices = False + except ImportError as e: + logger.warning(f"Can not specify world size due to {e}. Turn average_tokens_across_devices to False.") + self.average_tokens_across_devices = False + if self.torchdynamo is not None: warnings.warn( "`torchdynamo` is deprecated and will be removed in version 5 of 🤗 Transformers. 
Use" From 6cc4a67b3d22445cd17e26922ba4435a5e97f759 Mon Sep 17 00:00:00 2001 From: Luc Georges Date: Mon, 28 Oct 2024 19:33:17 +0100 Subject: [PATCH 127/385] feat: run benchmarks on A100 (#34287) --- .github/workflows/benchmark.yml | 9 +- benchmark/grafana_dashboard.json | 1593 ++++++++++++++++-------------- benchmark/llama.py | 4 + 3 files changed, 885 insertions(+), 721 deletions(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 79f0652e192f2a..a65b8cafe562ec 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -16,8 +16,11 @@ env: jobs: benchmark: name: Benchmark + strategy: + matrix: + group: [aws-g5-4xlarge-cache, aws-p4d-24xlarge-plus] runs-on: - group: aws-g5-4xlarge-cache + group: ${{ matrix.group }} if: | (github.event_name == 'pull_request' && contains( github.event.pull_request.labels.*.name, 'run-benchmark') )|| (github.event_name == 'push' && github.ref == 'refs/heads/main') @@ -60,9 +63,13 @@ jobs: commit_id=$GITHUB_SHA fi commit_msg=$(git show -s --format=%s | cut -c1-70) + df -h python3 benchmark/llama.py "${{ github.head_ref || github.ref_name }}" "$commit_id" "$commit_msg" env: HF_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }} + # Enable this to see debug logs + # HF_HUB_VERBOSITY: debug + # TRANSFORMERS_VERBOSITY: debug PGHOST: ${{ secrets.TRANSFORMERS_BENCHMARKS_PGHOST }} PGUSER: transformers_benchmarks PGPASSWORD: ${{ secrets.TRANSFORMERS_BENCHMARKS_PGPASSWORD }} diff --git a/benchmark/grafana_dashboard.json b/benchmark/grafana_dashboard.json index 2375663ffbc6db..3d579f7b368711 100644 --- a/benchmark/grafana_dashboard.json +++ b/benchmark/grafana_dashboard.json @@ -39,7 +39,7 @@ "datasource": { "default": true, "type": "grafana-postgresql-datasource", - "uid": "de0dbhs18ho1sc" + "uid": "be28nkzirtb0gd" }, "fieldConfig": { "defaults": { @@ -77,7 +77,7 @@ "properties": [ { "id": "custom.width", - "value": 364 + "value": 196 } ] }, @@ -101,7 +101,7 @@ "properties": [ { "id": "custom.width", - "value": 708 + "value": 581 } ] }, @@ -113,7 +113,7 @@ "properties": [ { "id": "custom.width", - "value": 388 + "value": 379 } ] } @@ -148,7 +148,7 @@ "editorMode": "code", "format": "table", "rawQuery": true, - "rawSql": "SELECT commit_id as commit_id, commit_message, gpu_name FROM benchmarks WHERE branch = '${branch}';", + "rawSql": "SELECT commit_id as commit_id, commit_message, gpu_name, created_at AS date FROM benchmarks WHERE branch = '${branch}' ORDER BY benchmark_id DESC LIMIT ${last_n_commits};", "refId": "A", "sql": { "columns": [ @@ -232,7 +232,7 @@ "datasource": { "default": true, "type": "grafana-postgresql-datasource", - "uid": "de0dbhs18ho1sc" + "uid": "be28nkzirtb0gd" }, "fieldConfig": { "defaults": { @@ -312,7 +312,7 @@ "editorMode": "code", "format": "table", "rawQuery": true, - "rawSql": "SELECT CAST(m.measurements->'first_eager_forward_pass_time_secs' AS double precision) AS first_eager_forward_pass_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}'", + "rawSql": "SELECT CAST(m.measurements->'first_eager_forward_pass_time_secs' AS double precision) AS first_eager_forward_pass_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};", "refId": "A", "sql": { "columns": [ @@ -334,6 +334,19 @@ } ], 
"title": "First eager forward pass", + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": {}, + "sort": [ + { + "field": "time" + } + ] + } + } + ], "transparent": true, "type": "barchart" }, @@ -341,7 +354,7 @@ "datasource": { "default": true, "type": "grafana-postgresql-datasource", - "uid": "de0dbhs18ho1sc" + "uid": "be28nkzirtb0gd" }, "fieldConfig": { "defaults": { @@ -424,7 +437,7 @@ "editorMode": "code", "format": "table", "rawQuery": true, - "rawSql": "SELECT CAST(m.measurements->'second_eager_forward_pass_time_secs' AS double precision) AS second_eager_forward_pass_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}'", + "rawSql": "SELECT CAST(m.measurements->'second_eager_forward_pass_time_secs' AS double precision) AS second_eager_forward_pass_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};", "refId": "A", "sql": { "columns": [ @@ -446,6 +459,19 @@ } ], "title": "Second eager forward pass", + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": {}, + "sort": [ + { + "field": "time" + } + ] + } + } + ], "transparent": true, "type": "barchart" }, @@ -466,7 +492,7 @@ "datasource": { "default": true, "type": "grafana-postgresql-datasource", - "uid": "de0dbhs18ho1sc" + "uid": "be28nkzirtb0gd" }, "fieldConfig": { "defaults": { @@ -545,7 +571,7 @@ "editorMode": "code", "format": "table", "rawQuery": true, - "rawSql": "SELECT CAST(m.measurements->'time_to_first_token_secs' AS double precision) AS time_to_first_token_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}'", + "rawSql": "SELECT CAST(m.measurements->'time_to_first_token_secs' AS double precision) AS time_to_first_token_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};", "refId": "A", "sql": { "columns": [ @@ -567,6 +593,19 @@ } ], "title": "Time to first token", + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": {}, + "sort": [ + { + "field": "time" + } + ] + } + } + ], "transparent": true, "type": "barchart" }, @@ -574,7 +613,7 @@ "datasource": { "default": true, "type": "grafana-postgresql-datasource", - "uid": "de0dbhs18ho1sc" + "uid": "be28nkzirtb0gd" }, "fieldConfig": { "defaults": { @@ -653,7 +692,7 @@ "editorMode": "code", "format": "table", "rawQuery": true, - "rawSql": "SELECT CAST(m.measurements->'time_to_second_token_secs' AS double precision) AS time_to_second_token_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}'", + "rawSql": "SELECT CAST(m.measurements->'time_to_second_token_secs' AS double precision) AS time_to_second_token_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};", "refId": "A", "sql": { "columns": [ @@ -675,6 +714,19 @@ } ], 
"title": "Time to second token", + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": {}, + "sort": [ + { + "field": "time" + } + ] + } + } + ], "transparent": true, "type": "barchart" }, @@ -682,7 +734,7 @@ "datasource": { "default": true, "type": "grafana-postgresql-datasource", - "uid": "de0dbhs18ho1sc" + "uid": "be28nkzirtb0gd" }, "fieldConfig": { "defaults": { @@ -761,7 +813,7 @@ "editorMode": "code", "format": "table", "rawQuery": true, - "rawSql": "SELECT CAST(m.measurements->'time_to_third_token_secs' AS double precision) AS time_to_third_token_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}'", + "rawSql": "SELECT CAST(m.measurements->'time_to_third_token_secs' AS double precision) AS time_to_third_token_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};", "refId": "A", "sql": { "columns": [ @@ -783,6 +835,19 @@ } ], "title": "Time to third token", + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": {}, + "sort": [ + { + "field": "time" + } + ] + } + } + ], "transparent": true, "type": "barchart" }, @@ -790,7 +855,7 @@ "datasource": { "default": true, "type": "grafana-postgresql-datasource", - "uid": "de0dbhs18ho1sc" + "uid": "be28nkzirtb0gd" }, "fieldConfig": { "defaults": { @@ -869,7 +934,7 @@ "editorMode": "code", "format": "table", "rawQuery": true, - "rawSql": "SELECT CAST(m.measurements->'time_to_next_token_mean_secs' AS double precision) AS time_to_next_token_mean_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}'", + "rawSql": "SELECT CAST(m.measurements->'time_to_next_token_mean_secs' AS double precision) AS time_to_next_token_mean_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};", "refId": "A", "sql": { "columns": [ @@ -891,6 +956,19 @@ } ], "title": "Time to subsequent next tokens mean", + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": {}, + "sort": [ + { + "field": "time" + } + ] + } + } + ], "transparent": true, "type": "barchart" }, @@ -911,7 +989,7 @@ "datasource": { "default": true, "type": "grafana-postgresql-datasource", - "uid": "de0dbhs18ho1sc" + "uid": "be28nkzirtb0gd" }, "fieldConfig": { "defaults": { @@ -990,7 +1068,7 @@ "editorMode": "code", "format": "table", "rawQuery": true, - "rawSql": "SELECT CAST(m.measurements->'first_compile_generate_time_secs' AS double precision) AS first_compile_generate_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}'", + "rawSql": "SELECT CAST(m.measurements->'first_compile_generate_time_secs' AS double precision) AS first_compile_generate_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};", "refId": "A", "sql": { "columns": [ @@ -1012,6 +1090,19 
@@ } ], "title": "First compile generate", + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": {}, + "sort": [ + { + "field": "time" + } + ] + } + } + ], "transparent": true, "type": "barchart" }, @@ -1019,7 +1110,7 @@ "datasource": { "default": true, "type": "grafana-postgresql-datasource", - "uid": "de0dbhs18ho1sc" + "uid": "be28nkzirtb0gd" }, "fieldConfig": { "defaults": { @@ -1098,7 +1189,7 @@ "editorMode": "code", "format": "table", "rawQuery": true, - "rawSql": "SELECT CAST(m.measurements->'second_compile_generate_time_secs' AS double precision) AS second_compile_generate_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}';", + "rawSql": "SELECT CAST(m.measurements->'second_compile_generate_time_secs' AS double precision) AS second_compile_generate_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};", "refId": "A", "sql": { "columns": [ @@ -1120,6 +1211,19 @@ } ], "title": "Second compile generate", + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": {}, + "sort": [ + { + "field": "time" + } + ] + } + } + ], "transparent": true, "type": "barchart" }, @@ -1127,7 +1231,7 @@ "datasource": { "default": true, "type": "grafana-postgresql-datasource", - "uid": "de0dbhs18ho1sc" + "uid": "be28nkzirtb0gd" }, "fieldConfig": { "defaults": { @@ -1206,7 +1310,7 @@ "editorMode": "code", "format": "table", "rawQuery": true, - "rawSql": "SELECT CAST(m.measurements->'third_compile_generate_time_secs' AS double precision) AS third_compile_generate_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}';", + "rawSql": "SELECT CAST(m.measurements->'third_compile_generate_time_secs' AS double precision) AS third_compile_generate_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};", "refId": "A", "sql": { "columns": [ @@ -1228,6 +1332,19 @@ } ], "title": "Third compile generate", + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": {}, + "sort": [ + { + "field": "time" + } + ] + } + } + ], "transparent": true, "type": "barchart" }, @@ -1235,7 +1352,7 @@ "datasource": { "default": true, "type": "grafana-postgresql-datasource", - "uid": "de0dbhs18ho1sc" + "uid": "be28nkzirtb0gd" }, "fieldConfig": { "defaults": { @@ -1314,7 +1431,7 @@ "editorMode": "code", "format": "table", "rawQuery": true, - "rawSql": "SELECT CAST(m.measurements->'fourth_compile_generate_time_secs' AS double precision) AS fourth_compile_generate_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}';", + "rawSql": "SELECT CAST(m.measurements->'fourth_compile_generate_time_secs' AS double precision) AS fourth_compile_generate_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT 
${last_n_commits};", "refId": "A", "sql": { "columns": [ @@ -1336,11 +1453,24 @@ } ], "title": "Fourth compile generate", + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": {}, + "sort": [ + { + "field": "time" + } + ] + } + } + ], "transparent": true, "type": "barchart" }, { - "collapsed": false, + "collapsed": true, "gridPos": { "h": 1, "w": 24, @@ -1348,751 +1478,753 @@ "y": 64 }, "id": 15, - "panels": [], - "title": "Usage metrics", - "type": "row" - }, - { - "datasource": { - "default": true, - "type": "grafana-postgresql-datasource", - "uid": "de0dbhs18ho1sc" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" + "panels": [ + { + "datasource": { + "default": true, + "type": "grafana-postgresql-datasource", + "uid": "be28nkzirtb0gd" }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": 60000, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": 60000, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 65 + }, + "id": 1, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true }, - "thresholdsStyle": { - "mode": "off" + "tooltip": { + "mode": "single", + "sort": "none" } }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null + "targets": [ + { + "datasource": { + "type": "grafana-postgresql-datasource", + "uid": "bdz2yss7sxo1sc" }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "percent" - }, - "overrides": [] - }, - "gridPos": { - "h": 9, - "w": 12, - "x": 0, - "y": 65 - }, - "id": 1, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true + "editorMode": "code", + "format": "table", + "rawQuery": true, + "rawSql": "SELECT\n d.cpu_util,\n d.time\nFROM\n benchmarks AS b\n JOIN device_measurements AS d ON b.benchmark_id = d.benchmark_id\nWHERE\n branch = '${branch}';", + "refId": "A", + "sql": { + "columns": [ + { + "parameters": [ + { + "name": "cpu_util", + "type": "functionParameter" + } + ], + "type": "function" + }, + { + "parameters": [ + { + "name": "mem_megabytes", + "type": "functionParameter" + } + ], + 
"type": "function" + }, + { + "parameters": [ + { + "name": "gpu_util", + "type": "functionParameter" + } + ], + "type": "function" + }, + { + "parameters": [ + { + "name": "gpu_mem_megabytes", + "type": "functionParameter" + } + ], + "type": "function" + }, + { + "parameters": [ + { + "name": "\"time\"", + "type": "functionParameter" + } + ], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ], + "limit": 50, + "whereJsonTree": { + "children1": [ + { + "id": "baa888b8-89ab-4cde-b012-31922f8671e9", + "properties": { + "field": "commit_id", + "fieldSrc": "field", + "operator": "equal", + "value": [ + "${commit}" + ], + "valueError": [ + null + ], + "valueSrc": [ + "value" + ], + "valueType": [ + "text" + ] + }, + "type": "rule" + } + ], + "id": "bab88a98-0123-4456-b89a-b1922f7d4f11", + "type": "group" + }, + "whereString": "commit_id = '${commit}'" + }, + "table": "measurements" + } + ], + "title": "CPU Utilization", + "transparent": true, + "type": "timeseries" }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ { "datasource": { + "default": true, "type": "grafana-postgresql-datasource", - "uid": "bdz2yss7sxo1sc" + "uid": "be28nkzirtb0gd" }, - "editorMode": "code", - "format": "table", - "rawQuery": true, - "rawSql": "SELECT\n d.cpu_util,\n d.time\nFROM\n benchmarks AS b\n JOIN device_measurements AS d ON b.benchmark_id = d.benchmark_id\nWHERE\n branch = '${branch}'", - "refId": "A", - "sql": { - "columns": [ - { - "parameters": [ + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": 60000, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ { - "name": "cpu_util", - "type": "functionParameter" + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 } - ], - "type": "function" + ] }, - { - "parameters": [ + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 65 + }, + "id": 4, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "grafana-postgresql-datasource", + "uid": "bdz2yss7sxo1sc" + }, + "editorMode": "code", + "format": "table", + "rawQuery": true, + "rawSql": "SELECT\n b.commit_id,\n d.gpu_util,\n d.time\nFROM\n benchmarks AS b\n JOIN device_measurements AS d ON b.benchmark_id = d.benchmark_id\nWHERE\n branch = '${branch}';", + "refId": "A", + "sql": { + "columns": [ { - "name": "mem_megabytes", - "type": "functionParameter" + "parameters": [ + { + "name": "cpu_util", + "type": "functionParameter" + } + ], + "type": "function" + }, + { + "parameters": [ + { + "name": "mem_megabytes", + "type": "functionParameter" + } + ], + "type": "function" + }, + { + "parameters": [ + { + "name": "gpu_util", + 
"type": "functionParameter" + } + ], + "type": "function" + }, + { + "parameters": [ + { + "name": "gpu_mem_megabytes", + "type": "functionParameter" + } + ], + "type": "function" + }, + { + "parameters": [ + { + "name": "\"time\"", + "type": "functionParameter" + } + ], + "type": "function" } ], - "type": "function" - }, - { - "parameters": [ + "groupBy": [ { - "name": "gpu_util", - "type": "functionParameter" + "property": { + "type": "string" + }, + "type": "groupBy" } ], - "type": "function" + "limit": 50, + "whereJsonTree": { + "children1": [ + { + "id": "baa888b8-89ab-4cde-b012-31922f8671e9", + "properties": { + "field": "commit_id", + "fieldSrc": "field", + "operator": "equal", + "value": [ + "${commit}" + ], + "valueError": [ + null + ], + "valueSrc": [ + "value" + ], + "valueType": [ + "text" + ] + }, + "type": "rule" + } + ], + "id": "bab88a98-0123-4456-b89a-b1922f7d4f11", + "type": "group" + }, + "whereString": "commit_id = '${commit}'" }, - { - "parameters": [ + "table": "measurements" + } + ], + "title": "GPU Utilization", + "transparent": true, + "type": "timeseries" + }, + { + "datasource": { + "default": true, + "type": "grafana-postgresql-datasource", + "uid": "be28nkzirtb0gd" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": 60000, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ { - "name": "gpu_mem_megabytes", - "type": "functionParameter" + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 } - ], - "type": "function" + ] }, - { - "parameters": [ + "unit": "decmbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 74 + }, + "id": 2, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "grafana-postgresql-datasource", + "uid": "bdz2yss7sxo1sc" + }, + "editorMode": "code", + "format": "table", + "rawQuery": true, + "rawSql": "SELECT d.mem_megabytes, d.time FROM benchmarks AS b JOIN device_measurements AS d ON b.benchmark_id = d.benchmark_id WHERE branch = '${branch}';", + "refId": "A", + "sql": { + "columns": [ { - "name": "\"time\"", - "type": "functionParameter" + "parameters": [ + { + "name": "cpu_util", + "type": "functionParameter" + } + ], + "type": "function" + }, + { + "parameters": [ + { + "name": "mem_megabytes", + "type": "functionParameter" + } + ], + "type": "function" + }, + { + "parameters": [ + { + "name": "gpu_util", + "type": "functionParameter" + } + ], + "type": "function" + }, + { + "parameters": [ + { + "name": "gpu_mem_megabytes", + "type": "functionParameter" + } + ], + "type": "function" + }, + { + "parameters": [ + { + "name": "\"time\"", + "type": "functionParameter" + } + ], + "type": "function" } ], - "type": "function" - } - ], - "groupBy": [ - { - 
"property": { - "type": "string" - }, - "type": "groupBy" - } - ], - "limit": 50, - "whereJsonTree": { - "children1": [ - { - "id": "baa888b8-89ab-4cde-b012-31922f8671e9", - "properties": { - "field": "commit_id", - "fieldSrc": "field", - "operator": "equal", - "value": [ - "${commit}" - ], - "valueError": [ - null - ], - "valueSrc": [ - "value" - ], - "valueType": [ - "text" - ] - }, - "type": "rule" - } - ], - "id": "bab88a98-0123-4456-b89a-b1922f7d4f11", - "type": "group" - }, - "whereString": "commit_id = '${commit}'" - }, - "table": "measurements" - } - ], - "title": "CPU Utilization", - "transparent": true, - "type": "timeseries" - }, - { - "datasource": { - "default": true, - "type": "grafana-postgresql-datasource", - "uid": "de0dbhs18ho1sc" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": 60000, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "percent" - }, - "overrides": [] - }, - "gridPos": { - "h": 9, - "w": 12, - "x": 12, - "y": 65 - }, - "id": 4, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "grafana-postgresql-datasource", - "uid": "bdz2yss7sxo1sc" - }, - "editorMode": "code", - "format": "table", - "rawQuery": true, - "rawSql": "SELECT\n b.commit_id,\n d.gpu_util,\n d.time\nFROM\n benchmarks AS b\n JOIN device_measurements AS d ON b.benchmark_id = d.benchmark_id\nWHERE\n branch = '${branch}'", - "refId": "A", - "sql": { - "columns": [ - { - "parameters": [ - { - "name": "cpu_util", - "type": "functionParameter" - } - ], - "type": "function" - }, - { - "parameters": [ + "groupBy": [ { - "name": "mem_megabytes", - "type": "functionParameter" + "property": { + "type": "string" + }, + "type": "groupBy" } ], - "type": "function" - }, - { - "parameters": [ - { - "name": "gpu_util", - "type": "functionParameter" - } - ], - "type": "function" - }, - { - "parameters": [ - { - "name": "gpu_mem_megabytes", - "type": "functionParameter" - } - ], - "type": "function" - }, - { - "parameters": [ - { - "name": "\"time\"", - "type": "functionParameter" - } - ], - "type": "function" - } - ], - "groupBy": [ - { - "property": { - "type": "string" + "limit": 50, + "whereJsonTree": { + "children1": [ + { + "id": "baa888b8-89ab-4cde-b012-31922f8671e9", + "properties": { + "field": "commit_id", + "fieldSrc": "field", + "operator": "equal", + "value": [ + "${commit}" + ], + "valueError": [ + null + ], + "valueSrc": [ + "value" + ], + "valueType": [ + "text" + ] + }, + "type": "rule" + } + ], + "id": "bab88a98-0123-4456-b89a-b1922f7d4f11", + "type": "group" }, - "type": "groupBy" - } - ], - "limit": 50, - "whereJsonTree": { - "children1": [ - { - "id": 
"baa888b8-89ab-4cde-b012-31922f8671e9", - "properties": { - "field": "commit_id", - "fieldSrc": "field", - "operator": "equal", - "value": [ - "${commit}" - ], - "valueError": [ - null - ], - "valueSrc": [ - "value" - ], - "valueType": [ - "text" - ] - }, - "type": "rule" - } - ], - "id": "bab88a98-0123-4456-b89a-b1922f7d4f11", - "type": "group" - }, - "whereString": "commit_id = '${commit}'" - }, - "table": "measurements" - } - ], - "title": "GPU Utilization", - "transparent": true, - "type": "timeseries" - }, - { - "datasource": { - "default": true, - "type": "grafana-postgresql-datasource", - "uid": "de0dbhs18ho1sc" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": 60000, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null + "whereString": "commit_id = '${commit}'" }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "decmbytes" - }, - "overrides": [] - }, - "gridPos": { - "h": 9, - "w": 12, - "x": 0, - "y": 74 - }, - "id": 2, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true + "table": "measurements" + } + ], + "title": "Memory usage", + "transparent": true, + "type": "timeseries" }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ { "datasource": { + "default": true, "type": "grafana-postgresql-datasource", - "uid": "bdz2yss7sxo1sc" + "uid": "be28nkzirtb0gd" }, - "editorMode": "code", - "format": "table", - "rawQuery": true, - "rawSql": "SELECT d.mem_megabytes, d.time FROM benchmarks AS b JOIN device_measurements AS d ON b.benchmark_id = d.benchmark_id WHERE branch = '${branch}'", - "refId": "A", - "sql": { - "columns": [ - { - "parameters": [ - { - "name": "cpu_util", - "type": "functionParameter" - } - ], - "type": "function" + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" }, - { - "parameters": [ - { - "name": "mem_megabytes", - "type": "functionParameter" - } - ], - "type": "function" + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": 60000, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } }, - { - "parameters": [ + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ { - "name": "gpu_util", - "type": "functionParameter" - } - ], - "type": "function" - }, - { - "parameters": [ + "color": "green", + "value": null + }, { - "name": 
"gpu_mem_megabytes", - "type": "functionParameter" + "color": "red", + "value": 80 } - ], - "type": "function" + ] }, - { - "parameters": [ - { - "name": "\"time\"", - "type": "functionParameter" - } - ], - "type": "function" - } - ], - "groupBy": [ - { - "property": { - "type": "string" - }, - "type": "groupBy" - } - ], - "limit": 50, - "whereJsonTree": { - "children1": [ - { - "id": "baa888b8-89ab-4cde-b012-31922f8671e9", - "properties": { - "field": "commit_id", - "fieldSrc": "field", - "operator": "equal", - "value": [ - "${commit}" - ], - "valueError": [ - null - ], - "valueSrc": [ - "value" - ], - "valueType": [ - "text" - ] - }, - "type": "rule" - } - ], - "id": "bab88a98-0123-4456-b89a-b1922f7d4f11", - "type": "group" - }, - "whereString": "commit_id = '${commit}'" - }, - "table": "measurements" - } - ], - "title": "Memory usage", - "transparent": true, - "type": "timeseries" - }, - { - "datasource": { - "default": true, - "type": "grafana-postgresql-datasource", - "uid": "de0dbhs18ho1sc" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": 60000, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" + "unit": "decmbytes" }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 74 + }, + "id": 3, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true }, - "thresholdsStyle": { - "mode": "off" + "tooltip": { + "mode": "single", + "sort": "none" } }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null + "targets": [ + { + "datasource": { + "type": "grafana-postgresql-datasource", + "uid": "bdz2yss7sxo1sc" }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "decmbytes" - }, - "overrides": [] - }, - "gridPos": { - "h": 9, - "w": 12, - "x": 12, - "y": 74 - }, - "id": 3, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "grafana-postgresql-datasource", - "uid": "bdz2yss7sxo1sc" - }, - "editorMode": "code", - "format": "table", - "rawQuery": true, - "rawSql": "SELECT\n d.gpu_mem_megabytes,\n d.time\nFROM\n benchmarks AS b\n JOIN device_measurements AS d ON b.benchmark_id = d.benchmark_id\nWHERE\n branch = '${branch}'", - "refId": "A", - "sql": { - "columns": [ - { - "parameters": [ + "editorMode": "code", + "format": "table", + "rawQuery": true, + "rawSql": "SELECT\n d.gpu_mem_megabytes,\n d.time\nFROM\n benchmarks AS b\n JOIN device_measurements AS d ON b.benchmark_id = d.benchmark_id\nWHERE\n branch = '${branch}';", + "refId": "A", + "sql": { + "columns": [ { - "name": "cpu_util", - "type": "functionParameter" - } - ], - "type": "function" - }, - { - "parameters": [ + "parameters": [ + { + "name": "cpu_util", + "type": "functionParameter" + } + ], + "type": "function" + }, { - "name": "mem_megabytes", - "type": "functionParameter" - } - 
], - "type": "function" - }, - { - "parameters": [ + "parameters": [ + { + "name": "mem_megabytes", + "type": "functionParameter" + } + ], + "type": "function" + }, { - "name": "gpu_util", - "type": "functionParameter" - } - ], - "type": "function" - }, - { - "parameters": [ + "parameters": [ + { + "name": "gpu_util", + "type": "functionParameter" + } + ], + "type": "function" + }, { - "name": "gpu_mem_megabytes", - "type": "functionParameter" + "parameters": [ + { + "name": "gpu_mem_megabytes", + "type": "functionParameter" + } + ], + "type": "function" + }, + { + "parameters": [ + { + "name": "\"time\"", + "type": "functionParameter" + } + ], + "type": "function" } ], - "type": "function" - }, - { - "parameters": [ + "groupBy": [ { - "name": "\"time\"", - "type": "functionParameter" + "property": { + "type": "string" + }, + "type": "groupBy" } ], - "type": "function" - } - ], - "groupBy": [ - { - "property": { - "type": "string" + "limit": 50, + "whereJsonTree": { + "children1": [ + { + "id": "baa888b8-89ab-4cde-b012-31922f8671e9", + "properties": { + "field": "commit_id", + "fieldSrc": "field", + "operator": "equal", + "value": [ + "${commit}" + ], + "valueError": [ + null + ], + "valueSrc": [ + "value" + ], + "valueType": [ + "text" + ] + }, + "type": "rule" + } + ], + "id": "bab88a98-0123-4456-b89a-b1922f7d4f11", + "type": "group" }, - "type": "groupBy" - } - ], - "limit": 50, - "whereJsonTree": { - "children1": [ - { - "id": "baa888b8-89ab-4cde-b012-31922f8671e9", - "properties": { - "field": "commit_id", - "fieldSrc": "field", - "operator": "equal", - "value": [ - "${commit}" - ], - "valueError": [ - null - ], - "valueSrc": [ - "value" - ], - "valueType": [ - "text" - ] - }, - "type": "rule" - } - ], - "id": "bab88a98-0123-4456-b89a-b1922f7d4f11", - "type": "group" - }, - "whereString": "commit_id = '${commit}'" - }, - "table": "measurements" + "whereString": "commit_id = '${commit}'" + }, + "table": "measurements" + } + ], + "title": "GPU memory usage", + "transparent": true, + "type": "timeseries" } ], - "title": "GPU memory usage", - "transparent": true, - "type": "timeseries" + "title": "Usage metrics", + "type": "row" } ], + "refresh": "", "schemaVersion": 39, "tags": [], "templating": { @@ -2105,7 +2237,7 @@ }, "datasource": { "type": "grafana-postgresql-datasource", - "uid": "de0dbhs18ho1sc" + "uid": "be28nkzirtb0gd" }, "definition": "SELECT DISTINCT branch FROM benchmarks;", "description": "", @@ -2125,12 +2257,12 @@ { "current": { "selected": false, - "text": "1728662868776", - "value": "1728662868776" + "text": "1729701492845", + "value": "1729701492845" }, "datasource": { "type": "grafana-postgresql-datasource", - "uid": "de0dbhs18ho1sc" + "uid": "be28nkzirtb0gd" }, "definition": "SELECT created_at - INTERVAL '5 secs' FROM benchmarks WHERE branch = '${branch}' ORDER BY benchmark_id ASC LIMIT 1;", "description": "", @@ -2149,12 +2281,12 @@ { "current": { "selected": false, - "text": "1728663254125", - "value": "1728663254125" + "text": "1730120430069", + "value": "1730120430069" }, "datasource": { "type": "grafana-postgresql-datasource", - "uid": "de0dbhs18ho1sc" + "uid": "be28nkzirtb0gd" }, "definition": "SELECT time + INTERVAL '5 secs' FROM benchmarks AS b JOIN device_measurements AS d ON b.benchmark_id = d.benchmark_id WHERE branch = '${branch}' ORDER BY b.benchmark_id DESC, d.measurement_id DESC LIMIT 1;", "description": "", @@ -2164,7 +2296,7 @@ "name": "EndTime", "options": [], "query": "SELECT time + INTERVAL '5 secs' FROM benchmarks AS b JOIN device_measurements AS 
d ON b.benchmark_id = d.benchmark_id WHERE branch = '${branch}' ORDER BY b.benchmark_id DESC, d.measurement_id DESC LIMIT 1;", - "refresh": 2, + "refresh": 1, "regex": "", "skipUrlSync": false, "sort": 0, @@ -2178,7 +2310,7 @@ }, "datasource": { "type": "grafana-postgresql-datasource", - "uid": "de0dbhs18ho1sc" + "uid": "be28nkzirtb0gd" }, "definition": "SELECT DISTINCT gpu_name FROM benchmarks;", "hide": 0, @@ -2188,11 +2320,32 @@ "name": "gpu_name", "options": [], "query": "SELECT DISTINCT gpu_name FROM benchmarks;", - "refresh": 1, + "refresh": 2, "regex": "", "skipUrlSync": false, "sort": 0, "type": "query" + }, + { + "current": { + "selected": false, + "text": "10", + "value": "10" + }, + "description": "The number of commits to display, going from most recent to the nth commit.", + "hide": 0, + "label": "Last # of commits", + "name": "last_n_commits", + "options": [ + { + "selected": true, + "text": "10", + "value": "10" + } + ], + "query": "10", + "skipUrlSync": false, + "type": "textbox" } ] }, @@ -2206,6 +2359,6 @@ "timezone": "browser", "title": "Transformers benchmarks", "uid": "fdz33iyzln9c0a", - "version": 11, + "version": 4, "weekStart": "" } diff --git a/benchmark/llama.py b/benchmark/llama.py index a926f903486607..4a2c57422e6ffb 100644 --- a/benchmark/llama.py +++ b/benchmark/llama.py @@ -96,17 +96,21 @@ def run_benchmark(branch: str, commit_id: str, commit_msg: str, num_tokens_to_ge ) conn.commit() benchmark_id = cur.fetchone()[0] + logger.info(f"running benchmark #{benchmark_id} on {gpu_name}") metrics_thread = Thread(target=collect_metrics, args=[benchmark_id, continue_metric_collection]) metrics_thread.start() + logger.info("started background thread to fetch device metrics") os.environ["TOKENIZERS_PARALLELISM"] = "false" # silence warnings when compiling device = "cuda" ckpt = "meta-llama/Llama-2-7b-hf" + logger.info("downloading weights") # This is to avoid counting download in model load time measurement model = AutoModelForCausalLM.from_pretrained(ckpt, torch_dtype=torch.float16) gen_config = GenerationConfig(do_sample=False, top_p=1, temperature=1) + logger.info("loading model") start = perf_counter() model = AutoModelForCausalLM.from_pretrained( ckpt, torch_dtype=torch.float16, generation_config=gen_config From a769ed45e17c44fd17b85c025863c4e4f2f73634 Mon Sep 17 00:00:00 2001 From: Alexandros Benetatos <34627055+alex-bene@users.noreply.github.com> Date: Mon, 28 Oct 2024 20:44:20 +0200 Subject: [PATCH 128/385] Add `post_process_depth_estimation` for GLPN (#34413) * add depth postprocessing for GLPN * remove previous temp fix for glpn tests * Style changes for GLPN's `post_process_depth_estimation` Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> * additional style fix --------- Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> --- .../models/glpn/image_processing_glpn.py | 54 ++++++++++++++++++- src/transformers/models/glpn/modeling_glpn.py | 16 +++--- tests/models/glpn/test_modeling_glpn.py | 8 --- 3 files changed, 59 insertions(+), 19 deletions(-) diff --git a/src/transformers/models/glpn/image_processing_glpn.py b/src/transformers/models/glpn/image_processing_glpn.py index 9e69c8ae8a6e7a..115cefc86beec3 100644 --- a/src/transformers/models/glpn/image_processing_glpn.py +++ b/src/transformers/models/glpn/image_processing_glpn.py @@ -14,7 +14,11 @@ # limitations under the License. 
"""Image processor class for GLPN.""" -from typing import List, Optional, Union +from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union + + +if TYPE_CHECKING: + from ...modeling_outputs import DepthEstimatorOutput import numpy as np import PIL.Image @@ -27,12 +31,17 @@ get_image_size, infer_channel_dimension_format, is_scaled_image, + is_torch_available, make_list_of_images, to_numpy_array, valid_images, validate_preprocess_arguments, ) -from ...utils import TensorType, filter_out_non_signature_kwargs, logging +from ...utils import TensorType, filter_out_non_signature_kwargs, logging, requires_backends + + +if is_torch_available(): + import torch logger = logging.get_logger(__name__) @@ -218,3 +227,44 @@ def preprocess( data = {"pixel_values": images} return BatchFeature(data=data, tensor_type=return_tensors) + + def post_process_depth_estimation( + self, + outputs: "DepthEstimatorOutput", + target_sizes: Optional[Union[TensorType, List[Tuple[int, int]], None]] = None, + ) -> List[Dict[str, TensorType]]: + """ + Converts the raw output of [`DepthEstimatorOutput`] into final depth predictions and depth PIL images. + Only supports PyTorch. + + Args: + outputs ([`DepthEstimatorOutput`]): + Raw outputs of the model. + target_sizes (`TensorType` or `List[Tuple[int, int]]`, *optional*): + Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size + (height, width) of each image in the batch. If left to None, predictions will not be resized. + + Returns: + `List[Dict[str, TensorType]]`: A list of dictionaries of tensors representing the processed depth + predictions. + """ + requires_backends(self, "torch") + + predicted_depth = outputs.predicted_depth + + if (target_sizes is not None) and (len(predicted_depth) != len(target_sizes)): + raise ValueError( + "Make sure that you pass in as many target sizes as the batch dimension of the predicted depth" + ) + + results = [] + target_sizes = [None] * len(predicted_depth) if target_sizes is None else target_sizes + for depth, target_size in zip(predicted_depth, target_sizes): + if target_size is not None: + depth = depth[None, None, ...] + depth = torch.nn.functional.interpolate(depth, size=target_size, mode="bicubic", align_corners=False) + depth = depth.squeeze() + + results.append({"predicted_depth": depth}) + + return results diff --git a/src/transformers/models/glpn/modeling_glpn.py b/src/transformers/models/glpn/modeling_glpn.py index 9fd22ca0f7be95..70f175df8c9973 100755 --- a/src/transformers/models/glpn/modeling_glpn.py +++ b/src/transformers/models/glpn/modeling_glpn.py @@ -723,20 +723,18 @@ def forward( >>> with torch.no_grad(): ... outputs = model(**inputs) - ... predicted_depth = outputs.predicted_depth >>> # interpolate to original size - >>> prediction = torch.nn.functional.interpolate( - ... predicted_depth.unsqueeze(1), - ... size=image.size[::-1], - ... mode="bicubic", - ... align_corners=False, + >>> post_processed_output = image_processor.post_process_depth_estimation( + ... outputs, + ... target_sizes=[(image.height, image.width)], ... 
) >>> # visualize the prediction - >>> output = prediction.squeeze().cpu().numpy() - >>> formatted = (output * 255 / np.max(output)).astype("uint8") - >>> depth = Image.fromarray(formatted) + >>> predicted_depth = post_processed_output[0]["predicted_depth"] + >>> depth = predicted_depth * 255 / predicted_depth.max() + >>> depth = depth.detach().cpu().numpy() + >>> depth = Image.fromarray(depth.astype("uint8")) ```""" return_dict = return_dict if return_dict is not None else self.config.use_return_dict output_hidden_states = ( diff --git a/tests/models/glpn/test_modeling_glpn.py b/tests/models/glpn/test_modeling_glpn.py index 254c1135357147..81e95ab244f9aa 100644 --- a/tests/models/glpn/test_modeling_glpn.py +++ b/tests/models/glpn/test_modeling_glpn.py @@ -157,14 +157,6 @@ def setUp(self): self.model_tester = GLPNModelTester(self) self.config_tester = GLPNConfigTester(self, config_class=GLPNConfig) - @unittest.skip(reason="Failing after #32550") - def test_pipeline_depth_estimation(self): - pass - - @unittest.skip(reason="Failing after #32550") - def test_pipeline_depth_estimation_fp16(self): - pass - def test_config(self): self.config_tester.run_common_tests() From fe76b603702c7ae7ee4acafd1bc8a7ed80d61950 Mon Sep 17 00:00:00 2001 From: Raushan Turganbay Date: Tue, 29 Oct 2024 07:54:51 +0100 Subject: [PATCH 129/385] LLaVA: latency issues (#34460) * fix llavas * code style * green ci --- .../models/llava/modeling_llava.py | 127 ++++++++-------- .../models/llava_next/modeling_llava_next.py | 135 +++++++++--------- .../modeling_llava_next_video.py | 13 +- .../modular_llava_next_video.py | 13 +- .../video_llava/modeling_video_llava.py | 13 +- .../models/vipllava/modeling_vipllava.py | 123 ++++++++-------- 6 files changed, 186 insertions(+), 238 deletions(-) diff --git a/src/transformers/models/llava/modeling_llava.py b/src/transformers/models/llava/modeling_llava.py index 0b2492fc711206..a0079f1787a2e9 100644 --- a/src/transformers/models/llava/modeling_llava.py +++ b/src/transformers/models/llava/modeling_llava.py @@ -472,6 +472,7 @@ def forward( (input_ids == self.config.image_token_index).sum(1).max() < self.config.image_seq_length ) or (input_ids.shape[-1] == 1 and pixel_values is not None) + image_features = None if pixel_values is not None: image_features = self.get_image_features( pixel_values=pixel_values, @@ -479,69 +480,67 @@ def forward( vision_feature_select_strategy=vision_feature_select_strategy, ) - if legacy_processing: - logger.warning_once( - "Expanding inputs for image tokens in LLaVa should be done in processing. " - "Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly " - "with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. " - "Using processors without these attributes in the config is deprecated and will throw an error in v4.47." + if legacy_processing: + logger.warning_once( + "Expanding inputs for image tokens in LLaVa should be done in processing. " + "Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly " + "with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. " + "Using processors without these attributes in the config is deprecated and will throw an error in v4.47." 
+ ) + # prefill stage vs decoding stage (legacy behavior copied) + if input_ids.shape[1] != 1: + inputs_embeds, attention_mask, labels, position_ids = self._merge_input_ids_with_image_features( + image_features, inputs_embeds, input_ids, attention_mask, labels ) - # prefill stage vs decoding stage (legacy behavior copied) - if input_ids.shape[1] != 1: - inputs_embeds, attention_mask, labels, position_ids = self._merge_input_ids_with_image_features( - image_features, inputs_embeds, input_ids, attention_mask, labels - ) - cache_position = torch.arange(attention_mask.shape[1], device=attention_mask.device) - else: - # Retrieve the first layer to inspect the logits and mask out the hidden states - # that are set to 0 - first_layer_past_key_value = past_key_values[0][0][:, :, :, 0] - - # Sum all dimensions of head_dim (-2) to avoid random errors such as: https://github.com/huggingface/transformers/pull/28032#issuecomment-1863691941 - batch_index, non_attended_tokens = torch.where(first_layer_past_key_value.float().sum(-2) == 0) - - # Get the target length - target_length = input_ids.shape[1] - past_length = first_layer_past_key_value.shape[-1] - - extended_attention_mask = torch.ones( - (attention_mask.shape[0], past_length), - dtype=attention_mask.dtype, - device=attention_mask.device, - ) - - # Filter out only the tokens that can be un-attended, this can happen - # if one uses Llava + Fused modules where the cache on the - # first iteration is already big enough, or if one passes custom cache - valid_indices = non_attended_tokens < extended_attention_mask.size(-1) - new_batch_index = batch_index[valid_indices] - new_non_attended_tokens = non_attended_tokens[valid_indices] - - # Zero-out the places where we don't need to attend - extended_attention_mask[new_batch_index, new_non_attended_tokens] = 0 - - attention_mask = torch.cat((extended_attention_mask, attention_mask[:, -target_length:]), dim=1) - position_ids = torch.sum(attention_mask, dim=1).unsqueeze(-1) - 1 - cache_position = torch.arange(attention_mask.shape[1], device=attention_mask.device)[ - -target_length: - ] - - # TODO: @raushan retain only the new behavior after v4.47 + cache_position = torch.arange(attention_mask.shape[1], device=attention_mask.device) else: - n_image_tokens = (input_ids == self.config.image_token_index).sum(dim=-1)[0].item() - n_image_features = image_features.shape[1] - if n_image_tokens != n_image_features: - raise ValueError( - f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}" - ) - special_image_mask = ( - (input_ids == self.config.image_token_index) - .unsqueeze(-1) - .expand_as(inputs_embeds) - .to(inputs_embeds.device) + # Retrieve the first layer to inspect the logits and mask out the hidden states + # that are set to 0 + first_layer_past_key_value = past_key_values[0][0][:, :, :, 0] + + # Sum all dimensions of head_dim (-2) to avoid random errors such as: https://github.com/huggingface/transformers/pull/28032#issuecomment-1863691941 + batch_index, non_attended_tokens = torch.where(first_layer_past_key_value.float().sum(-2) == 0) + + # Get the target length + target_length = input_ids.shape[1] + past_length = first_layer_past_key_value.shape[-1] + + extended_attention_mask = torch.ones( + (attention_mask.shape[0], past_length), + dtype=attention_mask.dtype, + device=attention_mask.device, ) - image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) - inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features) 
+ + # Filter out only the tokens that can be un-attended, this can happen + # if one uses Llava + Fused modules where the cache on the + # first iteration is already big enough, or if one passes custom cache + valid_indices = non_attended_tokens < extended_attention_mask.size(-1) + new_batch_index = batch_index[valid_indices] + new_non_attended_tokens = non_attended_tokens[valid_indices] + + # Zero-out the places where we don't need to attend + extended_attention_mask[new_batch_index, new_non_attended_tokens] = 0 + + attention_mask = torch.cat((extended_attention_mask, attention_mask[:, -target_length:]), dim=1) + position_ids = torch.sum(attention_mask, dim=1).unsqueeze(-1) - 1 + cache_position = torch.arange(attention_mask.shape[1], device=attention_mask.device)[-target_length:] + + # TODO: @raushan retain only the new behavior after v4.47 + elif image_features is not None: + n_image_tokens = (input_ids == self.config.image_token_index).sum(dim=-1)[0].item() + n_image_features = image_features.shape[1] + if n_image_tokens != n_image_features: + raise ValueError( + f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}" + ) + special_image_mask = ( + (input_ids == self.config.image_token_index) + .unsqueeze(-1) + .expand_as(inputs_embeds) + .to(inputs_embeds.device) + ) + image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) + inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features) outputs = self.language_model( attention_mask=attention_mask, @@ -602,12 +601,6 @@ def prepare_inputs_for_generation( ): # Overwritten -- in specific circumstances we don't want to forward image inputs to the model - # Trigger the new behavior if we have more than image embeddings seq length tokens for images - legacy_processing = ( - input_ids is not None - and (input_ids == self.config.image_token_index).sum(1).max() < self.config.image_seq_length - ) - model_inputs = self.language_model.prepare_inputs_for_generation( input_ids, past_key_values=past_key_values, @@ -618,7 +611,7 @@ def prepare_inputs_for_generation( **kwargs, ) - if legacy_processing or cache_position[0] == 0: + if cache_position[0] == 0: # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore # Otherwise we need pixel values to be passed to model model_inputs["pixel_values"] = pixel_values diff --git a/src/transformers/models/llava_next/modeling_llava_next.py b/src/transformers/models/llava_next/modeling_llava_next.py index 0cbda9cfd64b74..5a49337b2b5d96 100644 --- a/src/transformers/models/llava_next/modeling_llava_next.py +++ b/src/transformers/models/llava_next/modeling_llava_next.py @@ -846,6 +846,7 @@ def forward( (input_ids == self.config.image_token_index).sum(1).max() < self.config.image_seq_length ) or (input_ids.shape[-1] == 1 and pixel_values is not None) + image_features = None if pixel_values is not None and pixel_values.size(0) > 0: image_features = self.get_image_features( pixel_values, @@ -861,74 +862,73 @@ def forward( vision_feature_select_strategy=vision_feature_select_strategy, image_newline=self.image_newline, ) - if legacy_processing: - logger.warning_once( - "Expanding inputs for image tokens in LLaVa-NeXT should be done in processing. 
" - "Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly " - "with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. " - "Using processors without these attributes in the config is deprecated and will throw an error in v4.47." + + if legacy_processing: + logger.warning_once( + "Expanding inputs for image tokens in LLaVa-NeXT should be done in processing. " + "Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly " + "with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. " + "Using processors without these attributes in the config is deprecated and will throw an error in v4.47." + ) + if input_ids.shape[1] != 1: + inputs_embeds = inputs_embeds.to(image_features.dtype) + inputs_embeds, attention_mask, position_ids, labels, _ = self._merge_input_ids_with_image_features( + image_features, + feature_lens, + inputs_embeds, + input_ids, + attention_mask, + position_ids, + labels=labels, ) - if input_ids.shape[1] != 1: - inputs_embeds = inputs_embeds.to(image_features.dtype) - inputs_embeds, attention_mask, position_ids, labels, _ = self._merge_input_ids_with_image_features( - image_features, - feature_lens, - inputs_embeds, - input_ids, - attention_mask, - position_ids, - labels=labels, - ) - cache_position = torch.arange(attention_mask.shape[1], device=attention_mask.device) - else: - # Retrieve the first layer to inspect the logits and mask out the hidden states - # that are set to 0 - first_layer_past_key_value = past_key_values[0][0][:, :, :, 0] - - # Sum all dimensions of head_dim (-2) to avoid random errors such as: https://github.com/huggingface/transformers/pull/28032#issuecomment-1863691941 - batch_index, non_attended_tokens = torch.where(first_layer_past_key_value.float().sum(-2) == 0) - - # Get the target length - target_length = input_ids.shape[1] - past_length = first_layer_past_key_value.shape[-1] - - extended_attention_mask = torch.ones( - (attention_mask.shape[0], past_length), - dtype=attention_mask.dtype, - device=attention_mask.device, - ) + cache_position = torch.arange(attention_mask.shape[1], device=attention_mask.device) + else: + # Retrieve the first layer to inspect the logits and mask out the hidden states + # that are set to 0 + first_layer_past_key_value = past_key_values[0][0][:, :, :, 0] - # Filter out only the tokens that can be un-attended, this can happen - # if one uses Llava + Fused modules where the cache on the - # first iteration is already big enough, or if one passes custom cache - valid_indices = non_attended_tokens < extended_attention_mask.size(-1) - new_batch_index = batch_index[valid_indices] - new_non_attended_tokens = non_attended_tokens[valid_indices] - - # Zero-out the places where we don't need to attend - extended_attention_mask[new_batch_index, new_non_attended_tokens] = 0 - attention_mask = torch.cat((extended_attention_mask, attention_mask[:, -target_length:]), dim=1) - position_ids = torch.sum(attention_mask, dim=1).unsqueeze(-1) - 1 - cache_position = torch.arange(attention_mask.shape[1], device=attention_mask.device)[ - -target_length: - ] + # Sum all dimensions of head_dim (-2) to avoid random errors such as: https://github.com/huggingface/transformers/pull/28032#issuecomment-1863691941 + batch_index, non_attended_tokens = torch.where(first_layer_past_key_value.float().sum(-2) == 
0) - # TODO: @raushan retain only the new behavior after v4.47 - else: - n_image_tokens = (input_ids == self.config.image_token_index).sum().item() - n_image_features = image_features.shape[0] - if n_image_tokens != n_image_features: - raise ValueError( - f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}" - ) - special_image_mask = ( - (input_ids == self.config.image_token_index) - .unsqueeze(-1) - .expand_as(inputs_embeds) - .to(inputs_embeds.device) + # Get the target length + target_length = input_ids.shape[1] + past_length = first_layer_past_key_value.shape[-1] + + extended_attention_mask = torch.ones( + (attention_mask.shape[0], past_length), + dtype=attention_mask.dtype, + device=attention_mask.device, + ) + + # Filter out only the tokens that can be un-attended, this can happen + # if one uses Llava + Fused modules where the cache on the + # first iteration is already big enough, or if one passes custom cache + valid_indices = non_attended_tokens < extended_attention_mask.size(-1) + new_batch_index = batch_index[valid_indices] + new_non_attended_tokens = non_attended_tokens[valid_indices] + + # Zero-out the places where we don't need to attend + extended_attention_mask[new_batch_index, new_non_attended_tokens] = 0 + attention_mask = torch.cat((extended_attention_mask, attention_mask[:, -target_length:]), dim=1) + position_ids = torch.sum(attention_mask, dim=1).unsqueeze(-1) - 1 + cache_position = torch.arange(attention_mask.shape[1], device=attention_mask.device)[-target_length:] + + # TODO: @raushan retain only the new behavior after v4.47 + elif image_features is not None: + n_image_tokens = (input_ids == self.config.image_token_index).sum().item() + n_image_features = image_features.shape[0] + if n_image_tokens != n_image_features: + raise ValueError( + f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}" ) - image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) - inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features) + special_image_mask = ( + (input_ids == self.config.image_token_index) + .unsqueeze(-1) + .expand_as(inputs_embeds) + .to(inputs_embeds.device) + ) + image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) + inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features) outputs = self.language_model( attention_mask=attention_mask, @@ -990,11 +990,6 @@ def prepare_inputs_for_generation( ): # Overwritten -- in specific circumstances we don't want to forward image inputs to the model - legacy_processing = ( - input_ids is not None - and (input_ids == self.config.image_token_index).sum(1).max() < self.config.image_seq_length - ) - model_inputs = self.language_model.prepare_inputs_for_generation( input_ids, past_key_values=past_key_values, @@ -1007,7 +1002,7 @@ def prepare_inputs_for_generation( # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore # Otherwise we need pixel values to be passed to model - if legacy_processing or cache_position[0] == 0: + if cache_position[0] == 0: model_inputs["pixel_values"] = pixel_values model_inputs["image_sizes"] = image_sizes diff --git a/src/transformers/models/llava_next_video/modeling_llava_next_video.py b/src/transformers/models/llava_next_video/modeling_llava_next_video.py index 96f4373afd9ec6..44b372535d70bd 100644 --- 
a/src/transformers/models/llava_next_video/modeling_llava_next_video.py +++ b/src/transformers/models/llava_next_video/modeling_llava_next_video.py @@ -1110,17 +1110,6 @@ def prepare_inputs_for_generation( ): # Overwritten -- extra custom processing - if input_ids is not None: - img_token_not_enough = (input_ids == self.config.image_token_index).sum( - 1 - ).max() < self.config.image_seq_length - video_token_not_enough = (input_ids == self.config.video_token_index).sum( - 1 - ).max() < self.config.video_seq_length - legacy_processing = (img_token_not_enough and pixel_values is not None) or ( - video_token_not_enough and pixel_values_videos is not None - ) - model_inputs = self.language_model.prepare_inputs_for_generation( input_ids, past_key_values=past_key_values, @@ -1133,7 +1122,7 @@ def prepare_inputs_for_generation( # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore # Otherwise we need pixel values to be passed to model - if legacy_processing or cache_position[0] == 0: + if cache_position[0] == 0: model_inputs["pixel_values"] = pixel_values model_inputs["pixel_values_videos"] = pixel_values_videos model_inputs["image_sizes"] = image_sizes diff --git a/src/transformers/models/llava_next_video/modular_llava_next_video.py b/src/transformers/models/llava_next_video/modular_llava_next_video.py index c1ed7571941b9e..e9974e920493ff 100644 --- a/src/transformers/models/llava_next_video/modular_llava_next_video.py +++ b/src/transformers/models/llava_next_video/modular_llava_next_video.py @@ -623,17 +623,6 @@ def prepare_inputs_for_generation( ): # Overwritten -- extra custom processing - if input_ids is not None: - img_token_not_enough = (input_ids == self.config.image_token_index).sum( - 1 - ).max() < self.config.image_seq_length - video_token_not_enough = (input_ids == self.config.video_token_index).sum( - 1 - ).max() < self.config.video_seq_length - legacy_processing = (img_token_not_enough and pixel_values is not None) or ( - video_token_not_enough and pixel_values_videos is not None - ) - model_inputs = self.language_model.prepare_inputs_for_generation( input_ids, past_key_values=past_key_values, @@ -646,7 +635,7 @@ def prepare_inputs_for_generation( # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore # Otherwise we need pixel values to be passed to model - if legacy_processing or cache_position[0] == 0: + if cache_position[0] == 0: model_inputs["pixel_values"] = pixel_values model_inputs["pixel_values_videos"] = pixel_values_videos model_inputs["image_sizes"] = image_sizes diff --git a/src/transformers/models/video_llava/modeling_video_llava.py b/src/transformers/models/video_llava/modeling_video_llava.py index a9bd8b745a6f68..30f82e45056c77 100644 --- a/src/transformers/models/video_llava/modeling_video_llava.py +++ b/src/transformers/models/video_llava/modeling_video_llava.py @@ -720,17 +720,6 @@ def prepare_inputs_for_generation( ): # Overwritten -- in specific circumstances we don't want to forward image inputs to the model - if input_ids is not None: - img_token_not_enough = (input_ids == self.config.image_token_index).sum( - 1 - ).max() < self.config.image_seq_length - video_token_not_enough = (input_ids == self.config.video_token_index).sum( - 1 - ).max() < self.config.video_seq_length - legacy_processing = (img_token_not_enough and pixel_values_images is not None) or ( - video_token_not_enough and pixel_values_videos is not None - ) 
- model_inputs = self.language_model.prepare_inputs_for_generation( input_ids, past_key_values=past_key_values, @@ -741,7 +730,7 @@ def prepare_inputs_for_generation( **kwargs, ) - if legacy_processing or cache_position[0] == 0: + if cache_position[0] == 0: # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore # Otherwise we need pixel values to be passed to model model_inputs["pixel_values_images"] = pixel_values_images diff --git a/src/transformers/models/vipllava/modeling_vipllava.py b/src/transformers/models/vipllava/modeling_vipllava.py index 987ae0ad0c61fe..c9db6e261c6a72 100644 --- a/src/transformers/models/vipllava/modeling_vipllava.py +++ b/src/transformers/models/vipllava/modeling_vipllava.py @@ -466,72 +466,71 @@ def forward( (input_ids == self.config.image_token_index).sum(1).max() < self.config.image_seq_length ) or (input_ids.shape[-1] == 1 and pixel_values is not None) + image_features = None if pixel_values is not None: image_features = self.get_image_features( pixel_values=pixel_values, vision_feature_layers=vision_feature_layers ) - if legacy_processing: - logger.warning_once( - "Expanding inputs for image tokens in VipLLaVa should be done in processing. " - "Please add `patch_size` and `vision_feature_select_strategy` to the model's image processing config. " - "Using processors without these attributes in the config is deprecated and will throw an error in v4.47." + if legacy_processing: + logger.warning_once( + "Expanding inputs for image tokens in VipLLaVa should be done in processing. " + "Please add `patch_size` and `vision_feature_select_strategy` to the model's image processing config. " + "Using processors without these attributes in the config is deprecated and will throw an error in v4.47." 
+ ) + # prefill stage vs decoding stage (legacy behavior copied) + if input_ids.shape[1] != 1: + inputs_embeds, attention_mask, labels, position_ids = self._merge_input_ids_with_image_features( + image_features, inputs_embeds, input_ids, attention_mask, labels ) - # prefill stage vs decoding stage (legacy behavior copied) - if input_ids.shape[1] != 1: - inputs_embeds, attention_mask, labels, position_ids = self._merge_input_ids_with_image_features( - image_features, inputs_embeds, input_ids, attention_mask, labels - ) - cache_position = torch.arange(attention_mask.shape[1], device=attention_mask.device) - else: - # Retrieve the first layer to inspect the logits and mask out the hidden states - # that are set to 0 - first_layer_past_key_value = past_key_values[0][0][:, :, :, 0] - - # Sum all dimensions of head_dim (-1) to avoid random errors such as: https://github.com/huggingface/transformers/pull/28032#issuecomment-1863691941 - batch_index, non_attended_tokens = torch.where(first_layer_past_key_value.float().sum(-2) == 0) - - target_length = input_ids.shape[1] - past_length = first_layer_past_key_value.shape[-1] - - extended_attention_mask = torch.ones( - (attention_mask.shape[0], past_length), - dtype=attention_mask.dtype, - device=attention_mask.device, - ) - - # Filter out only the tokens that can be un-attended, this can happen - # in the case one uses Llava + Fused modules where the cache on the - # first iteration is already big enough, or if one passes custom cache - valid_indices = non_attended_tokens < extended_attention_mask.size(-1) - new_batch_index = batch_index[valid_indices] - new_non_attended_tokens = non_attended_tokens[valid_indices] - - # Zero-out the places where we don't need to attend - extended_attention_mask[new_batch_index, new_non_attended_tokens] = 0 - - attention_mask = torch.cat((extended_attention_mask, attention_mask[:, -target_length:]), dim=1) - position_ids = torch.sum(attention_mask, dim=1).unsqueeze(-1) - 1 - cache_position = torch.arange(attention_mask.shape[1], device=attention_mask.device)[ - -target_length: - ] - - # TODO: @raushan retain only the new behavior after v4.47 + cache_position = torch.arange(attention_mask.shape[1], device=attention_mask.device) else: - n_image_tokens = (input_ids == self.config.image_token_index).sum(dim=-1)[0].item() - n_image_features = image_features.shape[1] - if n_image_tokens != n_image_features: - raise ValueError( - f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}" - ) - special_image_mask = ( - (input_ids == self.config.image_token_index) - .unsqueeze(-1) - .expand_as(inputs_embeds) - .to(inputs_embeds.device) + # Retrieve the first layer to inspect the logits and mask out the hidden states + # that are set to 0 + first_layer_past_key_value = past_key_values[0][0][:, :, :, 0] + + # Sum all dimensions of head_dim (-1) to avoid random errors such as: https://github.com/huggingface/transformers/pull/28032#issuecomment-1863691941 + batch_index, non_attended_tokens = torch.where(first_layer_past_key_value.float().sum(-2) == 0) + + target_length = input_ids.shape[1] + past_length = first_layer_past_key_value.shape[-1] + + extended_attention_mask = torch.ones( + (attention_mask.shape[0], past_length), + dtype=attention_mask.dtype, + device=attention_mask.device, ) - image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) - inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features) + + # Filter out only the tokens that can 
be un-attended, this can happen + # in the case one uses Llava + Fused modules where the cache on the + # first iteration is already big enough, or if one passes custom cache + valid_indices = non_attended_tokens < extended_attention_mask.size(-1) + new_batch_index = batch_index[valid_indices] + new_non_attended_tokens = non_attended_tokens[valid_indices] + + # Zero-out the places where we don't need to attend + extended_attention_mask[new_batch_index, new_non_attended_tokens] = 0 + + attention_mask = torch.cat((extended_attention_mask, attention_mask[:, -target_length:]), dim=1) + position_ids = torch.sum(attention_mask, dim=1).unsqueeze(-1) - 1 + cache_position = torch.arange(attention_mask.shape[1], device=attention_mask.device)[-target_length:] + + # TODO: @raushan retain only the new behavior after v4.47 + elif image_features is not None: + n_image_tokens = (input_ids == self.config.image_token_index).sum(dim=-1)[0].item() + n_image_features = image_features.shape[1] + if n_image_tokens != n_image_features: + raise ValueError( + f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}" + ) + special_image_mask = ( + (input_ids == self.config.image_token_index) + .unsqueeze(-1) + .expand_as(inputs_embeds) + .to(inputs_embeds.device) + ) + image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) + inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features) outputs = self.language_model( attention_mask=attention_mask, @@ -590,12 +589,6 @@ def prepare_inputs_for_generation( ): # Overwritten -- in specific circumstances we don't want to forward image inputs to the model - # Trigger the new behavior if we have more than image embeddings seq length tokens for images - legacy_processing = ( - input_ids is not None - and (input_ids == self.config.image_token_index).sum(1).max() < self.config.image_seq_length - ) - model_inputs = self.language_model.prepare_inputs_for_generation( input_ids, past_key_values=past_key_values, @@ -606,7 +599,7 @@ def prepare_inputs_for_generation( **kwargs, ) - if legacy_processing or cache_position[0] == 0: + if cache_position[0] == 0: # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore # Otherwise we need pixel values to be passed to model model_inputs["pixel_values"] = pixel_values From 808d6c50f8c6911d972f27bb5155c04e513c99ee Mon Sep 17 00:00:00 2001 From: Raushan Turganbay Date: Tue, 29 Oct 2024 07:57:10 +0100 Subject: [PATCH 130/385] Generation: fix test (#34369) * fix test * fix copies --- tests/generation/test_utils.py | 44 +++++-------------- tests/models/idefics/test_modeling_idefics.py | 3 +- tests/models/mamba2/test_modeling_mamba2.py | 3 +- tests/models/moshi/test_modeling_moshi.py | 22 +++++----- 4 files changed, 28 insertions(+), 44 deletions(-) diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py index 6f2eaf734df14f..d552bf73442ce7 100644 --- a/tests/generation/test_utils.py +++ b/tests/generation/test_utils.py @@ -671,29 +671,6 @@ def test_beam_sample_generate(self): else: self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + inputs_dict["input_ids"].shape[-1]) - # for VLMs inputs embeds won't match input ids unless images are encoded and merged with ids properly - # no quick fix available, since obtaining image embeddings step is very model-specific - if any(name in model.__class__.__name__.lower() for name in ("blip", "llava", "paligemma")): - 
prepare_inputs_for_generation_args = set( - inspect.signature(model.prepare_inputs_for_generation).parameters - ) - # `inputs_embeds` input is well supported when `cache_positions` is used, because it means the modeling - # code is up to date with our most recent standards - if ( - "inputs_embeds" in prepare_inputs_for_generation_args - and "cache_positions" in prepare_inputs_for_generation_args - ): - input_embeds = model.get_input_embeddings()(inputs_dict["input_ids"]) - beam_kwargs.update({"inputs_embeds": input_embeds}) - output_generate2 = self._beam_sample_generate( - model=model, - input_ids=None, - inputs_dict={}, - beam_kwargs=beam_kwargs, - ) - - torch.testing.assert_close(output_generate[:, input_embeds.shape[1] :], output_generate2) - @pytest.mark.generate def test_beam_sample_generate_dict_output(self): for model_class in self.all_generative_model_classes: @@ -1570,7 +1547,8 @@ def test_past_key_values_format(self): ) @pytest.mark.generate - def test_generate_from_inputs_embeds_decoder_only(self): + @parameterized.expand([(1,), (2,)]) + def test_generate_from_inputs_embeds_decoder_only(self, num_beams): # When supported, tests that the decoder model can generate from `inputs_embeds` instead of `input_ids` # if fails, you should probably update the `prepare_inputs_for_generation` function for model_class in self.all_generative_model_classes: @@ -1597,11 +1575,15 @@ def test_generate_from_inputs_embeds_decoder_only(self): continue input_ids = inputs_dict.pop("input_ids") + generation_kwargs = { + "return_dict_in_generate": True, + "output_scores": True, + "num_beams": num_beams, + "do_sample": False, + } # Traditional way of generating text - outputs_from_ids = model.generate( - input_ids, max_new_tokens=5, return_dict_in_generate=True, output_scores=True - ) + outputs_from_ids = model.generate(input_ids, max_new_tokens=5, **generation_kwargs) self.assertEqual(outputs_from_ids.sequences.shape, (input_ids.shape[0], input_ids.shape[1] + 5)) # Same thing, but from input embeddings (`input_ids` is passed so the prompt is present in the output) @@ -1610,8 +1592,7 @@ def test_generate_from_inputs_embeds_decoder_only(self): input_ids, inputs_embeds=inputs_embeds, max_new_tokens=5, - return_dict_in_generate=True, - output_scores=True, + **generation_kwargs, ) self.assertListEqual(outputs_from_ids.sequences.tolist(), outputs_from_embeds.sequences.tolist()) @@ -1622,15 +1603,14 @@ def test_generate_from_inputs_embeds_decoder_only(self): input_ids, inputs_embeds=random_embeds, max_new_tokens=5, - return_dict_in_generate=True, - output_scores=True, + **generation_kwargs, ) for i in range(len(outputs_from_rand_embeds.scores)): self.assertFalse(torch.allclose(outputs_from_embeds.scores[i], outputs_from_rand_embeds.scores[i])) # input_ids is not a required input -- if we don't pass it, the newly generated tokens will be the same outputs_from_embeds_wo_ids = model.generate( - inputs_embeds=inputs_embeds, max_new_tokens=5, return_dict_in_generate=True, output_scores=True + inputs_embeds=inputs_embeds, max_new_tokens=5, **generation_kwargs ) self.assertListEqual( outputs_from_embeds.sequences[:, inputs_embeds.shape[1] :].tolist(), diff --git a/tests/models/idefics/test_modeling_idefics.py b/tests/models/idefics/test_modeling_idefics.py index bbade169550f8c..c2f0ef8ccd01d3 100644 --- a/tests/models/idefics/test_modeling_idefics.py +++ b/tests/models/idefics/test_modeling_idefics.py @@ -773,7 +773,8 @@ def test_custom_4d_attention_mask(self): @unittest.skip( reason="IDEFICS has specific 
requirements for working with inputs embeds like passing also the ids and pixels" ) - def test_generate_from_inputs_embeds_decoder_only(self): + @parameterized.expand([(1,), (2,)]) + def test_generate_from_inputs_embeds_decoder_only(self, num_beams): pass @unittest.skip(reason="IDEFICS cannot compile due to dynamic control flow when checking inputs") diff --git a/tests/models/mamba2/test_modeling_mamba2.py b/tests/models/mamba2/test_modeling_mamba2.py index f19358a22f4b31..1a8cf04774531f 100644 --- a/tests/models/mamba2/test_modeling_mamba2.py +++ b/tests/models/mamba2/test_modeling_mamba2.py @@ -204,7 +204,8 @@ def test_generate_without_input_ids(self): pass @unittest.skip(reason="To fix, Mamba 2 cache slicing test case is an edge case") - def test_generate_from_inputs_embeds_decoder_only(self): + @parameterized.expand([(1,), (2,)]) + def test_generate_from_inputs_embeds_decoder_only(self, num_beams): pass @unittest.skip(reason="To fix, Mamba 2 cache slicing test case is an edge case") diff --git a/tests/models/moshi/test_modeling_moshi.py b/tests/models/moshi/test_modeling_moshi.py index dd9302ee2c55ba..b77a6ff10364ca 100644 --- a/tests/models/moshi/test_modeling_moshi.py +++ b/tests/models/moshi/test_modeling_moshi.py @@ -656,16 +656,21 @@ def test_initialization(self): ) @pytest.mark.generate - def test_generate_from_inputs_embeds_decoder_only(self): + @parameterized.expand([(1,), (2,)]) + def test_generate_from_inputs_embeds_decoder_only(self, num_beams): for model_class in self.all_generative_model_classes: config, input_ids, _, inputs_dict = self._get_input_ids_and_config() model = model_class(config).to(torch_device).eval() + generation_kwargs = { + "return_dict_in_generate": True, + "output_scores": True, + "num_beams": num_beams, + "do_sample": False, + } # Traditional way of generating text - outputs_from_ids = model.generate( - input_ids, max_new_tokens=5, return_dict_in_generate=True, output_scores=True, **inputs_dict - ) + outputs_from_ids = model.generate(input_ids, max_new_tokens=5, **generation_kwargs, **inputs_dict) self.assertEqual(outputs_from_ids.sequences.shape, (input_ids.shape[0], input_ids.shape[1] + 5)) # Same thing, but from input embeddings (`input_ids` is passed so the prompt is present in the output) @@ -674,8 +679,7 @@ def test_generate_from_inputs_embeds_decoder_only(self): input_ids, inputs_embeds=inputs_embeds, max_new_tokens=5, - return_dict_in_generate=True, - output_scores=True, + **generation_kwargs, **inputs_dict, ) @@ -686,8 +690,7 @@ def test_generate_from_inputs_embeds_decoder_only(self): input_ids, inputs_embeds=random_embeds, max_new_tokens=5, - return_dict_in_generate=True, - output_scores=True, + **generation_kwargs, **inputs_dict, ) for i in range(len(outputs_from_rand_embeds.scores)): @@ -697,8 +700,7 @@ def test_generate_from_inputs_embeds_decoder_only(self): outputs_from_embeds_wo_ids = model.generate( inputs_embeds=inputs_embeds, max_new_tokens=5, - return_dict_in_generate=True, - output_scores=True, + **generation_kwargs, **inputs_dict, ) self.assertListEqual( From 63ca6d9771b13b603deb228420623681188a4dc2 Mon Sep 17 00:00:00 2001 From: Raushan Turganbay Date: Tue, 29 Oct 2024 08:26:04 +0100 Subject: [PATCH 131/385] Fix CI (#34458) * fix * fix mistral --- src/transformers/generation/flax_utils.py | 2 ++ tests/generation/test_flax_utils.py | 4 ++++ tests/test_modeling_common.py | 2 +- 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/src/transformers/generation/flax_utils.py b/src/transformers/generation/flax_utils.py index 
08480ac983e805..88535b44e9c479 100644 --- a/src/transformers/generation/flax_utils.py +++ b/src/transformers/generation/flax_utils.py @@ -397,6 +397,8 @@ def generate( "(https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)" ) generation_config.max_length = generation_config.max_new_tokens + input_ids_seq_length + else: # by default let's always generate 10 new tokens + generation_config.max_length = generation_config.max_length + input_ids_seq_length if generation_config.min_length is not None and generation_config.min_length > generation_config.max_length: raise ValueError( diff --git a/tests/generation/test_flax_utils.py b/tests/generation/test_flax_utils.py index 647482b88cd83f..bb0c1828763bb6 100644 --- a/tests/generation/test_flax_utils.py +++ b/tests/generation/test_flax_utils.py @@ -101,6 +101,10 @@ def test_greedy_generate_pt_fx(self): pt_model = pt_model_class(config).eval() pt_model = load_flax_weights_in_pytorch_model(pt_model, flax_model.params) + # Generate max 5 tokens only otherwise seems to be numerical error accumulation + pt_model.generation_config.max_length = 5 + flax_model.generation_config.max_length = 5 + flax_generation_outputs = flax_model.generate(input_ids).sequences pt_generation_outputs = pt_model.generate(torch.tensor(input_ids, dtype=torch.long)) diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index 51d51dfcc2825c..d88b0dc5f02f83 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -3002,7 +3002,7 @@ def test_inputs_embeds_matches_input_ids(self): def test_inputs_embeds_matches_input_ids_with_generate(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - for model_class in self.all_model_classes: + for model_class in self.all_generative_model_classes: if model_class.__name__ not in [ *get_values(MODEL_FOR_CAUSAL_LM_MAPPING_NAMES), *get_values(MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES), From 655bec2da7120a8681acc2ce951f8d58c6f0e6ef Mon Sep 17 00:00:00 2001 From: kang sheng Date: Tue, 29 Oct 2024 16:39:06 +0800 Subject: [PATCH 132/385] use a tinymodel to test generation config which aviod timeout (#34482) * use a tinymodel to test generation config which aviod timeout * remove tailing whitespace --- tests/utils/test_modeling_utils.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/tests/utils/test_modeling_utils.py b/tests/utils/test_modeling_utils.py index 8af47cde8e5315..0452a10d5d57e6 100644 --- a/tests/utils/test_modeling_utils.py +++ b/tests/utils/test_modeling_utils.py @@ -1544,15 +1544,16 @@ def test_pretrained_low_mem_new_config(self): self.assertEqual(model.__class__.__name__, model_ref.__class__.__name__) def test_generation_config_is_loaded_with_model(self): - # Note: `TinyLlama/TinyLlama-1.1B-Chat-v1.0` has a `generation_config.json` containing `max_length: 2048` + # Note: `hf-internal-testing/tiny-random-MistralForCausalLM` has a `generation_config.json` + # containing `bos_token_id: 1` # 1. Load without further parameters - model = AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0") - self.assertEqual(model.generation_config.max_length, 2048) + model = AutoModelForCausalLM.from_pretrained(TINY_MISTRAL) + self.assertEqual(model.generation_config.bos_token_id, 1) # 2. 
Load with `device_map` - model = AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0", device_map="auto") - self.assertEqual(model.generation_config.max_length, 2048) + model = AutoModelForCausalLM.from_pretrained(TINY_MISTRAL, device_map="auto") + self.assertEqual(model.generation_config.bos_token_id, 1) @require_safetensors def test_safetensors_torch_from_torch(self): From a1835195d134f5a244aed1212342be94fa27b40c Mon Sep 17 00:00:00 2001 From: StevenBucaille Date: Tue, 29 Oct 2024 10:36:03 +0100 Subject: [PATCH 133/385] =?UTF-8?q?=F0=9F=9A=A8=F0=9F=9A=A8=F0=9F=9A=A8=20?= =?UTF-8?q?[SuperPoint]=20Fix=20keypoint=20coordinate=20output=20and=20add?= =?UTF-8?q?=20post=20processing=20(#33200)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat: Added int conversion and unwrapping * test: added tests for post_process_keypoint_detection of SuperPointImageProcessor * docs: changed docs to include post_process_keypoint_detection method and switched from opencv to matplotlib * test: changed test to not depend on SuperPointModel forward * test: added missing require_torch decorator * docs: changed pyplot parameters for the keypoints to be more visible in the example * tests: changed import torch location to make test_flax and test_tf * Revert "tests: changed import torch location to make test_flax and test_tf" This reverts commit 39b32a2f69500bc7af01715fc7beae2260549afe. * tests: fixed import * chore: applied suggestions from code review Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> * tests: fixed import * tests: fixed import (bis) * tests: fixed import (ter) * feat: added choice of type for target_size and changed tests accordingly * docs: updated code snippet to reflect the addition of target size type choice in post process method * tests: fixed imports (...) * tests: fixed imports (...) 
* style: formatting file * docs: fixed typo from image[0] to image.size[0] * docs: added output image and fixed some tests * Update docs/source/en/model_doc/superpoint.md Co-authored-by: Pavel Iakubovskii * fix: included SuperPointKeypointDescriptionOutput in TYPE_CHECKING if statement and changed tests results to reflect changes to SuperPoint from absolute keypoints coordinates to relative * docs: changed SuperPoint's docs to print output instead of just accessing * style: applied make style * docs: added missing output type and precision in docstring of post_process_keypoint_detection * perf: deleted loop to perform keypoint conversion in one statement * fix: moved keypoint conversion at the end of model forward * docs: changed SuperPointInterestPointDecoder to SuperPointKeypointDecoder class name and added relative (x, y) coordinates information to its method * fix: changed type hint * refactor: removed unnecessary brackets * revert: SuperPointKeypointDecoder to SuperPointInterestPointDecoder * Update docs/source/en/model_doc/superpoint.md Co-authored-by: Pavel Iakubovskii --------- Co-authored-by: Steven Bucaille Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> Co-authored-by: Pavel Iakubovskii --- docs/source/en/model_doc/superpoint.md | 37 +++++++----- .../superpoint/image_processing_superpoint.py | 59 ++++++++++++++++++- .../models/superpoint/modeling_superpoint.py | 10 +++- .../test_image_processing_superpoint.py | 54 ++++++++++++++++- .../superpoint/test_modeling_superpoint.py | 10 ++-- 5 files changed, 147 insertions(+), 23 deletions(-) diff --git a/docs/source/en/model_doc/superpoint.md b/docs/source/en/model_doc/superpoint.md index b9aab2f1b929f2..59e451adceb817 100644 --- a/docs/source/en/model_doc/superpoint.md +++ b/docs/source/en/model_doc/superpoint.md @@ -86,24 +86,32 @@ model = SuperPointForKeypointDetection.from_pretrained("magic-leap-community/sup inputs = processor(images, return_tensors="pt") outputs = model(**inputs) - -for i in range(len(images)): - image_mask = outputs.mask[i] - image_indices = torch.nonzero(image_mask).squeeze() - image_keypoints = outputs.keypoints[i][image_indices] - image_scores = outputs.scores[i][image_indices] - image_descriptors = outputs.descriptors[i][image_indices] +image_sizes = [(image.height, image.width) for image in images] +outputs = processor.post_process_keypoint_detection(outputs, image_sizes) + +for output in outputs: + for keypoints, scores, descriptors in zip(output["keypoints"], output["scores"], output["descriptors"]): + print(f"Keypoints: {keypoints}") + print(f"Scores: {scores}") + print(f"Descriptors: {descriptors}") ``` -You can then print the keypoints on the image to visualize the result : +You can then print the keypoints on the image of your choice to visualize the result: ```python -import cv2 -for keypoint, score in zip(image_keypoints, image_scores): - keypoint_x, keypoint_y = int(keypoint[0].item()), int(keypoint[1].item()) - color = tuple([score.item() * 255] * 3) - image = cv2.circle(image, (keypoint_x, keypoint_y), 2, color) -cv2.imwrite("output_image.png", image) +import matplotlib.pyplot as plt + +plt.axis("off") +plt.imshow(image_1) +plt.scatter( + outputs[0]["keypoints"][:, 0], + outputs[0]["keypoints"][:, 1], + c=outputs[0]["scores"] * 100, + s=outputs[0]["scores"] * 50, + alpha=0.8 +) +plt.savefig(f"output_image.png") ``` +![image/png](https://cdn-uploads.huggingface.co/production/uploads/632885ba1558dac67c440aa8/ZtFmphEhx8tcbEQqOolyE.png) This model was contributed by 
[stevenbucaille](https://huggingface.co/stevenbucaille). The original code can be found [here](https://github.com/magicleap/SuperPointPretrainedNetwork). @@ -123,6 +131,7 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h [[autodoc]] SuperPointImageProcessor - preprocess +- post_process_keypoint_detection ## SuperPointForKeypointDetection diff --git a/src/transformers/models/superpoint/image_processing_superpoint.py b/src/transformers/models/superpoint/image_processing_superpoint.py index fbbb717570cb70..65309b1c1826f2 100644 --- a/src/transformers/models/superpoint/image_processing_superpoint.py +++ b/src/transformers/models/superpoint/image_processing_superpoint.py @@ -13,11 +13,11 @@ # limitations under the License. """Image processor class for SuperPoint.""" -from typing import Dict, Optional, Union +from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union import numpy as np -from ... import is_vision_available +from ... import is_torch_available, is_vision_available from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict from ...image_transforms import resize, to_channel_dimension_format from ...image_utils import ( @@ -32,6 +32,12 @@ from ...utils import TensorType, logging, requires_backends +if is_torch_available(): + import torch + +if TYPE_CHECKING: + from .modeling_superpoint import SuperPointKeypointDescriptionOutput + if is_vision_available(): import PIL @@ -270,3 +276,52 @@ def preprocess( data = {"pixel_values": images} return BatchFeature(data=data, tensor_type=return_tensors) + + def post_process_keypoint_detection( + self, outputs: "SuperPointKeypointDescriptionOutput", target_sizes: Union[TensorType, List[Tuple]] + ) -> List[Dict[str, "torch.Tensor"]]: + """ + Converts the raw output of [`SuperPointForKeypointDetection`] into lists of keypoints, scores and descriptors + with coordinates absolute to the original image sizes. + + Args: + outputs ([`SuperPointKeypointDescriptionOutput`]): + Raw outputs of the model containing keypoints in a relative (x, y) format, with scores and descriptors. + target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`): + Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size + `(height, width)` of each image in the batch. This must be the original + image size (before any processing). + Returns: + `List[Dict]`: A list of dictionaries, each dictionary containing the keypoints in absolute format according + to target_sizes, scores and descriptors for an image in the batch as predicted by the model. 
+ """ + if len(outputs.mask) != len(target_sizes): + raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the mask") + + if isinstance(target_sizes, List): + image_sizes = torch.tensor(target_sizes) + else: + if target_sizes.shape[1] != 2: + raise ValueError( + "Each element of target_sizes must contain the size (h, w) of each image of the batch" + ) + image_sizes = target_sizes + + # Flip the image sizes to (width, height) and convert keypoints to absolute coordinates + image_sizes = torch.flip(image_sizes, [1]) + masked_keypoints = outputs.keypoints * image_sizes[:, None] + + # Convert masked_keypoints to int + masked_keypoints = masked_keypoints.to(torch.int32) + + results = [] + for image_mask, keypoints, scores, descriptors in zip( + outputs.mask, masked_keypoints, outputs.scores, outputs.descriptors + ): + indices = torch.nonzero(image_mask).squeeze(1) + keypoints = keypoints[indices] + scores = scores[indices] + descriptors = descriptors[indices] + results.append({"keypoints": keypoints, "scores": scores, "descriptors": descriptors}) + + return results diff --git a/src/transformers/models/superpoint/modeling_superpoint.py b/src/transformers/models/superpoint/modeling_superpoint.py index cfd3dfd86e8ee9..1075de299a9f40 100644 --- a/src/transformers/models/superpoint/modeling_superpoint.py +++ b/src/transformers/models/superpoint/modeling_superpoint.py @@ -239,7 +239,10 @@ def _get_pixel_scores(self, encoded: torch.Tensor) -> torch.Tensor: return scores def _extract_keypoints(self, scores: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: - """Based on their scores, extract the pixels that represent the keypoints that will be used for descriptors computation""" + """ + Based on their scores, extract the pixels that represent the keypoints that will be used for descriptors computation. + The keypoints are in the form of relative (x, y) coordinates. 
+ """ _, height, width = scores.shape # Threshold keypoints by score value @@ -447,7 +450,7 @@ def forward( pixel_values = self.extract_one_channel_pixel_values(pixel_values) - batch_size = pixel_values.shape[0] + batch_size, _, height, width = pixel_values.shape encoder_outputs = self.encoder( pixel_values, @@ -485,6 +488,9 @@ def forward( descriptors[i, : _descriptors.shape[0]] = _descriptors mask[i, : _scores.shape[0]] = 1 + # Convert to relative coordinates + keypoints = keypoints / torch.tensor([width, height], device=keypoints.device) + hidden_states = encoder_outputs[1] if output_hidden_states else None if not return_dict: return tuple(v for v in [loss, keypoints, scores, descriptors, mask, hidden_states] if v is not None) diff --git a/tests/models/superpoint/test_image_processing_superpoint.py b/tests/models/superpoint/test_image_processing_superpoint.py index 90bbf82d1ed80a..c2eae872004c77 100644 --- a/tests/models/superpoint/test_image_processing_superpoint.py +++ b/tests/models/superpoint/test_image_processing_superpoint.py @@ -16,7 +16,7 @@ import numpy as np from transformers.testing_utils import require_torch, require_vision -from transformers.utils import is_vision_available +from transformers.utils import is_torch_available, is_vision_available from ...test_image_processing_common import ( ImageProcessingTestMixin, @@ -24,6 +24,11 @@ ) +if is_torch_available(): + import torch + + from transformers.models.superpoint.modeling_superpoint import SuperPointKeypointDescriptionOutput + if is_vision_available(): from transformers import SuperPointImageProcessor @@ -70,6 +75,23 @@ def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=F torchify=torchify, ) + def prepare_keypoint_detection_output(self, pixel_values): + max_number_keypoints = 50 + batch_size = len(pixel_values) + mask = torch.zeros((batch_size, max_number_keypoints)) + keypoints = torch.zeros((batch_size, max_number_keypoints, 2)) + scores = torch.zeros((batch_size, max_number_keypoints)) + descriptors = torch.zeros((batch_size, max_number_keypoints, 16)) + for i in range(batch_size): + random_number_keypoints = np.random.randint(0, max_number_keypoints) + mask[i, :random_number_keypoints] = 1 + keypoints[i, :random_number_keypoints] = torch.rand((random_number_keypoints, 2)) + scores[i, :random_number_keypoints] = torch.rand((random_number_keypoints,)) + descriptors[i, :random_number_keypoints] = torch.rand((random_number_keypoints, 16)) + return SuperPointKeypointDescriptionOutput( + loss=None, keypoints=keypoints, scores=scores, descriptors=descriptors, mask=mask, hidden_states=None + ) + @require_torch @require_vision @@ -110,3 +132,33 @@ def test_input_image_properly_converted_to_grayscale(self): pre_processed_images = image_processor.preprocess(image_inputs) for image in pre_processed_images["pixel_values"]: self.assertTrue(np.all(image[0, ...] == image[1, ...]) and np.all(image[1, ...] 
== image[2, ...])) + + @require_torch + def test_post_processing_keypoint_detection(self): + image_processor = self.image_processing_class.from_dict(self.image_processor_dict) + image_inputs = self.image_processor_tester.prepare_image_inputs() + pre_processed_images = image_processor.preprocess(image_inputs, return_tensors="pt") + outputs = self.image_processor_tester.prepare_keypoint_detection_output(**pre_processed_images) + + def check_post_processed_output(post_processed_output, image_size): + for post_processed_output, image_size in zip(post_processed_output, image_size): + self.assertTrue("keypoints" in post_processed_output) + self.assertTrue("descriptors" in post_processed_output) + self.assertTrue("scores" in post_processed_output) + keypoints = post_processed_output["keypoints"] + all_below_image_size = torch.all(keypoints[:, 0] <= image_size[1]) and torch.all( + keypoints[:, 1] <= image_size[0] + ) + all_above_zero = torch.all(keypoints[:, 0] >= 0) and torch.all(keypoints[:, 1] >= 0) + self.assertTrue(all_below_image_size) + self.assertTrue(all_above_zero) + + tuple_image_sizes = [(image.size[0], image.size[1]) for image in image_inputs] + tuple_post_processed_outputs = image_processor.post_process_keypoint_detection(outputs, tuple_image_sizes) + + check_post_processed_output(tuple_post_processed_outputs, tuple_image_sizes) + + tensor_image_sizes = torch.tensor([image.size for image in image_inputs]).flip(1) + tensor_post_processed_outputs = image_processor.post_process_keypoint_detection(outputs, tensor_image_sizes) + + check_post_processed_output(tensor_post_processed_outputs, tensor_image_sizes) diff --git a/tests/models/superpoint/test_modeling_superpoint.py b/tests/models/superpoint/test_modeling_superpoint.py index 25c384a7955793..8db435502ca565 100644 --- a/tests/models/superpoint/test_modeling_superpoint.py +++ b/tests/models/superpoint/test_modeling_superpoint.py @@ -260,7 +260,7 @@ def test_inference(self): inputs = preprocessor(images=images, return_tensors="pt").to(torch_device) with torch.no_grad(): outputs = model(**inputs) - expected_number_keypoints_image0 = 567 + expected_number_keypoints_image0 = 568 expected_number_keypoints_image1 = 830 expected_max_number_keypoints = max(expected_number_keypoints_image0, expected_number_keypoints_image1) expected_keypoints_shape = torch.Size((len(images), expected_max_number_keypoints, 2)) @@ -275,11 +275,13 @@ def test_inference(self): self.assertEqual(outputs.keypoints.shape, expected_keypoints_shape) self.assertEqual(outputs.scores.shape, expected_scores_shape) self.assertEqual(outputs.descriptors.shape, expected_descriptors_shape) - expected_keypoints_image0_values = torch.tensor([[480.0, 9.0], [494.0, 9.0], [489.0, 16.0]]).to(torch_device) + expected_keypoints_image0_values = torch.tensor([[0.75, 0.0188], [0.7719, 0.0188], [0.7641, 0.0333]]).to( + torch_device + ) expected_scores_image0_values = torch.tensor( - [0.0064, 0.0137, 0.0589, 0.0723, 0.5166, 0.0174, 0.1515, 0.2054, 0.0334] + [0.0064, 0.0139, 0.0591, 0.0727, 0.5170, 0.0175, 0.1526, 0.2057, 0.0335] ).to(torch_device) - expected_descriptors_image0_value = torch.tensor(-0.1096).to(torch_device) + expected_descriptors_image0_value = torch.tensor(-0.1095).to(torch_device) predicted_keypoints_image0_values = outputs.keypoints[0, :3] predicted_scores_image0_values = outputs.scores[0, :9] predicted_descriptors_image0_value = outputs.descriptors[0, 0, 0] From 439334c8fb4edf11314dc94c72dda868f87a0808 Mon Sep 17 00:00:00 2001 From: Yih-Dar 
<2521628+ydshieh@users.noreply.github.com> Date: Tue, 29 Oct 2024 10:48:57 +0100 Subject: [PATCH 134/385] Simplify running tests in a subprocess (#34213) * check * check * check * check * add docstring --------- Co-authored-by: ydshieh --- src/transformers/testing_utils.py | 40 +++++++++++++++++++ .../models/imagegpt/test_modeling_imagegpt.py | 8 ++-- .../video_llava/test_modeling_video_llava.py | 13 ++++-- 3 files changed, 52 insertions(+), 9 deletions(-) diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py index 2781e9e102e050..0eef286732d81c 100644 --- a/src/transformers/testing_utils.py +++ b/src/transformers/testing_utils.py @@ -2366,6 +2366,46 @@ def run_test_in_subprocess(test_case, target_func, inputs=None, timeout=None): test_case.fail(f'{results["error"]}') +def run_test_using_subprocess(func): + """ + To decorate a test to run in a subprocess using the `subprocess` module. This could avoid potential GPU memory + issues (GPU OOM or a test that causes many subsequential failing with `CUDA error: device-side assert triggered`). + """ + import pytest + + @functools.wraps(func) + def wrapper(*args, **kwargs): + if os.getenv("_INSIDE_SUB_PROCESS", None) == "1": + func(*args, **kwargs) + else: + test = " ".join(os.environ.get("PYTEST_CURRENT_TEST").split(" ")[:-1]) + try: + import copy + + env = copy.deepcopy(os.environ) + env["_INSIDE_SUB_PROCESS"] = "1" + + # If not subclass of `unitTest.TestCase` and `pytestconfig` is used: try to grab and use the arguments + if "pytestconfig" in kwargs: + command = list(kwargs["pytestconfig"].invocation_params.args) + for idx, x in enumerate(command): + if x in kwargs["pytestconfig"].args: + test = test.split("::")[1:] + command[idx] = "::".join([f"{func.__globals__['__file__']}"] + test) + command = [f"{sys.executable}", "-m", "pytest"] + command + command = [x for x in command if x not in ["--no-summary"]] + # Otherwise, simply run the test with no option at all + else: + command = [f"{sys.executable}", "-m", "pytest", f"{test}"] + + subprocess.run(command, env=env, check=True, capture_output=True) + except subprocess.CalledProcessError as e: + exception_message = e.stdout.decode() + raise pytest.fail(exception_message, pytrace=False) + + return wrapper + + """ The following contains utils to run the documentation tests without having to overwrite any files. 
diff --git a/tests/models/imagegpt/test_modeling_imagegpt.py b/tests/models/imagegpt/test_modeling_imagegpt.py index 079726755289fe..cdbe815431f319 100644 --- a/tests/models/imagegpt/test_modeling_imagegpt.py +++ b/tests/models/imagegpt/test_modeling_imagegpt.py @@ -18,7 +18,7 @@ import unittest from transformers import ImageGPTConfig -from transformers.testing_utils import require_torch, require_vision, slow, torch_device +from transformers.testing_utils import require_torch, require_vision, run_test_using_subprocess, slow, torch_device from transformers.utils import cached_property, is_torch_available, is_vision_available from ...generation.test_utils import GenerationTesterMixin @@ -257,11 +257,9 @@ def _check_scores(self, batch_size, scores, length, config): self.assertEqual(len(scores), length) self.assertListEqual([iter_scores.shape for iter_scores in scores], [expected_shape] * len(scores)) - @unittest.skip( - reason="After #33632, this test still passes, but many subsequential tests fail with `device-side assert triggered`" - ) + @run_test_using_subprocess def test_beam_search_generate_dict_outputs_use_cache(self): - pass + super().test_beam_search_generate_dict_outputs_use_cache() def setUp(self): self.model_tester = ImageGPTModelTester(self) diff --git a/tests/models/video_llava/test_modeling_video_llava.py b/tests/models/video_llava/test_modeling_video_llava.py index 1bd01843981deb..fd4c49f4a6966d 100644 --- a/tests/models/video_llava/test_modeling_video_llava.py +++ b/tests/models/video_llava/test_modeling_video_llava.py @@ -28,7 +28,14 @@ is_torch_available, is_vision_available, ) -from transformers.testing_utils import require_bitsandbytes, require_torch, require_torch_gpu, slow, torch_device +from transformers.testing_utils import ( + require_bitsandbytes, + require_torch, + require_torch_gpu, + run_test_using_subprocess, + slow, + torch_device, +) from ...generation.test_utils import GenerationTesterMixin from ...test_configuration_common import ConfigTester @@ -248,9 +255,7 @@ def test_flash_attn_2_fp32_ln(self): def test_flash_attention_2_padding_matches_padding_free_with_position_ids(self): pass - @unittest.skip( - reason="After #33533, this still passes, but many subsequential tests fail with `device-side assert triggered`" - ) + @run_test_using_subprocess def test_mixed_input(self): config, inputs = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: From 626c610a4d9d36427d392e0ed70a5c7018900eba Mon Sep 17 00:00:00 2001 From: Martin Gubri <1850174+Framartin@users.noreply.github.com> Date: Tue, 29 Oct 2024 11:10:10 +0100 Subject: [PATCH 135/385] Fix perplexity computation in perplexity.md (#34387) fix average NLL in perplexity.md --- docs/source/en/perplexity.md | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/docs/source/en/perplexity.md b/docs/source/en/perplexity.md index 7555619fe488d2..ac7ef8504e72b6 100644 --- a/docs/source/en/perplexity.md +++ b/docs/source/en/perplexity.md @@ -107,7 +107,8 @@ max_length = model.config.n_positions stride = 512 seq_len = encodings.input_ids.size(1) -nlls = [] +nll_sum = 0.0 +n_tokens = 0 prev_end_loc = 0 for begin_loc in tqdm(range(0, seq_len, stride)): end_loc = min(begin_loc + max_length, seq_len) @@ -124,13 +125,19 @@ for begin_loc in tqdm(range(0, seq_len, stride)): # to the left by 1. 
neg_log_likelihood = outputs.loss - nlls.append(neg_log_likelihood) + # Accumulate the total negative log-likelihood and the total number of tokens + num_valid_tokens = (target_ids != -100).sum().item() # number of valid tokens in target_ids + batch_size = target_ids.size(0) + num_loss_tokens = num_valid_tokens - batch_size # subtract batch_size due to internal label shift + nll_sum += neg_log_likelihood * num_loss_tokens + n_tokens += num_loss_tokens prev_end_loc = end_loc if end_loc == seq_len: break -ppl = torch.exp(torch.stack(nlls).mean()) +avg_nll = nll_sum / n_tokens # average negative log-likelihood per token +ppl = torch.exp(avg_nll) ``` Running this with the stride length equal to the max input length is equivalent to the suboptimal, non-sliding-window @@ -139,5 +146,5 @@ and the better the reported perplexity will typically be. When we run the above with `stride = 1024`, i.e. no overlap, the resulting PPL is `19.44`, which is about the same as the `19.93` reported in the GPT-2 paper. By using `stride = 512` and thereby employing our striding window -strategy, this jumps down to `16.45`. This is not only a more favorable score, but is calculated in a way that is +strategy, this jumps down to `16.44`. This is not only a more favorable score, but is calculated in a way that is closer to the true autoregressive decomposition of a sequence likelihood. From 9e3d704e2340fe9b306b5bd6b12605e4341c012b Mon Sep 17 00:00:00 2001 From: hlky Date: Tue, 29 Oct 2024 10:40:41 +0000 Subject: [PATCH 136/385] Fixes for Modular Converter on Windows (#34266) * Separator in regex * Standardize separator for relative path in auto generated message * open() encoding * Replace `\` on `os.path.abspath` --------- Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> --- utils/modular_model_converter.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/utils/modular_model_converter.py b/utils/modular_model_converter.py index c107a483186231..bda143c2577279 100644 --- a/utils/modular_model_converter.py +++ b/utils/modular_model_converter.py @@ -56,7 +56,7 @@ def get_module_source_from_name(module_name: str) -> str: if spec is None or spec.origin is None: return f"Module {module_name} not found" - with open(spec.origin, "r") as file: + with open(spec.origin, "r", encoding="utf-8") as file: source_code = file.read() return source_code @@ -1132,7 +1132,7 @@ def convert_modular_file(modular_file, old_model_name=None, new_model_name=None, if pattern is not None: model_name = pattern.groups()[0] # Parse the Python file - with open(modular_file, "r") as file: + with open(modular_file, "r", encoding="utf-8") as file: code = file.read() module = cst.parse_module(code) wrapper = MetadataWrapper(module) @@ -1143,7 +1143,7 @@ def convert_modular_file(modular_file, old_model_name=None, new_model_name=None, if node != {}: # Get relative path starting from src/transformers/ relative_path = re.search( - rf"(src{os.sep}transformers{os.sep}.*|examples{os.sep}.*)", os.path.abspath(modular_file) + r"(src/transformers/.*|examples/.*)", os.path.abspath(modular_file).replace("\\", "/") ).group(1) header = AUTO_GENERATED_MESSAGE.format( @@ -1164,7 +1164,7 @@ def save_modeling_file(modular_file, converted_file): [line for line in converted_file[file_type][0].strip().split("\n") if not line.strip().startswith("#")] ) if len(converted_file[file_type][0].strip()) > 0 and non_comment_lines > 0: - with open(modular_file.replace("modular_", f"{file_type}_"), "w") as f: + with 
open(modular_file.replace("modular_", f"{file_type}_"), "w", encoding="utf-8") as f: f.write(converted_file[file_type][0]) else: non_comment_lines = len( @@ -1172,7 +1172,7 @@ def save_modeling_file(modular_file, converted_file): ) if len(converted_file[file_type][1].strip()) > 0 and non_comment_lines > 0: logger.warning("The modeling code contains errors, it's written without formatting") - with open(modular_file.replace("modular_", f"{file_type}_"), "w") as f: + with open(modular_file.replace("modular_", f"{file_type}_"), "w", encoding="utf-8") as f: f.write(converted_file[file_type][1]) From 004530aa050efcdd489f1ac6809626fa578636ad Mon Sep 17 00:00:00 2001 From: Marc Sun <57196510+SunMarc@users.noreply.github.com> Date: Tue, 29 Oct 2024 19:41:04 +0900 Subject: [PATCH 137/385] Fix regression loading dtype (#34409) * fix regression * add test for torchao * expected output * better fix --- src/transformers/modeling_utils.py | 9 +++++---- .../torchao_integration/test_torchao.py | 20 +++++++++++++++++++ 2 files changed, 25 insertions(+), 4 deletions(-) diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index a6fbd7b1a91453..8481fa7df9cd96 100755 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -943,13 +943,14 @@ def _load_state_dict_into_meta_model( old_param = model splits = param_name.split(".") for split in splits: - old_param = getattr(old_param, split) - # Not all the attributes of a module are Parameters/Tensor - if not isinstance(old_param, (torch.nn.Parameter, torch.Tensor)): - old_param = None + # We shouldn't hit the default value unless for quant methods like hqq that modifies expected_keys. + old_param = getattr(old_param, split, None) if old_param is None: break + if not isinstance(old_param, (torch.nn.Parameter, torch.Tensor)): + old_param = None + if old_param is not None: if dtype is None: param = param.to(old_param.dtype) diff --git a/tests/quantization/torchao_integration/test_torchao.py b/tests/quantization/torchao_integration/test_torchao.py index 8014f745d08688..c7c701e49aec14 100644 --- a/tests/quantization/torchao_integration/test_torchao.py +++ b/tests/quantization/torchao_integration/test_torchao.py @@ -208,6 +208,26 @@ def test_int4wo_offload(self): self.assertEqual(tokenizer.decode(output[0], skip_special_tokens=True), EXPECTED_OUTPUT) + def test_int8_dynamic_activation_int8_weight_quant(self): + """ + Simple LLM model testing int8_dynamic_activation_int8_weight + """ + quant_config = TorchAoConfig("int8_dynamic_activation_int8_weight") + + # Note: we quantize the bfloat16 model on the fly to int4 + quantized_model = AutoModelForCausalLM.from_pretrained( + self.model_name, + device_map=torch_device, + quantization_config=quant_config, + ) + tokenizer = AutoTokenizer.from_pretrained(self.model_name) + + input_ids = tokenizer(self.input_text, return_tensors="pt").to(torch_device) + + output = quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens) + EXPECTED_OUTPUT = "What are we having for dinner?\n\nJessica: (smiling)" + self.assertEqual(tokenizer.decode(output[0], skip_special_tokens=True), EXPECTED_OUTPUT) + if __name__ == "__main__": unittest.main() From 5392f12e1614383270ae8df524415a1f6b555773 Mon Sep 17 00:00:00 2001 From: Guang Yang <42389959+guangy10@users.noreply.github.com> Date: Tue, 29 Oct 2024 06:30:02 -0700 Subject: [PATCH 138/385] Bert is ExecuTorch compatible (#34424) Co-authored-by: Guang Yang --- tests/models/bert/test_modeling_bert.py | 42 
+++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/tests/models/bert/test_modeling_bert.py b/tests/models/bert/test_modeling_bert.py index 8ac1c3d2b409d0..aa9835d8cd67c1 100644 --- a/tests/models/bert/test_modeling_bert.py +++ b/tests/models/bert/test_modeling_bert.py @@ -16,6 +16,8 @@ import tempfile import unittest +from packaging import version + from transformers import AutoTokenizer, BertConfig, is_torch_available from transformers.models.auto import get_values from transformers.testing_utils import ( @@ -749,3 +751,43 @@ def test_sdpa_ignored_mask(self): self.assertTrue( torch.allclose(res_eager.last_hidden_state, res_sdpa.last_hidden_state, atol=1e-5, rtol=1e-4) ) + + @slow + def test_export(self): + if version.parse(torch.__version__) < version.parse("2.4.0"): + self.skipTest(reason="This test requires torch >= 2.4 to run.") + + bert_model = "google-bert/bert-base-uncased" + device = "cpu" + attn_implementation = "sdpa" + max_length = 512 + + tokenizer = AutoTokenizer.from_pretrained(bert_model) + inputs = tokenizer( + "the man worked as a [MASK].", + return_tensors="pt", + padding="max_length", + max_length=max_length, + ) + + model = BertForMaskedLM.from_pretrained( + bert_model, + device_map=device, + attn_implementation=attn_implementation, + use_cache=True, + ) + + logits = model(**inputs).logits + eg_predicted_mask = tokenizer.decode(logits[0, 6].topk(5).indices) + self.assertEqual(eg_predicted_mask.split(), ["carpenter", "waiter", "barber", "mechanic", "salesman"]) + + exported_program = torch.export.export( + model, + args=(inputs["input_ids"],), + kwargs={"attention_mask": inputs["attention_mask"]}, + strict=True, + ) + + result = exported_program.module().forward(inputs["input_ids"], inputs["attention_mask"]) + ep_predicted_mask = tokenizer.decode(result.logits[0, 6].topk(5).indices) + self.assertEqual(eg_predicted_mask, ep_predicted_mask) From 8755dd26b7e5ac25987a03627d317624dcdad2a1 Mon Sep 17 00:00:00 2001 From: Doohae Jung <80743307+wavy-jung@users.noreply.github.com> Date: Tue, 29 Oct 2024 22:31:36 +0900 Subject: [PATCH 139/385] manual `head_dim` for `mixtral` model (#34281) --- .../models/mixtral/configuration_mixtral.py | 4 ++++ src/transformers/models/mixtral/modeling_mixtral.py | 13 ++++--------- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/src/transformers/models/mixtral/configuration_mixtral.py b/src/transformers/models/mixtral/configuration_mixtral.py index 164988b4dc524e..686c214ef25ce5 100644 --- a/src/transformers/models/mixtral/configuration_mixtral.py +++ b/src/transformers/models/mixtral/configuration_mixtral.py @@ -53,6 +53,8 @@ class MixtralConfig(PretrainedConfig): converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed by meanpooling all the original heads within that group. For more details checkout [this paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `8`. + head_dim (`int`, *optional*, defaults to `hidden_size // num_attention_heads`): + The attention head dimension. hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): The non-linear activation function (function or string) in the decoder. 
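A toy configuration, invented here purely to illustrate a decoupled `head_dim` (not taken from any released checkpoint): the attention projections are sized from `num_heads * head_dim` rather than from `hidden_size`.

```python
from transformers import MixtralConfig, MixtralForCausalLM

config = MixtralConfig(
    hidden_size=256,
    intermediate_size=512,
    num_hidden_layers=2,
    num_attention_heads=8,
    num_key_value_heads=4,
    num_local_experts=4,
    num_experts_per_tok=2,
    head_dim=64,  # the default would have been hidden_size // num_attention_heads = 32
)
model = MixtralForCausalLM(config)
q_proj = model.model.layers[0].self_attn.q_proj
print(q_proj.weight.shape)  # torch.Size([512, 256]): num_heads * head_dim x hidden_size
```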
max_position_embeddings (`int`, *optional*, defaults to `4096*32`): @@ -116,6 +118,7 @@ def __init__( num_hidden_layers=32, num_attention_heads=32, num_key_value_heads=8, + head_dim=None, hidden_act="silu", max_position_embeddings=4096 * 32, initializer_range=0.02, @@ -154,6 +157,7 @@ def __init__( self.use_cache = use_cache self.rope_theta = rope_theta self.attention_dropout = attention_dropout + self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads self.num_experts_per_tok = num_experts_per_tok self.num_local_experts = num_local_experts diff --git a/src/transformers/models/mixtral/modeling_mixtral.py b/src/transformers/models/mixtral/modeling_mixtral.py index 78a17178ecdda8..de1cd1097a53ff 100644 --- a/src/transformers/models/mixtral/modeling_mixtral.py +++ b/src/transformers/models/mixtral/modeling_mixtral.py @@ -283,7 +283,7 @@ def __init__(self, config: MixtralConfig, layer_idx: Optional[int] = None): self.hidden_size = config.hidden_size self.num_heads = config.num_attention_heads - self.head_dim = self.hidden_size // self.num_heads + self.head_dim = config.head_dim self.num_key_value_heads = config.num_key_value_heads self.num_key_value_groups = self.num_heads // self.num_key_value_heads self.max_position_embeddings = config.max_position_embeddings @@ -291,11 +291,6 @@ def __init__(self, config: MixtralConfig, layer_idx: Optional[int] = None): self.is_causal = True self.attention_dropout = config.attention_dropout - if (self.head_dim * self.num_heads) != self.hidden_size: - raise ValueError( - f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" - f" and `num_heads`: {self.num_heads})." - ) self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False) self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) @@ -374,7 +369,7 @@ def forward( ) attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + attn_output = attn_output.reshape(bsz, q_len, -1) attn_output = self.o_proj(attn_output) @@ -481,7 +476,7 @@ def forward( is_causal=self.is_causal, ) - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous() + attn_output = attn_output.reshape(bsz, q_len, -1).contiguous() attn_output = self.o_proj(attn_output) if not output_attentions: @@ -575,7 +570,7 @@ def forward( ) attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.view(bsz, q_len, self.hidden_size) + attn_output = attn_output.view(bsz, q_len, -1) attn_output = self.o_proj(attn_output) From 0ab0a4265131536d7422c57d0cc74c2afee1afd9 Mon Sep 17 00:00:00 2001 From: Shijie <821898965@qq.com> Date: Tue, 29 Oct 2024 22:27:34 +0800 Subject: [PATCH 140/385] fix-qwen2vl-no-position_ids (#33487) --- src/transformers/models/qwen2_vl/modeling_qwen2_vl.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py b/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py index 90bf29c8b5d66a..17e722a217dfd6 100644 --- a/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py +++ b/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py @@ -1719,6 +1719,9 @@ def forward( if attention_mask is not None: attention_mask = attention_mask.to(inputs_embeds.device) + if position_ids is None and input_ids is not None: + position_ids, _ = self.get_rope_index(input_ids, 
image_grid_thw, video_grid_thw, attention_mask) + outputs = self.model( input_ids=None, position_ids=position_ids, From 56c45d575786de60acba02838fb2b0d1176b4ff7 Mon Sep 17 00:00:00 2001 From: Abhijit Deo <72816663+abhi-glitchhg@users.noreply.github.com> Date: Tue, 29 Oct 2024 20:39:18 +0530 Subject: [PATCH 141/385] Bug fix for drop path decay rate in swin transformer (#34291) * potential bug fix for drop path * variable name change * forgot to rename the variables * back to original * modify dpr properly * check_copies auto fix * corresponsing swin2 changes * auto fix * linting * default value for drop_path_rate as 0.0 * Update src/transformers/models/glm/modeling_glm.py * maskformer fix * ruff format * changes made to tf code as well * lint --------- Co-authored-by: abhijit deo <167164474+deo-abhijit@users.noreply.github.com> --- src/transformers/models/clap/modeling_clap.py | 5 +++-- .../models/donut/modeling_donut_swin.py | 5 +++-- .../models/maskformer/modeling_maskformer_swin.py | 7 +++---- src/transformers/models/swin/modeling_swin.py | 5 +++-- src/transformers/models/swin/modeling_tf_swin.py | 14 +++++++++++--- .../models/swin2sr/modeling_swin2sr.py | 6 ++++-- src/transformers/models/swinv2/modeling_swinv2.py | 7 +++++-- 7 files changed, 32 insertions(+), 17 deletions(-) diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index d0224e3caa5b28..f422b17b204f13 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -575,7 +575,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: # Copied from transformers.models.swin.modeling_swin.SwinLayer with SwinDropPath->ClapDropPath, Swin->ClapAudio class ClapAudioLayer(nn.Module): - def __init__(self, config, dim, input_resolution, num_heads, shift_size=0): + def __init__(self, config, dim, input_resolution, num_heads, drop_path_rate=0.0, shift_size=0): super().__init__() self.chunk_size_feed_forward = config.chunk_size_feed_forward self.shift_size = shift_size @@ -583,7 +583,7 @@ def __init__(self, config, dim, input_resolution, num_heads, shift_size=0): self.input_resolution = input_resolution self.layernorm_before = nn.LayerNorm(dim, eps=config.layer_norm_eps) self.attention = ClapAudioAttention(config, dim, num_heads, window_size=self.window_size) - self.drop_path = ClapDropPath(config.drop_path_rate) if config.drop_path_rate > 0.0 else nn.Identity() + self.drop_path = ClapDropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity() self.layernorm_after = nn.LayerNorm(dim, eps=config.layer_norm_eps) self.intermediate = ClapAudioIntermediate(config, dim) self.output = ClapAudioOutput(config, dim) @@ -712,6 +712,7 @@ def __init__(self, config, dim, input_resolution, depth, num_heads, drop_path, d dim=dim, input_resolution=input_resolution, num_heads=num_heads, + drop_path_rate=drop_path[i], shift_size=0 if (i % 2 == 0) else config.window_size // 2, ) for i in range(depth) diff --git a/src/transformers/models/donut/modeling_donut_swin.py b/src/transformers/models/donut/modeling_donut_swin.py index 8d639131b841ca..2d5272e8642ee5 100644 --- a/src/transformers/models/donut/modeling_donut_swin.py +++ b/src/transformers/models/donut/modeling_donut_swin.py @@ -558,7 +558,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: # Copied from transformers.models.swin.modeling_swin.SwinLayer with Swin->DonutSwin class DonutSwinLayer(nn.Module): - def __init__(self, config, dim, input_resolution, num_heads, 
shift_size=0): + def __init__(self, config, dim, input_resolution, num_heads, drop_path_rate=0.0, shift_size=0): super().__init__() self.chunk_size_feed_forward = config.chunk_size_feed_forward self.shift_size = shift_size @@ -566,7 +566,7 @@ def __init__(self, config, dim, input_resolution, num_heads, shift_size=0): self.input_resolution = input_resolution self.layernorm_before = nn.LayerNorm(dim, eps=config.layer_norm_eps) self.attention = DonutSwinAttention(config, dim, num_heads, window_size=self.window_size) - self.drop_path = DonutSwinDropPath(config.drop_path_rate) if config.drop_path_rate > 0.0 else nn.Identity() + self.drop_path = DonutSwinDropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity() self.layernorm_after = nn.LayerNorm(dim, eps=config.layer_norm_eps) self.intermediate = DonutSwinIntermediate(config, dim) self.output = DonutSwinOutput(config, dim) @@ -695,6 +695,7 @@ def __init__(self, config, dim, input_resolution, depth, num_heads, drop_path, d dim=dim, input_resolution=input_resolution, num_heads=num_heads, + drop_path_rate=drop_path[i], shift_size=0 if (i % 2 == 0) else config.window_size // 2, ) for i in range(depth) diff --git a/src/transformers/models/maskformer/modeling_maskformer_swin.py b/src/transformers/models/maskformer/modeling_maskformer_swin.py index 9a40e050459816..598e1d8186a24a 100644 --- a/src/transformers/models/maskformer/modeling_maskformer_swin.py +++ b/src/transformers/models/maskformer/modeling_maskformer_swin.py @@ -520,16 +520,14 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: class MaskFormerSwinLayer(nn.Module): - def __init__(self, config, dim, input_resolution, num_heads, shift_size=0): + def __init__(self, config, dim, input_resolution, num_heads, drop_path_rate=0.0, shift_size=0): super().__init__() self.shift_size = shift_size self.window_size = config.window_size self.input_resolution = input_resolution self.layernorm_before = nn.LayerNorm(dim, eps=config.layer_norm_eps) self.attention = MaskFormerSwinAttention(config, dim, num_heads, self.window_size) - self.drop_path = ( - MaskFormerSwinDropPath(config.drop_path_rate) if config.drop_path_rate > 0.0 else nn.Identity() - ) + self.drop_path = MaskFormerSwinDropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity() self.layernorm_after = nn.LayerNorm(dim, eps=config.layer_norm_eps) self.intermediate = MaskFormerSwinIntermediate(config, dim) self.output = MaskFormerSwinOutput(config, dim) @@ -644,6 +642,7 @@ def __init__(self, config, dim, input_resolution, depth, num_heads, drop_path, d dim=dim, input_resolution=input_resolution, num_heads=num_heads, + drop_path_rate=drop_path[i], shift_size=0 if (i % 2 == 0) else config.window_size // 2, ) for i in range(depth) diff --git a/src/transformers/models/swin/modeling_swin.py b/src/transformers/models/swin/modeling_swin.py index 45383a36d9bea8..23f0ba6da620cd 100644 --- a/src/transformers/models/swin/modeling_swin.py +++ b/src/transformers/models/swin/modeling_swin.py @@ -635,7 +635,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: class SwinLayer(nn.Module): - def __init__(self, config, dim, input_resolution, num_heads, shift_size=0): + def __init__(self, config, dim, input_resolution, num_heads, drop_path_rate=0.0, shift_size=0): super().__init__() self.chunk_size_feed_forward = config.chunk_size_feed_forward self.shift_size = shift_size @@ -643,7 +643,7 @@ def __init__(self, config, dim, input_resolution, num_heads, shift_size=0): self.input_resolution = input_resolution 
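For context, the per-block `drop_path_rate` in Swin-style encoders typically follows a linearly increasing stochastic-depth schedule over all blocks. A small self-contained sketch (function name and grouping are illustrative, not taken from the file above):

```python
import torch

def drop_path_schedule(drop_path_rate, depths):
    """Linearly ramp the drop-path rate from 0 to `drop_path_rate`, one value per block."""
    rates = [x.item() for x in torch.linspace(0.0, drop_path_rate, sum(depths))]
    per_stage, start = [], 0
    for depth in depths:
        per_stage.append(rates[start : start + depth])
        start += depth
    return per_stage

print(drop_path_schedule(0.2, [2, 2, 6, 2]))
# the earliest block gets ~0.0, the deepest block gets the full 0.2
```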
self.layernorm_before = nn.LayerNorm(dim, eps=config.layer_norm_eps) self.attention = SwinAttention(config, dim, num_heads, window_size=self.window_size) - self.drop_path = SwinDropPath(config.drop_path_rate) if config.drop_path_rate > 0.0 else nn.Identity() + self.drop_path = SwinDropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity() self.layernorm_after = nn.LayerNorm(dim, eps=config.layer_norm_eps) self.intermediate = SwinIntermediate(config, dim) self.output = SwinOutput(config, dim) @@ -771,6 +771,7 @@ def __init__(self, config, dim, input_resolution, depth, num_heads, drop_path, d dim=dim, input_resolution=input_resolution, num_heads=num_heads, + drop_path_rate=drop_path[i], shift_size=0 if (i % 2 == 0) else config.window_size // 2, ) for i in range(depth) diff --git a/src/transformers/models/swin/modeling_tf_swin.py b/src/transformers/models/swin/modeling_tf_swin.py index 035b31e8d43b80..f1aa0bfef743ad 100644 --- a/src/transformers/models/swin/modeling_tf_swin.py +++ b/src/transformers/models/swin/modeling_tf_swin.py @@ -742,7 +742,14 @@ def build(self, input_shape=None): class TFSwinLayer(keras.layers.Layer): def __init__( - self, config, dim, input_resolution: Tuple[int, int], num_heads: int, shift_size: int = 0, **kwargs + self, + config, + dim, + input_resolution: Tuple[int, int], + num_heads: int, + drop_path_rate: float = 0.0, + shift_size: int = 0, + **kwargs, ) -> None: super().__init__(**kwargs) self.chunk_size_feed_forward = config.chunk_size_feed_forward @@ -754,8 +761,8 @@ def __init__( self.layernorm_before = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm_before") self.attention = TFSwinAttention(config, dim, num_heads, name="attention") self.drop_path = ( - TFSwinDropPath(config.drop_path_rate, name="drop_path") - if config.drop_path_rate > 0.0 + TFSwinDropPath(drop_path_rate, name="drop_path") + if drop_path_rate > 0.0 else keras.layers.Activation("linear", name="drop_path") ) self.layernorm_after = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm_after") @@ -913,6 +920,7 @@ def __init__( input_resolution=input_resolution, num_heads=num_heads, shift_size=0 if (i % 2 == 0) else config.window_size // 2, + drop_path_rate=drop_path[i], name=f"blocks.{i}", ) for i in range(depth) diff --git a/src/transformers/models/swin2sr/modeling_swin2sr.py b/src/transformers/models/swin2sr/modeling_swin2sr.py index b0a773c8af3472..d6bd8da9bed638 100644 --- a/src/transformers/models/swin2sr/modeling_swin2sr.py +++ b/src/transformers/models/swin2sr/modeling_swin2sr.py @@ -482,7 +482,9 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: # Copied from transformers.models.swinv2.modeling_swinv2.Swinv2Layer with Swinv2->Swin2SR class Swin2SRLayer(nn.Module): - def __init__(self, config, dim, input_resolution, num_heads, shift_size=0, pretrained_window_size=0): + def __init__( + self, config, dim, input_resolution, num_heads, drop_path_rate=0.0, shift_size=0, pretrained_window_size=0 + ): super().__init__() self.input_resolution = input_resolution window_size, shift_size = self._compute_window_shift( @@ -500,7 +502,7 @@ def __init__(self, config, dim, input_resolution, num_heads, shift_size=0, pretr else (pretrained_window_size, pretrained_window_size), ) self.layernorm_before = nn.LayerNorm(dim, eps=config.layer_norm_eps) - self.drop_path = Swin2SRDropPath(config.drop_path_rate) if config.drop_path_rate > 0.0 else nn.Identity() + self.drop_path = Swin2SRDropPath(drop_path_rate) if drop_path_rate > 0.0 
else nn.Identity() self.intermediate = Swin2SRIntermediate(config, dim) self.output = Swin2SROutput(config, dim) self.layernorm_after = nn.LayerNorm(dim, eps=config.layer_norm_eps) diff --git a/src/transformers/models/swinv2/modeling_swinv2.py b/src/transformers/models/swinv2/modeling_swinv2.py index 0c30e739a48f91..191923958cfbde 100644 --- a/src/transformers/models/swinv2/modeling_swinv2.py +++ b/src/transformers/models/swinv2/modeling_swinv2.py @@ -683,7 +683,9 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: class Swinv2Layer(nn.Module): - def __init__(self, config, dim, input_resolution, num_heads, shift_size=0, pretrained_window_size=0): + def __init__( + self, config, dim, input_resolution, num_heads, drop_path_rate=0.0, shift_size=0, pretrained_window_size=0 + ): super().__init__() self.input_resolution = input_resolution window_size, shift_size = self._compute_window_shift( @@ -701,7 +703,7 @@ def __init__(self, config, dim, input_resolution, num_heads, shift_size=0, pretr else (pretrained_window_size, pretrained_window_size), ) self.layernorm_before = nn.LayerNorm(dim, eps=config.layer_norm_eps) - self.drop_path = Swinv2DropPath(config.drop_path_rate) if config.drop_path_rate > 0.0 else nn.Identity() + self.drop_path = Swinv2DropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity() self.intermediate = Swinv2Intermediate(config, dim) self.output = Swinv2Output(config, dim) self.layernorm_after = nn.LayerNorm(dim, eps=config.layer_norm_eps) @@ -819,6 +821,7 @@ def __init__( dim=dim, input_resolution=input_resolution, num_heads=num_heads, + drop_path_rate=drop_path[i], shift_size=0 if (i % 2 == 0) else config.window_size // 2, pretrained_window_size=pretrained_window_size, ) From 34620e8f0a974761debf52093968107c14f41315 Mon Sep 17 00:00:00 2001 From: Guang Yang <42389959+guangy10@users.noreply.github.com> Date: Tue, 29 Oct 2024 08:14:31 -0700 Subject: [PATCH 142/385] MobileBERT is ExecuTorch compatible (#34473) Co-authored-by: Guang Yang --- .../mobilebert/test_modeling_mobilebert.py | 43 ++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) diff --git a/tests/models/mobilebert/test_modeling_mobilebert.py b/tests/models/mobilebert/test_modeling_mobilebert.py index d7a409427c9c51..d2bc11d09f1797 100644 --- a/tests/models/mobilebert/test_modeling_mobilebert.py +++ b/tests/models/mobilebert/test_modeling_mobilebert.py @@ -16,7 +16,9 @@ import unittest -from transformers import MobileBertConfig, is_torch_available +from packaging import version + +from transformers import AutoTokenizer, MobileBertConfig, MobileBertForMaskedLM, is_torch_available from transformers.models.auto import get_values from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device @@ -384,3 +386,42 @@ def test_inference_no_head(self): upper_bound = torch.all((expected_slice / output[..., :3, :3]) <= 1 + TOLERANCE) self.assertTrue(lower_bound and upper_bound) + + @slow + def test_export(self): + if version.parse(torch.__version__) < version.parse("2.4.0"): + self.skipTest(reason="This test requires torch >= 2.4 to run.") + + mobilebert_model = "google/mobilebert-uncased" + device = "cpu" + attn_implementation = "eager" + max_length = 512 + + tokenizer = AutoTokenizer.from_pretrained(mobilebert_model) + inputs = tokenizer( + f"the man worked as a {tokenizer.mask_token}.", + return_tensors="pt", + padding="max_length", + max_length=max_length, + ) + + model = MobileBertForMaskedLM.from_pretrained( + mobilebert_model, + 
device_map=device, + attn_implementation=attn_implementation, + ) + + logits = model(**inputs).logits + eg_predicted_mask = tokenizer.decode(logits[0, 6].topk(5).indices) + self.assertEqual(eg_predicted_mask.split(), ["carpenter", "waiter", "mechanic", "teacher", "clerk"]) + + exported_program = torch.export.export( + model, + args=(inputs["input_ids"],), + kwargs={"attention_mask": inputs["attention_mask"]}, + strict=True, + ) + + result = exported_program.module().forward(inputs["input_ids"], inputs["attention_mask"]) + ep_predicted_mask = tokenizer.decode(result.logits[0, 6].topk(5).indices) + self.assertEqual(eg_predicted_mask, ep_predicted_mask) From f339042b0b8bdc0b57a70d37f67cafbea960a2ab Mon Sep 17 00:00:00 2001 From: Guang Yang <42389959+guangy10@users.noreply.github.com> Date: Tue, 29 Oct 2024 08:22:13 -0700 Subject: [PATCH 143/385] Albert is ExecuTorch compatible (#34476) Co-authored-by: Guang Yang --- tests/models/albert/test_modeling_albert.py | 46 ++++++++++++++++++++- 1 file changed, 45 insertions(+), 1 deletion(-) diff --git a/tests/models/albert/test_modeling_albert.py b/tests/models/albert/test_modeling_albert.py index d1e5631b342d33..970f1dd8555e47 100644 --- a/tests/models/albert/test_modeling_albert.py +++ b/tests/models/albert/test_modeling_albert.py @@ -16,7 +16,9 @@ import unittest -from transformers import AlbertConfig, is_torch_available +from packaging import version + +from transformers import AlbertConfig, AutoTokenizer, is_torch_available from transformers.models.auto import get_values from transformers.testing_utils import require_torch, slow, torch_device @@ -342,3 +344,45 @@ def test_inference_no_head_absolute_embedding(self): ) self.assertTrue(torch.allclose(output[:, 1:4, 1:4], expected_slice, atol=1e-4)) + + @slow + def test_export(self): + if version.parse(torch.__version__) < version.parse("2.4.0"): + self.skipTest(reason="This test requires torch >= 2.4 to run.") + + distilbert_model = "albert/albert-base-v2" + device = "cpu" + attn_implementation = "sdpa" + max_length = 64 + + tokenizer = AutoTokenizer.from_pretrained(distilbert_model) + inputs = tokenizer( + f"Paris is the {tokenizer.mask_token} of France.", + return_tensors="pt", + padding="max_length", + max_length=max_length, + ) + + model = AlbertForMaskedLM.from_pretrained( + distilbert_model, + device_map=device, + attn_implementation=attn_implementation, + ) + + logits = model(**inputs).logits + eg_predicted_mask = tokenizer.decode(logits[0, 4].topk(5).indices) + self.assertEqual( + eg_predicted_mask.split(), + ["capital", "capitol", "comune", "arrondissement", "bastille"], + ) + + exported_program = torch.export.export( + model, + args=(inputs["input_ids"],), + kwargs={"attention_mask": inputs["attention_mask"]}, + strict=True, + ) + + result = exported_program.module().forward(inputs["input_ids"], inputs["attention_mask"]) + ep_predicted_mask = tokenizer.decode(result.logits[0, 4].topk(5).indices) + self.assertEqual(eg_predicted_mask, ep_predicted_mask) From e9ad46049411624bb1b6e830fbc1138991c0135e Mon Sep 17 00:00:00 2001 From: Apoorv Khandelwal Date: Tue, 29 Oct 2024 11:23:16 -0400 Subject: [PATCH 144/385] Adding `optimizer_cls_and_kwargs` to `Trainer.__init__` (#34358) * Adding `optimizer_cls_and_kwargs` to `Trainer.__init__` * formatting * make fix-copies docstring * added more docs for optimizer_cls_and_kwargs * add docs for Trainer(optimizer_cls_and_kwargs) * reverting anchor names --- docs/source/en/trainer.md | 106 +++++++++++++++++++++++------------- src/transformers/trainer.py | 
18 +++++- 2 files changed, 82 insertions(+), 42 deletions(-) diff --git a/docs/source/en/trainer.md b/docs/source/en/trainer.md index f9ea3337699444..7bee3472892727 100644 --- a/docs/source/en/trainer.md +++ b/docs/source/en/trainer.md @@ -252,7 +252,70 @@ trainer = Trainer(..., args=training_args) NEFTune is disabled after training to restore the original embedding layer to avoid any unexpected behavior. -## GaLore +## Liger Kernel + +[Liger-Kernel](https://github.com/linkedin/Liger-Kernel) Kernel is a collection of Triton kernels developed by Linkedin designed specifically for LLM training. We have implemented Hugging Face Compatible RMSNorm, RoPE, SwiGLU, CrossEntropy, FusedLinearCrossEntropy, and more to come. It can effectively increase multi-GPU training throughput by 20% and reduces memory usage by 60%. The kernel works out of the box with flash attention, PyTorch FSDP, and Microsoft DeepSpeed. + + +Gain +20% throughput and reduce memory usage by 60% on LLaMA 3-8B model training. Achieve longer context lengths and larger batch sizes. It’s also useful if you want to scale up your model to multi-head training or large vocabulary sizes. Unleash multi-head training (medusa) and more. See details and examples in [Liger](https://github.com/linkedin/Liger-Kernel/tree/main/examples) + + +First make sure to install Liger official repository: +```bash +pip install liger-kernel +``` + +You should pass `use_liger_kernel=True` to apply liger kernel on your model, for example: + +```py +from transformers import TrainingArguments + +training_args = TrainingArguments( + output_dir="your-model", + learning_rate=2e-5, + per_device_train_batch_size=16, + per_device_eval_batch_size=16, + num_train_epochs=2, + weight_decay=0.01, + eval_strategy="epoch", + save_strategy="epoch", + load_best_model_at_end=True, + push_to_hub=True, + use_liger_kernel=True +) +``` + +The kernel supports the Llama, Gemma, Mistral, and Mixtral model architectures. The most up-to-date list of supported models can be found [here](https://github.com/linkedin/Liger-Kernel). When `use_liger_kernel` is set to `True`, the corresponding layers in the original model will be patched with Liger's efficient implementation, so you don't need to do anything extra other than setting the argument value. + + +## Optimizers + +You can choose a built-in optimizer for training using: + +```python +from transformers import TrainingArguments +training_args = TrainingArguments(..., optim="adamw_torch") +``` + +See [`OptimizerNames`](https://github.com/huggingface/transformers/blob/main/src/transformers/training_args.py) for a full list of choices. We include advanced examples in the sections below. + +You can also use an arbitrary PyTorch optimizer via: + +```python +import torch + +optimizer_cls = torch.optim.AdamW +optimizer_kwargs = { + "lr": 4e-3, + "betas": (0.9, 0.999), + "weight_decay": 0.05, +} + +from transformers import Trainer +trainer = Trainer(..., optimizer_cls_and_kwargs=(optimizer_cls, optimizer_kwargs)) +``` + +### GaLore Gradient Low-Rank Projection (GaLore) is a memory-efficient low-rank training strategy that allows full-parameter learning but is more memory-efficient than common low-rank adaptation methods, such as LoRA. @@ -382,42 +445,7 @@ trainer.train() Note layerwise optimization is a bit experimental and does not support DDP (Distributed Data Parallel), thus you can run the training script only on a single GPU. 
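A minimal sketch of the layerwise setup (values are placeholders; `galore_torch` must be installed, and the `optim_target_modules` patterns depend on the model architecture):

```python
from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="./galore-layerwise",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=1,
    optim="galore_adamw_layerwise",
    optim_target_modules=["attn", "mlp"],  # modules whose 2D weights get the low-rank projection
)
```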
Please see [this appropriate section](https://github.com/jiaweizzhao/GaLore?tab=readme-ov-file#train-7b-model-with-a-single-gpu-with-24gb-memory) for more details. Other features such as gradient clipping, DeepSpeed, etc might not be supported out of the box. Please [raise an issue on GitHub](https://github.com/huggingface/transformers/issues) if you encounter such issue. -## Liger Kernel - -[Liger-Kernel](https://github.com/linkedin/Liger-Kernel) Kernel is a collection of Triton kernels developed by Linkedin designed specifically for LLM training. We have implemented Hugging Face Compatible RMSNorm, RoPE, SwiGLU, CrossEntropy, FusedLinearCrossEntropy, and more to come. It can effectively increase multi-GPU training throughput by 20% and reduces memory usage by 60%. The kernel works out of the box with flash attention, PyTorch FSDP, and Microsoft DeepSpeed. - - -Gain +20% throughput and reduce memory usage by 60% on LLaMA 3-8B model training. Achieve longer context lengths and larger batch sizes. It’s also useful if you want to scale up your model to multi-head training or large vocabulary sizes. Unleash multi-head training (medusa) and more. See details and examples in [Liger](https://github.com/linkedin/Liger-Kernel/tree/main/examples) - - -First make sure to install Liger official repository: -```bash -pip install liger-kernel -``` - -You should pass `use_liger_kernel=True` to apply liger kernel on your model, for example: - -```py -from transformers import TrainingArguments - -training_args = TrainingArguments( - output_dir="your-model", - learning_rate=2e-5, - per_device_train_batch_size=16, - per_device_eval_batch_size=16, - num_train_epochs=2, - weight_decay=0.01, - eval_strategy="epoch", - save_strategy="epoch", - load_best_model_at_end=True, - push_to_hub=True, - use_liger_kernel=True -) -``` - -The kernel supports the Llama, Gemma, Mistral, and Mixtral model architectures. The most up-to-date list of supported models can be found [here](https://github.com/linkedin/Liger-Kernel). When `use_liger_kernel` is set to `True`, the corresponding layers in the original model will be patched with Liger's efficient implementation, so you don't need to do anything extra other than setting the argument value. - -## LOMO optimizer +### LOMO optimizer The LOMO optimizers have been introduced in [Full Parameter Fine-Tuning for Large Language Models with Limited Resources](https://hf.co/papers/2306.09782) and [AdaLomo: Low-memory Optimization with Adaptive Learning Rate](https://hf.co/papers/2310.10195). They both consist of an efficient full-parameter fine-tuning method. These optimizers fuse the gradient computation and the parameter update in one step to reduce memory usage. Supported optimizers for LOMO are `"lomo"` and `"adalomo"`. First either install LOMO from pypi `pip install lomo-optim` or install it from source with `pip install git+https://github.com/OpenLMLab/LOMO.git`. @@ -467,7 +495,7 @@ trainer = trl.SFTTrainer( trainer.train() ``` -## GrokAdamW optimizer +### GrokAdamW optimizer The GrokAdamW optimizer is designed to enhance training performance and stability, particularly for models that benefit from grokking signal functions. To use GrokAdamW, first install the optimizer package with `pip install grokadamw`. @@ -518,7 +546,7 @@ trainer.train() This script demonstrates how to fine-tune the `google/gemma-2b` model on the IMDB dataset using the GrokAdamW optimizer. The `TrainingArguments` are configured to use GrokAdamW, and the dataset is passed to the `Trainer` for training. 
-## Schedule Free Optimizer +### Schedule Free Optimizer The Schedule Free optimizers have been introduced in [The Road Less Scheduled](https://hf.co/papers/2405.15682). Schedule-Free learning replaces the momentum of the base optimizer with a combination of averaging and interpolation, to completely remove the need to anneal the learning rate with a traditional schedule. diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 9176bd72a55032..e2ae622e2b6bf3 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -34,7 +34,7 @@ import warnings from collections.abc import Mapping from pathlib import Path -from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Type, Union # Integrations must be imported before ML frameworks: @@ -358,6 +358,11 @@ class Trainer: optimizers (`Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`, *optional*, defaults to `(None, None)`): A tuple containing the optimizer and the scheduler to use. Will default to an instance of [`AdamW`] on your model and a scheduler given by [`get_linear_schedule_with_warmup`] controlled by `args`. + optimizer_cls_and_kwargs (`Tuple[Type[torch.optim.Optimizer], Dict[str, Any]]`, *optional*): + A tuple containing the optimizer class and keyword arguments to use. + Overrides `optim` and `optim_args` in `args`. Incompatible with the `optimizers` argument. + + Unlike `optimizers`, this argument avoids the need to place model parameters on the correct devices before initializing the Trainer. preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`, *optional*): A function that preprocess the logits right before caching them at each evaluation step. Must take two tensors, the logits and the labels, and return the logits once processed as desired. The modifications made @@ -401,7 +406,8 @@ def __init__( compute_loss_func: Optional[Callable] = None, compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None, callbacks: Optional[List[TrainerCallback]] = None, - optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None), + optimizers: Tuple[Optional[torch.optim.Optimizer], Optional[torch.optim.lr_scheduler.LambdaLR]] = (None, None), + optimizer_cls_and_kwargs: Optional[Tuple[Type[torch.optim.Optimizer], Dict[str, Any]]] = None, preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None, ): if args is None: @@ -603,6 +609,9 @@ def __init__( self.compute_metrics = compute_metrics self.preprocess_logits_for_metrics = preprocess_logits_for_metrics self.optimizer, self.lr_scheduler = optimizers + self.optimizer_cls_and_kwargs = optimizer_cls_and_kwargs + if self.optimizer_cls_and_kwargs is not None and self.optimizer is not None: + raise RuntimeError("Passing both `optimizers` and `optimizer_cls_and_kwargs` arguments is incompatible.") if model_init is not None and (self.optimizer is not None or self.lr_scheduler is not None): raise RuntimeError( "Passing a `model_init` is incompatible with providing the `optimizers` argument. 
" @@ -1171,7 +1180,10 @@ def create_optimizer(self): }, ] - optimizer_cls, optimizer_kwargs = self.get_optimizer_cls_and_kwargs(self.args, opt_model) + if self.optimizer_cls_and_kwargs is not None: + optimizer_cls, optimizer_kwargs = self.optimizer_cls_and_kwargs + else: + optimizer_cls, optimizer_kwargs = self.get_optimizer_cls_and_kwargs(self.args, opt_model) # Overwrite `params` in case it's created by `get_optimizer_cls_and_kwargs` # e.g. for GaLore optimizer. From 4e2e8809ff4a596f7a9398c04293804a01cbe1d5 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 29 Oct 2024 16:42:40 +0000 Subject: [PATCH 145/385] Bump werkzeug from 3.0.3 to 3.0.6 in /examples/research_projects/decision_transformer (#34420) Bump werkzeug in /examples/research_projects/decision_transformer Bumps [werkzeug](https://github.com/pallets/werkzeug) from 3.0.3 to 3.0.6. - [Release notes](https://github.com/pallets/werkzeug/releases) - [Changelog](https://github.com/pallets/werkzeug/blob/main/CHANGES.rst) - [Commits](https://github.com/pallets/werkzeug/compare/3.0.3...3.0.6) --- updated-dependencies: - dependency-name: werkzeug dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .../research_projects/decision_transformer/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/research_projects/decision_transformer/requirements.txt b/examples/research_projects/decision_transformer/requirements.txt index a54f3d03cab21b..6d42c3256a83e9 100644 --- a/examples/research_projects/decision_transformer/requirements.txt +++ b/examples/research_projects/decision_transformer/requirements.txt @@ -233,7 +233,7 @@ urllib3==1.26.19 wasabi==0.9.0 wcwidth==0.2.5 websocket-client==1.3.1 -Werkzeug==3.0.3 +Werkzeug==3.0.6 wrapt==1.14.0 xxhash==3.0.0 yarl==1.7.2 From f55595b177d66c3f8d9d43c8cccd52d263580bd1 Mon Sep 17 00:00:00 2001 From: Aleksey Lobanov Date: Tue, 29 Oct 2024 20:29:24 +0300 Subject: [PATCH 146/385] Fix performance in get_imports regexp (#34298) * fix: Fix performance in get_imports regexp * Minimize get_imports content regexp --- src/transformers/dynamic_module_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/transformers/dynamic_module_utils.py b/src/transformers/dynamic_module_utils.py index 4e0e1dd3430209..bf44d4b427cf7b 100644 --- a/src/transformers/dynamic_module_utils.py +++ b/src/transformers/dynamic_module_utils.py @@ -152,7 +152,8 @@ def get_imports(filename: Union[str, os.PathLike]) -> List[str]: content = f.read() # filter out try/except block so in custom code we can have try/except imports - content = re.sub(r"\s*try\s*:\s*.*?\s*except\s*.*?:", "", content, flags=re.MULTILINE | re.DOTALL) + content = re.sub(r"\s*try\s*:.*?except.*?:", "", content, flags=re.DOTALL) + # filter out imports under is_flash_attn_2_available block for avoid import issues in cpu only environment content = re.sub( r"if is_flash_attn[a-zA-Z0-9_]+available\(\):\s*(from flash_attn\s*.*\s*)+", "", content, flags=re.MULTILINE From e4449bb790db5cb27a6cb36581991cf5e75209a9 Mon Sep 17 00:00:00 2001 From: Yoni Gozlan <74535834+yonigozlan@users.noreply.github.com> Date: Tue, 29 Oct 2024 14:08:42 -0400 Subject: [PATCH 147/385] fix incorrect warning (#34416) --- src/transformers/processing_utils.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/transformers/processing_utils.py 
b/src/transformers/processing_utils.py index b363f8c72cc481..286ca49de85706 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -874,12 +874,13 @@ class MyProcessingKwargs(ProcessingKwargs, CommonKwargs, TextKwargs, ImagesKwarg else: # kwargs is a flat dictionary for key in kwargs: - if key not in ModelProcessorKwargs.__annotations__["common_kwargs"].__annotations__.keys(): - logger.warning_once( - f"Keyword argument `{key}` is not a valid argument for this processor and will be ignored." - ) - elif key not in used_keys: - output_kwargs["common_kwargs"][key] = kwargs[key] + if key not in used_keys: + if key in ModelProcessorKwargs.__annotations__["common_kwargs"].__annotations__.keys(): + output_kwargs["common_kwargs"][key] = kwargs[key] + else: + logger.warning_once( + f"Keyword argument `{key}` is not a valid argument for this processor and will be ignored." + ) # all modality-specific kwargs are updated with common kwargs for modality in output_kwargs: From 9bee9ff5db6e68fb31065898d7e924d07c1eb9c1 Mon Sep 17 00:00:00 2001 From: Matt Date: Tue, 29 Oct 2024 18:45:14 +0000 Subject: [PATCH 148/385] Un-deprecate timeout arg in pipelines (#34382) * Un-deprecate timeout * Put "timeout" on the allowed list * make fixup --- src/transformers/pipelines/depth_estimation.py | 7 +++---- src/transformers/pipelines/image_classification.py | 7 +++---- src/transformers/pipelines/image_segmentation.py | 7 +++---- src/transformers/pipelines/image_to_text.py | 8 ++++---- src/transformers/pipelines/object_detection.py | 7 +++---- .../pipelines/zero_shot_image_classification.py | 7 ++++--- tests/test_pipeline_mixin.py | 7 +++++++ 7 files changed, 27 insertions(+), 23 deletions(-) diff --git a/src/transformers/pipelines/depth_estimation.py b/src/transformers/pipelines/depth_estimation.py index ae86c552a720af..2203ac09c9cf9b 100644 --- a/src/transformers/pipelines/depth_estimation.py +++ b/src/transformers/pipelines/depth_estimation.py @@ -1,4 +1,3 @@ -import warnings from typing import List, Union from ..utils import ( @@ -72,6 +71,9 @@ def __call__(self, inputs: Union[str, List[str], "Image.Image", List["Image.Imag A dictionary of argument names to parameter values, to control pipeline behaviour. The only parameter available right now is `timeout`, which is the length of time, in seconds, that the pipeline should wait before giving up on trying to download an image. + timeout (`float`, *optional*, defaults to None): + The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and + the call may block forever. Return: A dictionary or a list of dictionaries containing result. 
If the input is a single image, will return a @@ -93,9 +95,6 @@ def __call__(self, inputs: Union[str, List[str], "Image.Image", List["Image.Imag def _sanitize_parameters(self, timeout=None, parameters=None, **kwargs): preprocess_params = {} if timeout is not None: - warnings.warn( - "The `timeout` argument is deprecated and will be removed in version 5 of Transformers", FutureWarning - ) preprocess_params["timeout"] = timeout if isinstance(parameters, dict) and "timeout" in parameters: preprocess_params["timeout"] = parameters["timeout"] diff --git a/src/transformers/pipelines/image_classification.py b/src/transformers/pipelines/image_classification.py index 20ad72e79055e2..0085e5eb73f826 100644 --- a/src/transformers/pipelines/image_classification.py +++ b/src/transformers/pipelines/image_classification.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import warnings from typing import List, Union import numpy as np @@ -113,9 +112,6 @@ def __init__(self, *args, **kwargs): def _sanitize_parameters(self, top_k=None, function_to_apply=None, timeout=None): preprocess_params = {} if timeout is not None: - warnings.warn( - "The `timeout` argument is deprecated and will be removed in version 5 of Transformers", FutureWarning - ) preprocess_params["timeout"] = timeout postprocess_params = {} if top_k is not None: @@ -159,6 +155,9 @@ def __call__(self, inputs: Union[str, List[str], "Image.Image", List["Image.Imag top_k (`int`, *optional*, defaults to 5): The number of top labels that will be returned by the pipeline. If the provided number is higher than the number of labels available in the model configuration, it will default to the number of labels. + timeout (`float`, *optional*, defaults to None): + The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and + the call may block forever. Return: A dictionary or a list of dictionaries containing result. If the input is a single image, will return a diff --git a/src/transformers/pipelines/image_segmentation.py b/src/transformers/pipelines/image_segmentation.py index 0ac653fd1e8725..d388e591bf9df4 100644 --- a/src/transformers/pipelines/image_segmentation.py +++ b/src/transformers/pipelines/image_segmentation.py @@ -1,4 +1,3 @@ -import warnings from typing import Any, Dict, List, Union import numpy as np @@ -91,9 +90,6 @@ def _sanitize_parameters(self, **kwargs): if "overlap_mask_area_threshold" in kwargs: postprocess_kwargs["overlap_mask_area_threshold"] = kwargs["overlap_mask_area_threshold"] if "timeout" in kwargs: - warnings.warn( - "The `timeout` argument is deprecated and will be removed in version 5 of Transformers", FutureWarning - ) preprocess_kwargs["timeout"] = kwargs["timeout"] return preprocess_kwargs, {}, postprocess_kwargs @@ -122,6 +118,9 @@ def __call__(self, inputs=None, **kwargs) -> Union[Predictions, List[Prediction] Threshold to use when turning the predicted masks into binary values. overlap_mask_area_threshold (`float`, *optional*, defaults to 0.5): Mask overlap threshold to eliminate small, disconnected segments. + timeout (`float`, *optional*, defaults to None): + The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and + the call may block forever. Return: A dictionary or a list of dictionaries containing the result. 
If the input is a single image, will return a diff --git a/src/transformers/pipelines/image_to_text.py b/src/transformers/pipelines/image_to_text.py index 4beaa481920054..0d37ce91dadc89 100644 --- a/src/transformers/pipelines/image_to_text.py +++ b/src/transformers/pipelines/image_to_text.py @@ -13,7 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import warnings from typing import List, Union from ..utils import ( @@ -81,9 +80,6 @@ def _sanitize_parameters(self, max_new_tokens=None, generate_kwargs=None, prompt if prompt is not None: preprocess_params["prompt"] = prompt if timeout is not None: - warnings.warn( - "The `timeout` argument is deprecated and will be removed in version 5 of Transformers", FutureWarning - ) preprocess_params["timeout"] = timeout if max_new_tokens is not None: @@ -118,6 +114,10 @@ def __call__(self, inputs: Union[str, List[str], "Image.Image", List["Image.Imag generate_kwargs (`Dict`, *optional*): Pass it to send all of these arguments directly to `generate` allowing full control of this function. + timeout (`float`, *optional*, defaults to None): + The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and + the call may block forever. + Return: A list or a list of list of `dict`: Each result comes as a dictionary with the following key: diff --git a/src/transformers/pipelines/object_detection.py b/src/transformers/pipelines/object_detection.py index c135b1e131acb9..c84f17b2bd6ad0 100644 --- a/src/transformers/pipelines/object_detection.py +++ b/src/transformers/pipelines/object_detection.py @@ -1,4 +1,3 @@ -import warnings from typing import Any, Dict, List, Union from ..utils import add_end_docstrings, is_torch_available, is_vision_available, logging, requires_backends @@ -64,9 +63,6 @@ def __init__(self, *args, **kwargs): def _sanitize_parameters(self, **kwargs): preprocess_params = {} if "timeout" in kwargs: - warnings.warn( - "The `timeout` argument is deprecated and will be removed in version 5 of Transformers", FutureWarning - ) preprocess_params["timeout"] = kwargs["timeout"] postprocess_kwargs = {} if "threshold" in kwargs: @@ -89,6 +85,9 @@ def __call__(self, *args, **kwargs) -> Union[Predictions, List[Prediction]]: same format: all as HTTP(S) links, all as local paths, or all as PIL images. threshold (`float`, *optional*, defaults to 0.5): The probability necessary to make a prediction. + timeout (`float`, *optional*, defaults to None): + The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and + the call may block forever. Return: A list of dictionaries or a list of list of dictionaries containing the result. If the input is a single diff --git a/src/transformers/pipelines/zero_shot_image_classification.py b/src/transformers/pipelines/zero_shot_image_classification.py index 253c684fcbbdad..c53b515dcccd9c 100644 --- a/src/transformers/pipelines/zero_shot_image_classification.py +++ b/src/transformers/pipelines/zero_shot_image_classification.py @@ -94,6 +94,10 @@ def __call__(self, image: Union[str, List[str], "Image", List["Image"]] = None, replacing the placeholder with the candidate_labels. Pass "{}" if *candidate_labels* are already formatted. + timeout (`float`, *optional*, defaults to None): + The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and + the call may block forever. + Return: A list of dictionaries containing one entry per proposed label. 
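To make the `timeout` knob concrete, a small usage sketch (checkpoint and image URL are placeholders chosen for illustration):

```python
from transformers import pipeline

classifier = pipeline("zero-shot-image-classification", model="openai/clip-vit-base-patch32")
out = classifier(
    "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png",
    candidate_labels=["two parrots", "a cat", "an airplane"],
    timeout=5.0,  # stop waiting for the image download after 5 seconds
)
print(out)
```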
Each dictionary contains the following keys: @@ -113,9 +117,6 @@ def _sanitize_parameters(self, tokenizer_kwargs=None, **kwargs): if "candidate_labels" in kwargs: preprocess_params["candidate_labels"] = kwargs["candidate_labels"] if "timeout" in kwargs: - warnings.warn( - "The `timeout` argument is deprecated and will be removed in version 5 of Transformers", FutureWarning - ) preprocess_params["timeout"] = kwargs["timeout"] if "hypothesis_template" in kwargs: preprocess_params["hypothesis_template"] = kwargs["hypothesis_template"] diff --git a/tests/test_pipeline_mixin.py b/tests/test_pipeline_mixin.py index fe8a197237291a..f079bcdd92e580 100644 --- a/tests/test_pipeline_mixin.py +++ b/tests/test_pipeline_mixin.py @@ -916,6 +916,8 @@ def parse_args_from_docstring_by_indentation(docstring): def compare_pipeline_args_to_hub_spec(pipeline_class, hub_spec): + ALLOWED_TRANSFORMERS_ONLY_ARGS = ["timeout"] + docstring = inspect.getdoc(pipeline_class.__call__).strip() docstring_args = set(parse_args_from_docstring_by_indentation(docstring)) hub_args = set(get_arg_names_from_hub_spec(hub_spec)) @@ -933,6 +935,11 @@ def compare_pipeline_args_to_hub_spec(pipeline_class, hub_spec): hub_args.remove(js_generate_args[0]) docstring_args.remove(docstring_generate_args[0]) + # Special casing 2: We permit some transformers-only arguments that don't affect pipeline output + for arg in ALLOWED_TRANSFORMERS_ONLY_ARGS: + if arg in docstring_args and arg not in hub_args: + docstring_args.remove(arg) + if hub_args != docstring_args: error = [f"{pipeline_class.__name__} differs from JS spec {hub_spec.__name__}"] matching_args = hub_args & docstring_args From cd277618d4dbcafff108739e46584fd0a5c8f872 Mon Sep 17 00:00:00 2001 From: Guang Yang <42389959+guangy10@users.noreply.github.com> Date: Wed, 30 Oct 2024 01:36:45 -0700 Subject: [PATCH 149/385] Roberta is ExecuTorch compatible (#34425) * Roberta is ExecuTorch compatible * [run_slow] roberta --------- Co-authored-by: Guang Yang --- tests/models/roberta/test_modeling_roberta.py | 43 ++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) diff --git a/tests/models/roberta/test_modeling_roberta.py b/tests/models/roberta/test_modeling_roberta.py index ca557937803cff..1c128513b17d13 100644 --- a/tests/models/roberta/test_modeling_roberta.py +++ b/tests/models/roberta/test_modeling_roberta.py @@ -16,7 +16,7 @@ import unittest -from transformers import RobertaConfig, is_torch_available +from transformers import AutoTokenizer, RobertaConfig, is_torch_available from transformers.testing_utils import TestCasePlus, require_torch, slow, torch_device from ...generation.test_utils import GenerationTesterMixin @@ -41,6 +41,7 @@ RobertaEmbeddings, create_position_ids_from_input_ids, ) + from transformers.pytorch_utils import is_torch_greater_or_equal_than_2_4 ROBERTA_TINY = "sshleifer/tiny-distilroberta-base" @@ -576,3 +577,43 @@ def test_inference_classification_head(self): # expected_tensor = roberta.predict("mnli", input_ids, return_logits=True).detach() self.assertTrue(torch.allclose(output, expected_tensor, atol=1e-4)) + + @slow + def test_export(self): + if not is_torch_greater_or_equal_than_2_4: + self.skipTest(reason="This test requires torch >= 2.4 to run.") + + roberta_model = "FacebookAI/roberta-base" + device = "cpu" + attn_implementation = "sdpa" + max_length = 512 + + tokenizer = AutoTokenizer.from_pretrained(roberta_model) + inputs = tokenizer( + "The goal of life is .", + return_tensors="pt", + padding="max_length", + max_length=max_length, + ) + + 
model = RobertaForMaskedLM.from_pretrained( + roberta_model, + device_map=device, + attn_implementation=attn_implementation, + use_cache=True, + ) + + logits = model(**inputs).logits + eager_predicted_mask = tokenizer.decode(logits[0, 6].topk(5).indices) + self.assertEqual(eager_predicted_mask.split(), ["happiness", "love", "peace", "freedom", "simplicity"]) + + exported_program = torch.export.export( + model, + args=(inputs["input_ids"],), + kwargs={"attention_mask": inputs["attention_mask"]}, + strict=True, + ) + + result = exported_program.module().forward(inputs["input_ids"], inputs["attention_mask"]) + exported_predicted_mask = tokenizer.decode(result.logits[0, 6].topk(5).indices) + self.assertEqual(eager_predicted_mask, exported_predicted_mask) From 25a9fc584acb09afecb08b6cfd74e705058bf2ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peth=C5=91=20Gergely?= Date: Wed, 30 Oct 2024 10:03:41 +0100 Subject: [PATCH 150/385] Fix format mistake in string repr of tokenizer objects (#34493) * fix repr string format for tokenizer objects The repr of tokenizer tokens looks confusing and just stupid, like this: `Tokenizer(...), added_tokens_decoder={1: ..., 2: ...}`. The dict that is the value of the added_tokens_decoder attribute is outside of the parentheses of the tokenizer object, whereas all other attributes are inside the parentheses like they should be. This commit fixes this bug. * cos: add newline before closing parenthesis of repr string --- src/transformers/tokenization_utils_base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index 4f3187d510fad1..89ab2dc9260819 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -1687,8 +1687,8 @@ def __repr__(self) -> str: f"{self.__class__.__name__}(name_or_path='{self.name_or_path}'," f" vocab_size={self.vocab_size}, model_max_length={self.model_max_length}, is_fast={self.is_fast}," f" padding_side='{self.padding_side}', truncation_side='{self.truncation_side}'," - f" special_tokens={self.special_tokens_map}, clean_up_tokenization_spaces={self.clean_up_tokenization_spaces}), " - " added_tokens_decoder={\n\t" + added_tokens_decoder_rep + "\n}" + f" special_tokens={self.special_tokens_map}, clean_up_tokenization_spaces={self.clean_up_tokenization_spaces}," + " added_tokens_decoder={\n\t" + added_tokens_decoder_rep + "\n}\n)" ) def __len__(self) -> int: From 0f764a5af77de9a8f74521ec2237dc79feb40a71 Mon Sep 17 00:00:00 2001 From: Raushan Turganbay Date: Wed, 30 Oct 2024 10:11:50 +0100 Subject: [PATCH 151/385] Mllama: update docs (#34334) * update docs * be more explicit * use avaialble methods --- docs/source/en/model_doc/mllama.md | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/docs/source/en/model_doc/mllama.md b/docs/source/en/model_doc/mllama.md index 9cb038ed2e3453..4a6080ea2ce03a 100644 --- a/docs/source/en/model_doc/mllama.md +++ b/docs/source/en/model_doc/mllama.md @@ -30,6 +30,25 @@ The Llama 3.2-Vision collection of multimodal large language models (LLMs) is a - The text passed to the processor should have the `"<|image|>"` tokens where the images should be inserted. - The processor has its own `apply_chat_template` method to convert chat messages to text that can then be passed as text to the processor. + + + +Mllama has an extra token used as a placeholder for image positions in the text. 
It means that input ids and an input embedding layer will have an extra token. But since the weights for input and output embeddings are not tied, the `lm_head` layer has one less token and will fail if you want to calculate loss on image tokens or apply some logit processors. In case you are training, make sure to mask out special `"<|image|>"` tokens in the `labels` as the model should not be trained on predicting them. + +Otherwise if you see CUDA-side index erros when generating, use the below code to expand the `lm_head` by one more token. + + +```python +old_embeddings = model.get_output_embeddings() + +num_tokens = model.vocab_size + 1 +resized_embeddings = model._get_resized_lm_head(old_embeddings, new_num_tokens=num_tokens, mean_resizing=True) +resized_embeddings.requires_grad_(old_embeddings.weight.requires_grad) +model.set_output_embeddings(resized_embeddings) +``` + + + ## Usage Example #### Instruct model From 913330ca9f80b0a308d7490a02274b01b51e6051 Mon Sep 17 00:00:00 2001 From: Raushan Turganbay Date: Wed, 30 Oct 2024 10:21:37 +0100 Subject: [PATCH 152/385] VLMs: fix number of image tokens (#34332) * fix * fix tests * add tests * style * style * fix qwen after rebase * fix video llava --- .../models/chameleon/modeling_chameleon.py | 2 +- .../models/llava/modeling_llava.py | 5 +-- .../modeling_llava_next_video.py | 1 + .../modular_llava_next_video.py | 1 + .../modeling_llava_onevision.py | 2 ++ .../models/qwen2_vl/modeling_qwen2_vl.py | 5 +-- .../video_llava/modeling_video_llava.py | 8 ++--- .../models/vipllava/modeling_vipllava.py | 4 +-- tests/models/llava/test_modeling_llava.py | 29 +++++++++++++++ .../llava_next/test_modeling_llava_next.py | 32 +++++++++++++++++ .../test_modeling_llava_next_video.py | 32 +++++++++++++++++ .../paligemma/test_modeling_paligemma.py | 30 ++++++++++++++++ .../models/qwen2_vl/test_modeling_qwen2_vl.py | 36 ++++++++++++++++++- .../video_llava/test_modeling_video_llava.py | 35 ++++++++++++++++-- .../models/vipllava/test_modeling_vipllava.py | 30 ++++++++++++++++ 15 files changed, 237 insertions(+), 15 deletions(-) diff --git a/src/transformers/models/chameleon/modeling_chameleon.py b/src/transformers/models/chameleon/modeling_chameleon.py index 797908277930cf..0661da8727996f 100644 --- a/src/transformers/models/chameleon/modeling_chameleon.py +++ b/src/transformers/models/chameleon/modeling_chameleon.py @@ -1288,7 +1288,7 @@ def forward( if pixel_values is not None: image_tokens = self.get_image_tokens(pixel_values) n_image_tokens_in_text = (input_ids == self.vocabulary_mapping.image_token_id).sum().item() - n_image_features = image_tokens.shape[0] + n_image_features = image_tokens.shape[0] * image_tokens.shape[1] if n_image_tokens_in_text != n_image_features: raise ValueError( f"Image features and image tokens do not match: tokens: {n_image_tokens_in_text}, features {n_image_features}" diff --git a/src/transformers/models/llava/modeling_llava.py b/src/transformers/models/llava/modeling_llava.py index a0079f1787a2e9..6d6bf4a6f38e3f 100644 --- a/src/transformers/models/llava/modeling_llava.py +++ b/src/transformers/models/llava/modeling_llava.py @@ -527,8 +527,9 @@ def forward( # TODO: @raushan retain only the new behavior after v4.47 elif image_features is not None: - n_image_tokens = (input_ids == self.config.image_token_index).sum(dim=-1)[0].item() - n_image_features = image_features.shape[1] + n_image_tokens = (input_ids == self.config.image_token_index).sum().item() + n_image_features = image_features.shape[0] * image_features.shape[1] + if 
n_image_tokens != n_image_features: raise ValueError( f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}" diff --git a/src/transformers/models/llava_next_video/modeling_llava_next_video.py b/src/transformers/models/llava_next_video/modeling_llava_next_video.py index 44b372535d70bd..c40ee1f70f900c 100644 --- a/src/transformers/models/llava_next_video/modeling_llava_next_video.py +++ b/src/transformers/models/llava_next_video/modeling_llava_next_video.py @@ -1020,6 +1020,7 @@ def forward( if image_features is not None: n_image_tokens = (input_ids == self.config.image_token_index).sum().item() n_image_features = image_features.shape[0] + if n_image_tokens != n_image_features: raise ValueError( f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}" diff --git a/src/transformers/models/llava_next_video/modular_llava_next_video.py b/src/transformers/models/llava_next_video/modular_llava_next_video.py index e9974e920493ff..1425a017dc0558 100644 --- a/src/transformers/models/llava_next_video/modular_llava_next_video.py +++ b/src/transformers/models/llava_next_video/modular_llava_next_video.py @@ -533,6 +533,7 @@ def forward( if image_features is not None: n_image_tokens = (input_ids == self.config.image_token_index).sum().item() n_image_features = image_features.shape[0] + if n_image_tokens != n_image_features: raise ValueError( f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}" diff --git a/src/transformers/models/llava_onevision/modeling_llava_onevision.py b/src/transformers/models/llava_onevision/modeling_llava_onevision.py index 946688bfcf07f4..f8bdb5bf8d5a7c 100644 --- a/src/transformers/models/llava_onevision/modeling_llava_onevision.py +++ b/src/transformers/models/llava_onevision/modeling_llava_onevision.py @@ -679,6 +679,7 @@ def forward( ) n_image_tokens = (input_ids == self.config.image_token_index).sum().item() n_image_features = image_features.shape[0] + if n_image_tokens != n_image_features: raise ValueError( f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}" @@ -704,6 +705,7 @@ def forward( ) video_features = torch.cat((video_features, image_newline), dim=1) video_features = video_features.flatten(0, 1) + n_video_tokens = (input_ids == self.config.video_token_index).sum().item() n_video_features = video_features.shape[0] if n_video_tokens != n_video_features: diff --git a/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py b/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py index 17e722a217dfd6..9c0d0b45ee8e51 100644 --- a/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py +++ b/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py @@ -1503,13 +1503,14 @@ def get_rope_index( mrope_position_deltas = [] if image_grid_thw is not None or video_grid_thw is not None: total_input_ids = input_ids + if attention_mask is None: + attention_mask = torch.ones_like(total_input_ids) position_ids = torch.ones( 3, input_ids.shape[0], input_ids.shape[1], dtype=input_ids.dtype, device=input_ids.device ) image_index, video_index = 0, 0 for i, input_ids in enumerate(total_input_ids): - if attention_mask is not None: - input_ids = input_ids[attention_mask[i] == 1] + input_ids = input_ids[attention_mask[i] == 1] image_nums, video_nums = 0, 0 vision_start_indices = torch.argwhere(input_ids == vision_start_token_id).squeeze(1) vision_tokens = input_ids[vision_start_indices + 1] diff --git 
a/src/transformers/models/video_llava/modeling_video_llava.py b/src/transformers/models/video_llava/modeling_video_llava.py index 30f82e45056c77..02efc7c344f7b8 100644 --- a/src/transformers/models/video_llava/modeling_video_llava.py +++ b/src/transformers/models/video_llava/modeling_video_llava.py @@ -628,8 +628,8 @@ def forward( # TODO: @raushan retain only the new behavior after v4.47 else: if pixel_values_images is not None: - n_image_tokens = (input_ids == self.config.image_token_index).sum(dim=-1)[0].item() - n_image_features = image_features.shape[1] + n_image_tokens = (input_ids == self.config.image_token_index).sum().item() + n_image_features = image_features.shape[0] * image_features.shape[1] if n_image_tokens != n_image_features: raise ValueError( f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}" @@ -644,8 +644,8 @@ def forward( inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features) if pixel_values_videos is not None: - n_video_tokens = (input_ids == self.config.video_token_index).sum(dim=-1)[0].item() - n_video_features = video_features.shape[1] + n_video_tokens = (input_ids == self.config.video_token_index).sum().item() + n_video_features = video_features.shape[0] * video_features.shape[1] if n_video_tokens != n_video_features: raise ValueError( f"Video features and video tokens do not match: tokens: {n_video_tokens}, features {n_video_features}" diff --git a/src/transformers/models/vipllava/modeling_vipllava.py b/src/transformers/models/vipllava/modeling_vipllava.py index c9db6e261c6a72..4060f8c8ecd1bf 100644 --- a/src/transformers/models/vipllava/modeling_vipllava.py +++ b/src/transformers/models/vipllava/modeling_vipllava.py @@ -517,8 +517,8 @@ def forward( # TODO: @raushan retain only the new behavior after v4.47 elif image_features is not None: - n_image_tokens = (input_ids == self.config.image_token_index).sum(dim=-1)[0].item() - n_image_features = image_features.shape[1] + n_image_tokens = (input_ids == self.config.image_token_index).sum().item() + n_image_features = image_features.shape[0] * image_features.shape[1] if n_image_tokens != n_image_features: raise ValueError( f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}" diff --git a/tests/models/llava/test_modeling_llava.py b/tests/models/llava/test_modeling_llava.py index 405fad1bd31c8d..1a17f18de34234 100644 --- a/tests/models/llava/test_modeling_llava.py +++ b/tests/models/llava/test_modeling_llava.py @@ -235,6 +235,35 @@ def test_inputs_embeds_matches_input_ids(self): out_embeds = model(inputs_embeds=inputs_embeds, **inputs)[0] self.assertTrue(torch.allclose(out_embeds, out_ids)) + def test_mismatching_num_image_tokens(self): + """ + Tests that VLMs through an error with explicit message saying what is wrong + when number of images don't match number of image tokens in the text. + Also we need to test multi-image cases when one prompr has multiple image tokens. + """ + config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() + for model_class in self.all_model_classes: + model = model_class(config).to(torch_device) + _ = model(**input_dict) # successfull forward with no modifications + + # remove one image but leave the image token in text + input_dict["pixel_values"] = input_dict["pixel_values"][-1:, ...] 
+ with self.assertRaises(ValueError): + _ = model(**input_dict) + + # simulate multi-image case by concatenating inputs where each has exactly one image/image-token + input_ids = input_dict["input_ids"][:1] + pixel_values = input_dict["pixel_values"][:1] + input_ids = torch.cat([input_ids, input_ids], dim=0) + + # one image and two image tokens raise an error + with self.assertRaises(ValueError): + _ = model(input_ids=input_ids, pixel_values=pixel_values) + + # two images and two image tokens don't raise an error + pixel_values = torch.cat([pixel_values, pixel_values], dim=0) + _ = model(input_ids=input_ids, pixel_values=pixel_values) + @unittest.skip( reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" ) diff --git a/tests/models/llava_next/test_modeling_llava_next.py b/tests/models/llava_next/test_modeling_llava_next.py index 6589bf14d24c65..e088b2505366f6 100644 --- a/tests/models/llava_next/test_modeling_llava_next.py +++ b/tests/models/llava_next/test_modeling_llava_next.py @@ -283,6 +283,38 @@ def test_inputs_embeds_matches_input_ids(self): out_embeds = model(inputs_embeds=inputs_embeds, **inputs)[0] self.assertTrue(torch.allclose(out_embeds, out_ids)) + def test_mismatching_num_image_tokens(self): + """ + Tests that VLMs through an error with explicit message saying what is wrong + when number of images don't match number of image tokens in the text. + Also we need to test multi-image cases when one prompr has multiple image tokens. + """ + config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() + for model_class in self.all_model_classes: + model = model_class(config).to(torch_device) + _ = model(**input_dict) # successfull forward with no modifications + + # remove one image but leave the image token in text + input_dict["pixel_values"] = input_dict["pixel_values"][-1:, ...] + input_dict["image_sizes"] = input_dict["image_sizes"][-1:, ...] 
+ with self.assertRaises(ValueError): + _ = model(**input_dict) + + # simulate multi-image case by concatenating inputs where each has exactly one image/image-token + input_ids = input_dict["input_ids"][:1] + pixel_values = input_dict["pixel_values"][:1] + image_sizes = input_dict["image_sizes"][:1] + input_ids = torch.cat([input_ids, input_ids], dim=0) + + # one image and two image tokens raise an error + with self.assertRaises(ValueError): + _ = model(input_ids=input_ids, pixel_values=pixel_values, image_sizes=image_sizes) + + # two images and two image tokens don't raise an error + pixel_values = torch.cat([pixel_values, pixel_values], dim=0) + image_sizes = torch.cat([image_sizes, image_sizes], dim=0) + _ = model(input_ids=input_ids, pixel_values=pixel_values, image_sizes=image_sizes) + @unittest.skip( reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" ) diff --git a/tests/models/llava_next_video/test_modeling_llava_next_video.py b/tests/models/llava_next_video/test_modeling_llava_next_video.py index 05fc8a49e1e9b9..edf1dd2d4c07a4 100644 --- a/tests/models/llava_next_video/test_modeling_llava_next_video.py +++ b/tests/models/llava_next_video/test_modeling_llava_next_video.py @@ -303,6 +303,38 @@ def test_inputs_embeds_matches_input_ids(self): out_embeds = model(inputs_embeds=inputs_embeds, **inputs)[0] self.assertTrue(torch.allclose(out_embeds, out_ids)) + def test_mismatching_num_image_tokens(self): + """ + Tests that VLMs through an error with explicit message saying what is wrong + when number of images don't match number of image tokens in the text. + Also we need to test multi-image cases when one prompr has multiple image tokens. + """ + config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() + for model_class in self.all_model_classes: + model = model_class(config).to(torch_device) + _ = model(**input_dict) # successfull forward with no modifications + + # remove one image but leave the image token in text + input_dict["pixel_values"] = input_dict["pixel_values"][-1:, ...] + input_dict["image_sizes"] = input_dict["image_sizes"][-1:, ...] 
+ with self.assertRaises(ValueError): + _ = model(**input_dict) + + # simulate multi-image case by concatenating inputs where each has exactly one image/image-token + input_ids = input_dict["input_ids"][:1] + pixel_values = input_dict["pixel_values"][:1] + image_sizes = input_dict["image_sizes"][:1] + input_ids = torch.cat([input_ids, input_ids], dim=0) + + # one image and two image tokens raise an error + with self.assertRaises(ValueError): + _ = model(input_ids=input_ids, pixel_values=pixel_values, image_sizes=image_sizes) + + # two images and two image tokens don't raise an error + pixel_values = torch.cat([pixel_values, pixel_values], dim=0) + image_sizes = torch.cat([image_sizes, image_sizes], dim=0) + _ = model(input_ids=input_ids, pixel_values=pixel_values, image_sizes=image_sizes) + @unittest.skip( reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" ) diff --git a/tests/models/paligemma/test_modeling_paligemma.py b/tests/models/paligemma/test_modeling_paligemma.py index cfc2a2c29b1d70..95ae59dfc08fca 100644 --- a/tests/models/paligemma/test_modeling_paligemma.py +++ b/tests/models/paligemma/test_modeling_paligemma.py @@ -236,6 +236,36 @@ def test_inputs_embeds_matches_input_ids(self): out_embeds = model(inputs_embeds=inputs_embeds, **inputs)[0] self.assertTrue(torch.allclose(out_embeds, out_ids)) + # Copied from tests.models.llava.test_modeling_llava.LlavaForConditionalGenerationModelTest.test_mismatching_num_image_tokens + def test_mismatching_num_image_tokens(self): + """ + Tests that VLMs through an error with explicit message saying what is wrong + when number of images don't match number of image tokens in the text. + Also we need to test multi-image cases when one prompr has multiple image tokens. + """ + config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() + for model_class in self.all_model_classes: + model = model_class(config).to(torch_device) + _ = model(**input_dict) # successfull forward with no modifications + + # remove one image but leave the image token in text + input_dict["pixel_values"] = input_dict["pixel_values"][-1:, ...] 
+ with self.assertRaises(ValueError): + _ = model(**input_dict) + + # simulate multi-image case by concatenating inputs where each has exactly one image/image-token + input_ids = input_dict["input_ids"][:1] + pixel_values = input_dict["pixel_values"][:1] + input_ids = torch.cat([input_ids, input_ids], dim=0) + + # one image and two image tokens raise an error + with self.assertRaises(ValueError): + _ = model(input_ids=input_ids, pixel_values=pixel_values) + + # two images and two image tokens don't raise an error + pixel_values = torch.cat([pixel_values, pixel_values], dim=0) + _ = model(input_ids=input_ids, pixel_values=pixel_values) + @unittest.skip( reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" ) diff --git a/tests/models/qwen2_vl/test_modeling_qwen2_vl.py b/tests/models/qwen2_vl/test_modeling_qwen2_vl.py index 956243dccebebf..e1cd715f8f1d34 100644 --- a/tests/models/qwen2_vl/test_modeling_qwen2_vl.py +++ b/tests/models/qwen2_vl/test_modeling_qwen2_vl.py @@ -58,7 +58,7 @@ class Qwen2VLVisionText2TextModelTester: def __init__( self, parent, - batch_size=2, + batch_size=3, seq_length=7, num_channels=3, ignore_index=-100, @@ -245,6 +245,40 @@ def test_initialization(self): msg=f"Parameter {name} of model {model_class} seems not properly initialized", ) + def test_mismatching_num_image_tokens(self): + """ + Tests that VLMs through an error with explicit message saying what is wrong + when number of images don't match number of image tokens in the text. + Also we need to test multi-image cases when one prompr has multiple image tokens. + """ + config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() + for model_class in self.all_model_classes: + model = model_class(config).to(torch_device) + _ = model(**input_dict) # successfull forward with no modifications + + # remove one image but leave the image token in text + patch_size = config.vision_config.patch_size + one_img_length = (self.model_tester.image_size**2) // (patch_size**2) + input_dict["pixel_values"] = input_dict["pixel_values"][-one_img_length:, ...] + input_dict["image_grid_thw"] = input_dict["image_grid_thw"][-1:, ...] 
+ with self.assertRaises(ValueError): + _ = model(**input_dict) + + # simulate multi-image case by concatenating inputs where each has exactly one image/image-token + input_ids = input_dict["input_ids"][:1] + pixel_values = input_dict["pixel_values"][:one_img_length] + image_grid_thw = input_dict["image_grid_thw"][:1] + input_ids = torch.cat([input_ids, input_ids], dim=0) + + # one image and two image tokens raise an error + with self.assertRaises(ValueError): + _ = model(input_ids=input_ids, pixel_values=pixel_values, image_grid_thw=image_grid_thw) + + # two images and two image tokens don't raise an error + pixel_values = torch.cat([pixel_values, pixel_values], dim=0) + image_grid_thw = torch.cat([image_grid_thw, image_grid_thw], dim=0) + _ = model(input_ids=input_ids, pixel_values=pixel_values, image_grid_thw=image_grid_thw) + @unittest.skip( reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" ) diff --git a/tests/models/video_llava/test_modeling_video_llava.py b/tests/models/video_llava/test_modeling_video_llava.py index fd4c49f4a6966d..e25ad1d44460c7 100644 --- a/tests/models/video_llava/test_modeling_video_llava.py +++ b/tests/models/video_llava/test_modeling_video_llava.py @@ -123,9 +123,9 @@ def __init__( self.batch_size = 5 self.num_channels = 3 self.image_size = 224 - self.encoder_seq_length = 64 + self.encoder_seq_length = 246 self.num_image_tokens = 25 - self.num_video_tokens = 26 + self.num_video_tokens = 26 * self.num_frames self.seq_length = seq_length + self.num_image_tokens + self.num_video_tokens def get_config(self): @@ -267,7 +267,7 @@ def test_mixed_input(self): # if we remove some images from inputs leaving only one # image number mismatch error should raise inputs["pixel_values_images"] = inputs["pixel_values_images"][:1] - with self.assertRaises(RuntimeError): + with self.assertRaises(ValueError): _ = model(**inputs) def test_video_only_input(self): @@ -401,6 +401,35 @@ def test_inputs_embeds_matches_input_ids(self): out_embeds = model(inputs_embeds=inputs_embeds, **inputs)[0] self.assertTrue(torch.allclose(out_embeds, out_ids)) + def test_mismatching_num_image_tokens(self): + """ + Tests that VLMs through an error with explicit message saying what is wrong + when number of images don't match number of image tokens in the text. + Also we need to test multi-image cases when one prompr has multiple image tokens. + """ + config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() + for model_class in self.all_model_classes: + model = model_class(config).to(torch_device) + _ = model(**input_dict) # successfull forward with no modifications + + # remove one image but leave the image token in text + input_dict["pixel_values_images"] = input_dict["pixel_values_images"][-1:, ...] 
+ with self.assertRaises(ValueError): + _ = model(**input_dict) + + # simulate multi-image case by concatenating inputs where each has exactly one image/image-token + input_ids = input_dict["input_ids"][:1] + pixel_values = input_dict["pixel_values_images"][:1] + input_ids = torch.cat([input_ids, input_ids], dim=0) + + # one image and two image tokens raise an error + with self.assertRaises(ValueError): + _ = model(input_ids=input_ids, pixel_values_images=pixel_values) + + # two images and two image tokens don't raise an error + pixel_values = torch.cat([pixel_values, pixel_values], dim=0) + _ = model(input_ids=input_ids, pixel_values_images=pixel_values) + @require_torch class VideoLlavaForConditionalGenerationIntegrationTest(unittest.TestCase): diff --git a/tests/models/vipllava/test_modeling_vipllava.py b/tests/models/vipllava/test_modeling_vipllava.py index 2c241c23f26158..a976e3cb51f54d 100644 --- a/tests/models/vipllava/test_modeling_vipllava.py +++ b/tests/models/vipllava/test_modeling_vipllava.py @@ -217,6 +217,36 @@ def test_inputs_embeds_matches_input_ids(self): out_embeds = model(inputs_embeds=inputs_embeds, **inputs)[0] self.assertTrue(torch.allclose(out_embeds, out_ids)) + # Copied from tests.models.llava.test_modeling_llava.LlavaForConditionalGenerationModelTest.test_mismatching_num_image_tokens + def test_mismatching_num_image_tokens(self): + """ + Tests that VLMs through an error with explicit message saying what is wrong + when number of images don't match number of image tokens in the text. + Also we need to test multi-image cases when one prompr has multiple image tokens. + """ + config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() + for model_class in self.all_model_classes: + model = model_class(config).to(torch_device) + _ = model(**input_dict) # successfull forward with no modifications + + # remove one image but leave the image token in text + input_dict["pixel_values"] = input_dict["pixel_values"][-1:, ...] + with self.assertRaises(ValueError): + _ = model(**input_dict) + + # simulate multi-image case by concatenating inputs where each has exactly one image/image-token + input_ids = input_dict["input_ids"][:1] + pixel_values = input_dict["pixel_values"][:1] + input_ids = torch.cat([input_ids, input_ids], dim=0) + + # one image and two image tokens raise an error + with self.assertRaises(ValueError): + _ = model(input_ids=input_ids, pixel_values=pixel_values) + + # two images and two image tokens don't raise an error + pixel_values = torch.cat([pixel_values, pixel_values], dim=0) + _ = model(input_ids=input_ids, pixel_values=pixel_values) + @unittest.skip( reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" ) From 8a734ea2c340beee23e665601919814918bf4c43 Mon Sep 17 00:00:00 2001 From: Joao Gante Date: Wed, 30 Oct 2024 10:59:08 +0000 Subject: [PATCH 153/385] Tests: move `generate` tests to the right mixin and delete redundant tests (#34464) * tmp commit * tmp commit * cull overwrites of deleted tests * typo * more specific docstring * make fixup * parameterize at the top? 
* correction * more deletions :D * tmp commit * for VLMs too * fix _check_outputs * test nit * make fixup * fix another flaky * test_generate_from_inputs_embeds -- handle missing attention mask --- src/transformers/generation/utils.py | 33 +- .../modeling_llava_next_video.py | 3 +- .../modular_llava_next_video.py | 3 +- .../modeling_llava_onevision.py | 3 +- .../models/musicgen/modeling_musicgen.py | 4 +- .../modeling_musicgen_melody.py | 4 +- .../video_llava/modeling_video_llava.py | 3 +- tests/generation/test_utils.py | 377 +++++++++------- tests/models/bart/test_modeling_bart.py | 5 - tests/models/bert/test_modeling_bert.py | 5 - .../chameleon/test_modeling_chameleon.py | 40 -- tests/models/gemma/test_modeling_gemma.py | 48 -- tests/models/gemma2/test_modeling_gemma2.py | 1 - tests/models/glm/test_modeling_glm.py | 40 -- tests/models/gptj/test_modeling_gptj.py | 45 +- tests/models/granite/test_modeling_granite.py | 47 +- .../granitemoe/test_modeling_granitemoe.py | 45 -- tests/models/idefics/test_modeling_idefics.py | 7 - .../models/idefics2/test_modeling_idefics2.py | 45 -- .../models/idefics3/test_modeling_idefics3.py | 78 ---- tests/models/jamba/test_modeling_jamba.py | 87 ---- tests/models/jetmoe/test_modeling_jetmoe.py | 80 ---- tests/models/kosmos2/test_modeling_kosmos2.py | 6 - tests/models/llama/test_modeling_llama.py | 41 -- tests/models/mamba2/test_modeling_mamba2.py | 10 +- tests/models/mimi/test_modeling_mimi.py | 17 - tests/models/mistral/test_modeling_mistral.py | 80 ---- tests/models/mixtral/test_modeling_mixtral.py | 80 ---- tests/models/mllama/test_modeling_mllama.py | 1 - tests/models/moshi/test_modeling_moshi.py | 61 +-- tests/models/mt5/test_modeling_mt5.py | 3 - .../models/musicgen/test_modeling_musicgen.py | 281 ------------ .../test_modeling_musicgen_melody.py | 143 ------ .../models/nemotron/test_modeling_nemotron.py | 2 - .../paligemma/test_modeling_paligemma.py | 4 - tests/models/phi/test_modeling_phi.py | 41 -- tests/models/qwen2/test_modeling_qwen2.py | 80 ---- .../qwen2_moe/test_modeling_qwen2_moe.py | 80 ---- .../models/qwen2_vl/test_modeling_qwen2_vl.py | 4 - .../test_modeling_recurrent_gemma.py | 4 - .../starcoder2/test_modeling_starcoder2.py | 80 ---- tests/models/t5/test_modeling_t5.py | 3 - tests/models/umt5/test_modeling_umt5.py | 3 - tests/models/whisper/test_modeling_whisper.py | 70 --- tests/models/zamba/test_modeling_zamba.py | 87 ---- tests/test_modeling_common.py | 425 ------------------ 46 files changed, 263 insertions(+), 2346 deletions(-) diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py index efe953db051cb3..6e6d5b8bdce71d 100644 --- a/src/transformers/generation/utils.py +++ b/src/transformers/generation/utils.py @@ -378,10 +378,14 @@ def prepare_inputs_for_generation( # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens # Exception 1: when passing input_embeds, input_ids may be missing entries # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here - # Exception 3: with synced GPUs cache_position may go out of bounds, but we only want dummy token in that case + # Exception 3: with synced GPUs cache_position may go out of bounds, but we only want dummy token in that case. 
+ # (we can't check exception 3 while compiling) if past_key_values is not None: model_inputs["past_key_values"] = past_key_values - if inputs_embeds is not None or cache_position[-1] >= input_ids.shape[1]: # Exception 1 or Exception 3 + if ( + inputs_embeds is not None # Exception 1 + or (is_torchdynamo_compiling() or cache_position[-1] >= input_ids.shape[1]) # Exception 3 + ): input_ids = input_ids[:, -cache_position.shape[0] :] elif input_ids.shape[1] != cache_position.shape[0]: # Default case (the "else", a no op, is Exception 2) input_ids = input_ids[:, cache_position] @@ -414,7 +418,7 @@ def prepare_inputs_for_generation( for model_input_name in ["position_ids", "token_type_ids"]: model_input = kwargs.get(model_input_name) if model_input is not None: - if past_key_values: + if past_key_values is not None: model_input = model_input[:, -input_ids.shape[1] :] model_input = model_input.clone(memory_format=torch.contiguous_format) model_inputs[model_input_name] = model_input @@ -568,27 +572,34 @@ def _maybe_initialize_input_ids_for_generation( def _prepare_attention_mask_for_generation( self, - inputs: torch.Tensor, - pad_token_id: Optional[torch.Tensor], - eos_token_id: Optional[torch.Tensor], + inputs_tensor: torch.Tensor, + generation_config: GenerationConfig, + model_kwargs: Dict[str, Any], ) -> torch.LongTensor: + pad_token_id = generation_config._pad_token_tensor + eos_token_id = generation_config._eos_token_tensor + + # `input_ids` may be present in the model kwargs, instead of being the main input (e.g. multimodal model) + if "input_ids" in model_kwargs and model_kwargs["input_ids"].shape[1] > 0: + inputs_tensor = model_kwargs["input_ids"] + # No information for attention mask inference -> return default attention mask - default_attention_mask = torch.ones(inputs.shape[:2], dtype=torch.long, device=inputs.device) + default_attention_mask = torch.ones(inputs_tensor.shape[:2], dtype=torch.long, device=inputs_tensor.device) if pad_token_id is None: return default_attention_mask - is_input_ids = len(inputs.shape) == 2 and inputs.dtype in [torch.int, torch.long] + is_input_ids = len(inputs_tensor.shape) == 2 and inputs_tensor.dtype in [torch.int, torch.long] if not is_input_ids: return default_attention_mask is_pad_token_in_inputs = (pad_token_id is not None) and ( - isin_mps_friendly(elements=inputs, test_elements=pad_token_id).any() + isin_mps_friendly(elements=inputs_tensor, test_elements=pad_token_id).any() ) is_pad_token_not_equal_to_eos_token_id = (eos_token_id is None) or ~( isin_mps_friendly(elements=eos_token_id, test_elements=pad_token_id).any() ) can_infer_attention_mask = is_pad_token_in_inputs * is_pad_token_not_equal_to_eos_token_id - attention_mask_from_padding = inputs.ne(pad_token_id).long() + attention_mask_from_padding = inputs_tensor.ne(pad_token_id).long() attention_mask = ( attention_mask_from_padding * can_infer_attention_mask + default_attention_mask * ~can_infer_attention_mask @@ -2020,7 +2031,7 @@ def generate( if not kwargs_has_attention_mask and requires_attention_mask and accepts_attention_mask: model_kwargs["attention_mask"] = self._prepare_attention_mask_for_generation( - inputs_tensor, generation_config._pad_token_tensor, generation_config._eos_token_tensor + inputs_tensor, generation_config, model_kwargs ) elif kwargs_has_attention_mask: # TODO (joao): generalize this check with other types of inputs diff --git a/src/transformers/models/llava_next_video/modeling_llava_next_video.py b/src/transformers/models/llava_next_video/modeling_llava_next_video.py 
index c40ee1f70f900c..85c109919da736 100644 --- a/src/transformers/models/llava_next_video/modeling_llava_next_video.py +++ b/src/transformers/models/llava_next_video/modeling_llava_next_video.py @@ -911,7 +911,8 @@ def forward( if (pixel_values is not None or pixel_values_videos is not None) and inputs_embeds is not None: raise ValueError( - "You cannot specify both pixel_values and inputs_embeds at the same time, and must specify either one" + "You cannot specify both `pixel_values`/`pixel_values_videos` and `inputs_embeds` at the same time, " + "and must specify either one" ) legacy_processing = False diff --git a/src/transformers/models/llava_next_video/modular_llava_next_video.py b/src/transformers/models/llava_next_video/modular_llava_next_video.py index 1425a017dc0558..2025140bb6e36a 100644 --- a/src/transformers/models/llava_next_video/modular_llava_next_video.py +++ b/src/transformers/models/llava_next_video/modular_llava_next_video.py @@ -424,7 +424,8 @@ def forward( if (pixel_values is not None or pixel_values_videos is not None) and inputs_embeds is not None: raise ValueError( - "You cannot specify both pixel_values and inputs_embeds at the same time, and must specify either one" + "You cannot specify both `pixel_values`/`pixel_values_videos` and `inputs_embeds` at the same time, " + "and must specify either one" ) legacy_processing = False diff --git a/src/transformers/models/llava_onevision/modeling_llava_onevision.py b/src/transformers/models/llava_onevision/modeling_llava_onevision.py index f8bdb5bf8d5a7c..2aa6b2fa1d6fa5 100644 --- a/src/transformers/models/llava_onevision/modeling_llava_onevision.py +++ b/src/transformers/models/llava_onevision/modeling_llava_onevision.py @@ -657,7 +657,8 @@ def forward( if (pixel_values is not None or pixel_values_videos is not None) and inputs_embeds is not None: raise ValueError( - "You cannot specify both pixel_values/pixel_values_videos and inputs_embeds at the same time, and must specify either one" + "You cannot specify both `pixel_values`/`pixel_values_videos` and `inputs_embeds` at the same time, " + "and must specify either one" ) if inputs_embeds is None: diff --git a/src/transformers/models/musicgen/modeling_musicgen.py b/src/transformers/models/musicgen/modeling_musicgen.py index c18e1d1c9d86b1..109ddfb626d26b 100644 --- a/src/transformers/models/musicgen/modeling_musicgen.py +++ b/src/transformers/models/musicgen/modeling_musicgen.py @@ -1562,7 +1562,7 @@ def generate( if model_kwargs.get("attention_mask", None) is None and requires_attention_mask: model_kwargs["attention_mask"] = self._prepare_attention_mask_for_generation( - input_ids, generation_config._pad_token_tensor, generation_config._eos_token_tensor + input_ids, generation_config, model_kwargs ) # 5. Prepare `max_length` depending on other stopping criteria. 
@@ -2578,7 +2578,7 @@ def generate( if model_kwargs.get("attention_mask", None) is None and requires_attention_mask: model_kwargs["attention_mask"] = self._prepare_attention_mask_for_generation( - inputs_tensor, generation_config._pad_token_tensor, generation_config._eos_token_tensor + inputs_tensor, generation_config, model_kwargs ) if "encoder_outputs" not in model_kwargs: diff --git a/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py b/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py index d2f339afc41451..61f2ce414e1ddf 100644 --- a/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py +++ b/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py @@ -1484,7 +1484,7 @@ def generate( if model_kwargs.get("attention_mask", None) is None and requires_attention_mask: model_kwargs["attention_mask"] = self._prepare_attention_mask_for_generation( - input_ids, generation_config._pad_token_tensor, generation_config._eos_token_tensor + input_ids, generation_config, model_kwargs ) # 5. Prepare `max_length` depending on other stopping criteria. @@ -2425,7 +2425,7 @@ def generate( if model_kwargs.get("attention_mask", None) is None and requires_attention_mask: model_kwargs["attention_mask"] = self._prepare_attention_mask_for_generation( - inputs_tensor, generation_config._pad_token_tensor, generation_config._eos_token_tensor + inputs_tensor, generation_config, model_kwargs ) if "encoder_hidden_states" not in model_kwargs: diff --git a/src/transformers/models/video_llava/modeling_video_llava.py b/src/transformers/models/video_llava/modeling_video_llava.py index 02efc7c344f7b8..a3b3de33fa66ee 100644 --- a/src/transformers/models/video_llava/modeling_video_llava.py +++ b/src/transformers/models/video_llava/modeling_video_llava.py @@ -534,7 +534,8 @@ def forward( if (pixel_values_images is not None or pixel_values_videos is not None) and inputs_embeds is not None: raise ValueError( - "You cannot specify both pixel_values and inputs_embeds at the same time, and must specify either one" + "You cannot specify both `pixel_values_images`/`pixel_values_videos` and `inputs_embeds` at the same " + "time, and must specify either one" ) legacy_processing = False diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py index d552bf73442ce7..545b696d67370a 100644 --- a/tests/generation/test_utils.py +++ b/tests/generation/test_utils.py @@ -29,6 +29,7 @@ from transformers.testing_utils import ( is_flaky, require_accelerate, + require_flash_attn, require_optimum_quanto, require_torch, require_torch_gpu, @@ -136,6 +137,34 @@ def prepare_config_and_inputs_for_generate(self, batch_size=2): return config, filtered_inputs_dict + def _check_similar_generate_outputs(self, output_1, output_2, atol=1e-5, rtol=1e-5): + """ + Checks whether a pair of generate outputs are similar. Two `generate` call outputs are considered similar in + the following siturations: + 1. The sequences are the same + 2. 
The sequences are different, but the scores up to (and including) the first mismatch are nearly identical + """ + # scores doesn't include data regarding decoder input tokens + decoder_input_length = output_1.sequences.shape[1] - len(output_1.scores) + output_matches = output_1.sequences == output_2.sequences + has_matching_outputs = output_matches.all() + has_matching_scores = None + if not has_matching_outputs: + for batch_idx in range(output_1.sequences.shape[0]): + batch_matches = output_matches[batch_idx] + if batch_matches.all(): + continue + first_mismatch_idx = batch_matches.int().argmin() # gets the index of the first False + first_mismatch_idx -= decoder_input_length + output_1_first_mismatch_scores = output_1.scores[first_mismatch_idx][batch_idx] + output_2_first_mismatch_scores = output_2.scores[first_mismatch_idx][batch_idx] + has_matching_scores = torch.allclose( + output_1_first_mismatch_scores, output_2_first_mismatch_scores, rtol=atol, atol=rtol + ) + if not has_matching_scores: + break + self.assertTrue(has_matching_outputs or has_matching_scores) + def _get_logits_processor_kwargs(self, do_sample=False, config=None): logits_processor_kwargs = { "bad_words_ids": [[1, 0]], @@ -426,7 +455,6 @@ def test_greedy_generate(self): def test_greedy_generate_dict_outputs(self): for model_class in self.all_generative_model_classes: config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[model_class.main_input_name] model = model_class(config).to(torch_device).eval() output_generate = self._greedy_generate( @@ -453,13 +481,12 @@ def test_greedy_generate_dict_outputs(self): # Retrocompatibility check self.assertIsInstance(output_generate, GreedySearchDecoderOnlyOutput) - self._check_outputs(output_generate, main_input, model.config) + self._check_outputs(output_generate, model.config) @pytest.mark.generate def test_greedy_generate_dict_outputs_use_cache(self): for model_class in self.all_generative_model_classes: config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[model_class.main_input_name] if not hasattr(config, "use_cache"): self.skipTest(reason=f"{model_class.__name__} doesn't support caching") @@ -486,7 +513,7 @@ def test_greedy_generate_dict_outputs_use_cache(self): output_generate.sequences.shape[-1] == self.max_new_tokens + inputs_dict["input_ids"].shape[-1] ) - self._check_outputs(output_generate, main_input, model.config, use_cache=True) + self._check_outputs(output_generate, model.config, use_cache=True) @pytest.mark.generate def test_sample_generate(self): @@ -505,7 +532,6 @@ def test_sample_generate(self): def test_sample_generate_dict_output(self): for model_class in self.all_generative_model_classes: config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[model_class.main_input_name] model = model_class(config).to(torch_device).eval() output_generate = self._sample_generate( @@ -533,7 +559,7 @@ def test_sample_generate_dict_output(self): # Retrocompatibility check self.assertIsInstance(output_generate, SampleDecoderOnlyOutput) - self._check_outputs(output_generate, main_input, model.config, num_return_sequences=2) + self._check_outputs(output_generate, model.config, num_return_sequences=2) @pytest.mark.generate def test_beam_search_generate(self): @@ -554,7 +580,6 @@ def test_beam_search_generate(self): def test_beam_search_generate_dict_output(self): for model_class in self.all_generative_model_classes: config, inputs_dict = 
self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[model_class.main_input_name] model = model_class(config).to(torch_device).eval() beam_kwargs = self._get_beam_kwargs() @@ -583,14 +608,16 @@ def test_beam_search_generate_dict_output(self): self.assertIsInstance(output_generate, BeamSearchDecoderOnlyOutput) self._check_outputs( - output_generate, main_input, model.config, num_return_sequences=beam_kwargs["num_beams"] + output_generate, + model.config, + num_return_sequences=beam_kwargs["num_return_sequences"], + num_beams=beam_kwargs["num_beams"], ) @pytest.mark.generate def test_beam_search_generate_dict_outputs_use_cache(self): for model_class in self.all_generative_model_classes: config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[model_class.main_input_name] if not hasattr(config, "use_cache"): self.skipTest(reason=f"{model_class.__name__} doesn't support caching") @@ -623,10 +650,10 @@ def test_beam_search_generate_dict_outputs_use_cache(self): self._check_outputs( output_generate, - main_input, model.config, use_cache=True, - num_return_sequences=beam_kwargs["num_beams"], + num_return_sequences=beam_kwargs["num_return_sequences"], + num_beams=beam_kwargs["num_beams"], ) @require_accelerate @@ -675,7 +702,6 @@ def test_beam_sample_generate(self): def test_beam_sample_generate_dict_output(self): for model_class in self.all_generative_model_classes: config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[model_class.main_input_name] model = model_class(config).to(torch_device).eval() beam_kwargs = self._get_beam_kwargs() @@ -706,7 +732,10 @@ def test_beam_sample_generate_dict_output(self): self.assertIsInstance(output_generate, BeamSampleDecoderOnlyOutput) self._check_outputs( - output_generate, main_input, model.config, num_return_sequences=beam_kwargs["num_beams"] + output_generate, + model.config, + num_return_sequences=beam_kwargs["num_return_sequences"], + num_beams=beam_kwargs["num_beams"], ) @pytest.mark.generate @@ -765,7 +794,6 @@ def test_group_beam_search_generate(self): def test_group_beam_search_generate_dict_output(self): for model_class in self.all_generative_model_classes: config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[model_class.main_input_name] model = model_class(config).to(torch_device).eval() beam_kwargs = self._get_diverse_beam_kwargs() @@ -794,7 +822,10 @@ def test_group_beam_search_generate_dict_output(self): self.assertIsInstance(output_generate, BeamSearchDecoderOnlyOutput) self._check_outputs( - output_generate, main_input, model.config, num_return_sequences=beam_kwargs["num_beams"] + output_generate, + model.config, + num_return_sequences=beam_kwargs["num_return_sequences"], + num_beams=beam_kwargs["num_beams"], ) # TODO: @gante check why it is flaky @@ -859,7 +890,6 @@ def test_constrained_beam_search_generate(self): def test_constrained_beam_search_generate_dict_output(self): for model_class in self.all_generative_model_classes: config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[model_class.main_input_name] model = model_class(config).to(torch_device).eval() @@ -899,7 +929,10 @@ def test_constrained_beam_search_generate_dict_output(self): self.assertIsInstance(output_generate, BeamSearchDecoderOnlyOutput) self._check_outputs( - output_generate, main_input, model.config, num_return_sequences=beam_kwargs["num_beams"] + output_generate, + model.config, + 
num_return_sequences=beam_kwargs["num_return_sequences"], + num_beams=beam_kwargs["num_beams"], ) @pytest.mark.generate @@ -942,7 +975,6 @@ def test_contrastive_generate_dict_outputs_use_cache(self): self.skipTest(reason="Won't fix: old model with different cache format") config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[model_class.main_input_name] # NOTE: contrastive search only works with cache on at the moment. if not hasattr(config, "use_cache"): @@ -968,7 +1000,7 @@ def test_contrastive_generate_dict_outputs_use_cache(self): output_generate.sequences.shape[-1] == self.max_new_tokens + inputs_dict["input_ids"].shape[-1] ) - self._check_outputs(output_generate, main_input, model.config, use_cache=True) + self._check_outputs(output_generate, model.config, use_cache=True) @pytest.mark.generate def test_contrastive_generate_low_memory(self): @@ -1064,14 +1096,10 @@ def test_beam_search_low_memory(self): @pytest.mark.generate @parameterized.expand([("random",), ("same",)]) - @is_flaky() # Read NOTE (1) below. If there are API issues, all attempts will fail. def test_assisted_decoding_matches_greedy_search(self, assistant_type): # This test ensures that the assisted generation does not introduce output changes over greedy search. - # NOTE (1): The sentence above is true most of the time, there is a tiny difference in the logits due to matmul - # shape differences -- and it may result in a different output. The input shape difference happens in the - # main model, that runs the forward pass with several candidates at once (as opposed to generating one token at - # a time). See https://github.com/huggingface/transformers/issues/25420#issuecomment-1775317535 for more info. - # NOTE (2): It breaks the pattern in the tests above, for multiple reasons: + # See https://github.com/huggingface/transformers/issues/25420#issuecomment-1775317535 for more info. + # NOTE: It breaks the pattern in the tests above, for multiple reasons: # - assisted_decoding, contrarily to the other methods, can't be called on its own (e.g. needs to # prepare the assistant encoder outputs in the main generate body); # - assisted_decoding does not support `use_cache = False` @@ -1100,7 +1128,6 @@ def test_assisted_decoding_matches_greedy_search(self, assistant_type): # enable cache config, inputs_dict = self.prepare_config_and_inputs_for_generate(batch_size=1) - main_input = inputs_dict[model_class.main_input_name] # NOTE: assisted generation only works with cache on at the moment. if not hasattr(config, "use_cache"): @@ -1141,12 +1168,10 @@ def test_assisted_decoding_matches_greedy_search(self, assistant_type): output_assisted = model.generate(**generation_kwargs, **inputs_dict) # The two outputs must match and their shape must be as expected - - self.assertListEqual(output_greedy.sequences.tolist(), output_assisted.sequences.tolist()) + self._check_similar_generate_outputs(output_greedy, output_assisted) for output in (output_greedy, output_assisted): - self._check_outputs(output, main_input, model.config, use_cache=True) + self._check_outputs(output, model.config, use_cache=True) - @is_flaky() @pytest.mark.generate def test_prompt_lookup_decoding_matches_greedy_search(self): # This test ensures that the prompt lookup generation does not introduce output changes over greedy search. 
@@ -1175,7 +1200,6 @@ def test_prompt_lookup_decoding_matches_greedy_search(self): # enable cache config, inputs_dict = self.prepare_config_and_inputs_for_generate(batch_size=1) - main_input = inputs_dict[model_class.main_input_name] # NOTE: assisted generation only works with cache on at the moment. if not hasattr(config, "use_cache"): @@ -1208,10 +1232,9 @@ def test_prompt_lookup_decoding_matches_greedy_search(self): output_prompt_lookup = model.generate(**generation_kwargs, **inputs_dict) # The two outputs must match and their shape must be as expected - - self.assertListEqual(output_greedy.sequences.tolist(), output_prompt_lookup.sequences.tolist()) + self._check_similar_generate_outputs(output_greedy, output_prompt_lookup) for output in (output_greedy, output_prompt_lookup): - self._check_outputs(output, main_input, model.config, use_cache=True) + self._check_outputs(output, model.config, use_cache=True) @pytest.mark.generate def test_dola_decoding_sample(self): @@ -1231,7 +1254,6 @@ def test_dola_decoding_sample(self): # enable cache if the model is not openai-gpt, xlnet, cpm, or xlm config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[model_class.main_input_name] # Encoder-decoder models are not supported if config.is_encoder_decoder: @@ -1259,7 +1281,7 @@ def test_dola_decoding_sample(self): "dola_layers": "low", } output_dola = model.generate(**generation_kwargs, **inputs_dict) - self._check_outputs(output_dola, main_input, model.config, use_cache=getattr(config, "use_cache", False)) + self._check_outputs(output_dola, model.config, use_cache=getattr(config, "use_cache", False)) @pytest.mark.generate def test_assisted_decoding_sample(self): @@ -1289,7 +1311,6 @@ def test_assisted_decoding_sample(self): # enable cache config, inputs_dict = self.prepare_config_and_inputs_for_generate(batch_size=1) - main_input = inputs_dict[model_class.main_input_name] # NOTE: assisted generation only works with cache on at the moment. 
if not hasattr(config, "use_cache"): @@ -1321,7 +1342,7 @@ def test_assisted_decoding_sample(self): } output_assisted = model.generate(**generation_kwargs, **inputs_dict) - self._check_outputs(output_assisted, main_input, config, use_cache=True) + self._check_outputs(output_assisted, config, use_cache=True) @pytest.mark.generate def test_prompt_lookup_decoding_stops_at_eos(self): @@ -1547,75 +1568,93 @@ def test_past_key_values_format(self): ) @pytest.mark.generate - @parameterized.expand([(1,), (2,)]) - def test_generate_from_inputs_embeds_decoder_only(self, num_beams): + @parameterized.expand([("greedy", 1), ("beam search", 2)]) + def test_generate_from_inputs_embeds(self, _, num_beams): + """Tests that we can generate from `inputs_embeds` instead of `input_ids` in LLMs, VLMs, etc""" # When supported, tests that the decoder model can generate from `inputs_embeds` instead of `input_ids` # if fails, you should probably update the `prepare_inputs_for_generation` function for model_class in self.all_generative_model_classes: config, inputs_dict = self.prepare_config_and_inputs_for_generate() - # Ignore: - # a) eos (to always output 20 tokens) and pad (so we don't try to infer the attn mask from the input_ids, - # which would cause a mismatch), - config.pad_token_id = config.eos_token_id = -1 - # b) embedding scaling, the scaling factor applied after embeding from input_ids (requires knowledge of the - # variable that holds the scaling factor, which is model-dependent) - if hasattr(config, "scale_embedding"): - config.scale_embedding = False - # This test is for decoder-only models (encoder-decoder models have native input embeddings support in the # decoder) if config.is_encoder_decoder: continue + config.is_decoder = True # Skip models without explicit support - config.is_decoder = True model = model_class(config).to(torch_device).eval() if "inputs_embeds" not in inspect.signature(model.prepare_inputs_for_generation).parameters.keys(): continue + # There are a few exception patterns in this test: + # 1 - Some models can't generate without `input_ids`, when `inputs_embeds` are passed + requires_inputs_ids = any( + model_name in model_class.__name__.lower() for model_name in ["idefics", "qwen2vl"] + ) + # 2 - Complex `inputs_embeds` computation, i.e. the correct computation of inputs embeds is more complex + # than calling the embedding layer with `input_ids`. Subcases of this exception: + # 2.A - Ignore `scale_embedding`, if the model supports it (it is controlled by a model-dependent flag) + if hasattr(config, "scale_embedding"): + config.scale_embedding = False + # 2.B - Some VLMs assume `inputs_embeds` and `pixel_values` are mutually exclusive AND fall in the + # exception above (complex `inputs_embeds` computation). Popping `pixel_values` allow us to run the + # checks without adding test complexity. Ditto for `pixel_values_videos` and `pixel_values_images` + pixel_values_is_mutually_exclusive = any( + model_name in model_class.__name__.lower() + for model_name in ["llava", "idefics2", "idefics3", "mllama", "paligemma"] + ) + if pixel_values_is_mutually_exclusive: + inputs_dict.pop("pixel_values", None) + inputs_dict.pop("pixel_values_videos", None) + inputs_dict.pop("pixel_values_images", None) + # 2.C - No easy fix, let's skip the check that compares the outputs from `input_ids` and `inputs_embeds` + has_complex_embeds_computation = any( + model_name in model_class.__name__.lower() for model_name in ["moshi"] + ) + # 3 - `inputs_dict` doesn't contain `attention_mask`. 
When `attention_mask` is not passed to generate, + # we infer it from `input_ids`. The last test case will fail if there is a pad token in the original input. + missing_attention_mask = "attention_mask" not in inputs_dict + + # Traditional way of generating text input_ids = inputs_dict.pop("input_ids") generation_kwargs = { "return_dict_in_generate": True, "output_scores": True, "num_beams": num_beams, "do_sample": False, + "max_new_tokens": 5, + "min_new_tokens": 5, # generate exactly 5 tokens } - - # Traditional way of generating text - outputs_from_ids = model.generate(input_ids, max_new_tokens=5, **generation_kwargs) + outputs_from_ids = model.generate(input_ids, **generation_kwargs, **inputs_dict) self.assertEqual(outputs_from_ids.sequences.shape, (input_ids.shape[0], input_ids.shape[1] + 5)) - # Same thing, but from input embeddings (`input_ids` is passed so the prompt is present in the output) + # Same thing, but from input embeddings (`input_ids` is passed so the prompt is present in the output). + # The output of the two calls should be the same. inputs_embeds = model.get_input_embeddings()(input_ids) outputs_from_embeds = model.generate( - input_ids, - inputs_embeds=inputs_embeds, - max_new_tokens=5, - **generation_kwargs, + input_ids, inputs_embeds=inputs_embeds, **generation_kwargs, **inputs_dict ) - self.assertListEqual(outputs_from_ids.sequences.tolist(), outputs_from_embeds.sequences.tolist()) + if not has_complex_embeds_computation: + self._check_similar_generate_outputs(outputs_from_ids, outputs_from_embeds) - # But if we pass different inputs_embeds, we should get different outputs (the output text may be the + # If we pass different inputs_embeds, we should get different outputs (the output text may be the # same, but the logits will almost surely be different) random_embeds = torch.rand_like(inputs_embeds) outputs_from_rand_embeds = model.generate( - input_ids, - inputs_embeds=random_embeds, - max_new_tokens=5, - **generation_kwargs, + input_ids, inputs_embeds=random_embeds, **generation_kwargs, **inputs_dict ) for i in range(len(outputs_from_rand_embeds.scores)): self.assertFalse(torch.allclose(outputs_from_embeds.scores[i], outputs_from_rand_embeds.scores[i])) - # input_ids is not a required input -- if we don't pass it, the newly generated tokens will be the same - outputs_from_embeds_wo_ids = model.generate( - inputs_embeds=inputs_embeds, max_new_tokens=5, **generation_kwargs - ) - self.assertListEqual( - outputs_from_embeds.sequences[:, inputs_embeds.shape[1] :].tolist(), - outputs_from_embeds_wo_ids.sequences.tolist(), - ) + # input_ids is not a required input on most models -- if we don't pass it, the newly generated tokens will + # be the same + if not (requires_inputs_ids or missing_attention_mask): + outputs_from_embeds_wo_ids = model.generate( + inputs_embeds=inputs_embeds, **generation_kwargs, **inputs_dict + ) + outputs_from_embeds.sequences = outputs_from_embeds.sequences[:, inputs_embeds.shape[1] :] + self._check_similar_generate_outputs(outputs_from_embeds_wo_ids, outputs_from_embeds) @pytest.mark.generate def test_generate_from_inputs_embeds_with_static_cache(self): @@ -1829,10 +1868,8 @@ def test_new_cache_format(self, num_beams, do_sample): @pytest.mark.generate def test_generate_with_static_cache(self): """ - Tests if StaticCache works if we set attn_implementation=static when generation. 
- This doesn't test if generation quality is good, but tests that models with - self._supports_static_cache don't throw an error when generating and return - a StaticCache object at the end. + Tests that generating with static cache give almost same results as with dynamic cache, and the output cache + has the expected shapes """ for model_class in self.all_generative_model_classes: if not model_class._supports_static_cache: @@ -1851,13 +1888,15 @@ def test_generate_with_static_cache(self): model = model_class(config).to(torch_device).eval() generation_kwargs = { - "max_length": None, "max_new_tokens": max_new_tokens, - "cache_implementation": "static", "return_dict_in_generate": True, # Required to return `past_key_values` + "output_scores": True, "use_cache": True, } + static_cache_generation = model.generate(**generation_kwargs, **inputs_dict, cache_implementation="static") + + # Check 1: The cache shapes must match the expected shapes max_cache_len = seq_length + max_new_tokens config = config.text_config if hasattr(config, "text_config") else config head_dim = ( @@ -1869,12 +1908,14 @@ def test_generate_with_static_cache(self): else config.num_key_value_heads ) num_hidden_layers = config.num_hidden_layers - results = model.generate(**generation_kwargs, **inputs_dict) - cache_shape = (batch_size, num_key_value_heads, max_cache_len, head_dim) - self.assertTrue(isinstance(results.past_key_values, StaticCache)) - self.assertTrue(len(results.past_key_values.key_cache) == num_hidden_layers) - self.assertTrue(results.past_key_values.key_cache[0].shape == cache_shape) + self.assertTrue(isinstance(static_cache_generation.past_key_values, StaticCache)) + self.assertTrue(len(static_cache_generation.past_key_values.key_cache) == num_hidden_layers) + self.assertTrue(static_cache_generation.past_key_values.key_cache[0].shape == cache_shape) + + # Check 2: The outputs must be similar to the case with dynamic cache + dynamic_cache_generation = model.generate(**generation_kwargs, **inputs_dict) + self._check_similar_generate_outputs(dynamic_cache_generation, static_cache_generation) @require_optimum_quanto @pytest.mark.generate @@ -1908,25 +1949,32 @@ def test_generate_with_quant_cache(self): with self.assertRaises(ValueError): model.generate(**generation_kwargs, **inputs_dict) + @parameterized.expand( + [ + ("forward_only", False), # TODO (@joao): a few models failing. After fixed, this should not be "@slow" + ("end_to_end", True), # TODO (@joao): end-to-end compilation is broken with torch 2.5+, explore and fix + ] + ) @pytest.mark.generate @require_torch_gpu @slow - @is_flaky() # compilation may result in equivalent (!= same) FP ops, causing the argmax in `generate` to be flaky - def test_generate_compile_fullgraph(self): + def test_generate_compile(self, _, end_to_end): """ - Tests that `.generate` is compatible with torch.compile without graph breaks, keeping the same results. + Tests that `.generate` is compatible with torch.compile without graph breaks, keeping the same results. Tests + end-to-end compilation and forward pass compilation only. ⚠️ Runs two sequential generations to ensure the cache doesn't get stuck after the first compiled run! 
⚠️ """ for model_class in self.all_generative_model_classes: if not model_class._supports_static_cache: self.skipTest("This model doesn't support static cache") + # TODO (joao) -- fix and enable me :) - if any(model_name in model_class.__name__.lower() for model_name in ["whisper"]): + if end_to_end and any(model_name in model_class.__name__.lower() for model_name in ["whisper"]): self.skipTest("whisper model end-to-end generate compile not yet supported") config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() # TODO (joao) -- fix and enable me :) - if config.is_encoder_decoder: + if end_to_end and config.is_encoder_decoder: self.skipTest("Encoder-decoder model end-to-end generate compile not yet supported") model = model_class(config).to(torch_device) @@ -1941,27 +1989,33 @@ def test_generate_compile_fullgraph(self): generation_kwargs = { "do_sample": False, "max_new_tokens": 10, + "return_dict_in_generate": True, + "output_scores": True, } + # end-to-end works best with dynamic cache, forward compilation works best with static cache + if not end_to_end: + generation_kwargs["cache_implementation"] = "static" - max_cache_len = input_ids.shape[1] + generation_kwargs["max_new_tokens"] - config = config.get_text_config() - past_key_values = StaticCache( - config, batch_size=half_batch_size, max_cache_len=max_cache_len, device=torch_device - ) + # get eager + dynamic cache results for future comparison + dynamic_outputs = [] + for model_inputs in input_ids_sets: + dynamic_outputs.append(model.generate(model_inputs, **generation_kwargs)) + + # get compiled results + generation_config = copy.deepcopy(model.generation_config) + generation_config.update(**generation_kwargs) + torch.compiler.reset() + if end_to_end: + model.generate = torch.compile(model.generate, fullgraph=True, mode="reduce-overhead") + else: + model.forward = torch.compile(model.forward, fullgraph=True, mode="reduce-overhead") + compiled_outputs = [] for model_inputs in input_ids_sets: - # eager dynamic cache - output_dynamic = model.generate(model_inputs, **generation_kwargs) - - # end-to-end compiled dynamic cache - torch.compiler.reset() - compiled_generate = torch.compile(model.generate, fullgraph=True, mode="reduce-overhead") - generation_config = copy.deepcopy(model.generation_config) - generation_config.update(**generation_kwargs) - output_compiled = compiled_generate( - model_inputs, generation_config=generation_config, past_key_values=past_key_values - ) - self.assertListEqual(output_dynamic.tolist(), output_compiled.tolist()) + compiled_outputs.append(model.generate(model_inputs, generation_config=generation_config)) + + for dynamic_result, compiled_result in zip(dynamic_outputs, compiled_outputs): + self._check_similar_generate_outputs(dynamic_result, compiled_result) @pytest.mark.generate def test_generate_methods_with_num_logits_to_keep(self): @@ -1989,7 +2043,6 @@ def test_generate_methods_with_num_logits_to_keep(self): self.assertEqual(with_all_logits.tolist(), without_all_logits.tolist()) @pytest.mark.generate - @is_flaky() # assisted generation tests are flaky (minor fp ops differences) def test_assisted_decoding_with_num_logits_to_keep(self): for model_class in self.all_generative_model_classes: if "num_logits_to_keep" not in set(inspect.signature(model_class.forward).parameters.keys()): @@ -1998,6 +2051,9 @@ def test_assisted_decoding_with_num_logits_to_keep(self): self.skipTest(reason="Stateful models don't support assisted generation") config, inputs_dict = 
self.prepare_config_and_inputs_for_generate(batch_size=1) + # NOTE: assisted generation only works with cache on at the moment. + if not hasattr(config, "use_cache"): + self.skipTest(reason=f"{model_class.__name__} doesn't support caching") config.use_cache = True config.is_decoder = True @@ -2010,14 +2066,16 @@ def test_assisted_decoding_with_num_logits_to_keep(self): "max_new_tokens": 10, "do_sample": False, "assistant_model": assistant_model, + "return_dict_in_generate": True, + "output_scores": True, } - assistant_model.generation_config.assistant_confidence_threshold = None # Setting num_logits_to_keep at 0 keeps all logits (old behavior) with_all_logits = model.generate(**generation_kwargs, **inputs_dict, num_logits_to_keep=0) # By default, num_logits_to_keep is automatically set to 1 if not provided (new behavior) without_all_logits = model.generate(**inputs_dict, **generation_kwargs) - self.assertEqual(with_all_logits.tolist(), without_all_logits.tolist()) + + self._check_similar_generate_outputs(with_all_logits, without_all_logits) @pytest.mark.generate def test_inherits_generation_mixin(self): @@ -2028,14 +2086,21 @@ def test_inherits_generation_mixin(self): for model_class in self.all_generative_model_classes: self.assertTrue("GenerationMixin" in str(model_class.__bases__)) - @require_torch_sdpa - @slow - def test_eager_matches_sdpa_generate(self): + def _test_attention_implementation(self, attn_implementation): + """ + Compares the output of generate with the eager attention implementation against other implementations. + NOTE: despite the test logic being the same, different implementations actually need diferent decorators, hence + this separate function. + """ max_new_tokens = 30 + support_flag = { + "sdpa": "_supports_sdpa", + "flash_attention_2": "_supports_flash_attn_2", + } for model_class in self.all_generative_model_classes: - if not model_class._supports_sdpa: - self.skipTest(f"{model_class.__name__} does not support SDPA") + if not getattr(model_class, support_flag[attn_implementation]): + self.skipTest(f"{model_class.__name__} does not support `attn_implementation={attn_implementation}`") config, original_inputs_dict = self.prepare_config_and_inputs_for_generate() inputs_dict = {} @@ -2062,63 +2127,59 @@ def test_eager_matches_sdpa_generate(self): "do_sample": False, "return_dict_in_generate": True, "output_scores": True, + "use_cache": True, } - model_sdpa = model_class.from_pretrained( + model_eager = model_class.from_pretrained( tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True, + attn_implementation="eager", ).to(torch_device) - res_sdpa = model_sdpa.generate(**inputs_dict, **generate_kwargs) - del model_sdpa + res_eager = model_eager.generate(**inputs_dict, **generate_kwargs) + del model_eager gc.collect() - model_eager = model_class.from_pretrained( + model_attn = model_class.from_pretrained( tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True, - attn_implementation="eager", + attn_implementation=attn_implementation, ).to(torch_device) - res_eager = model_eager.generate(**inputs_dict, **generate_kwargs) - del model_eager + res_attn = model_attn.generate(**inputs_dict, **generate_kwargs) + del model_attn gc.collect() - # Eager and SDPA are very similar, but not exactly the same. Because we are using random models, this - # test would be flaky if we only checked the sequences. Two situations in which this test passes: - # 1. The sequences are the same - # 2. 
The sequences are different, but the scores up until the first mismatch are nearly identical - output_matches = res_eager.sequences == res_sdpa.sequences - has_matching_outputs = output_matches.all() - has_matching_scores = None - if not has_matching_outputs: - input_length = main_input.shape[1] - for batch_idx in range(res_eager.sequences.shape[0]): - batch_matches = output_matches[batch_idx] - if batch_matches.all(): - continue - first_mismatch_idx = batch_matches.int().argmin() # gets the index of the first False - first_mismatch_idx -= input_length # scores doesn't include data regarding input tokens - sdpa_first_mismatch_scores = res_sdpa.scores[first_mismatch_idx][batch_idx] - eager_first_mismatch_scores = res_eager.scores[first_mismatch_idx][batch_idx] - has_matching_scores = torch.allclose( - sdpa_first_mismatch_scores, eager_first_mismatch_scores, rtol=1e-3, atol=1e-3 - ) - if not has_matching_scores: - break + self._check_similar_generate_outputs(res_eager, res_attn, atol=1e-3, rtol=1e-3) - self.assertTrue(has_matching_outputs or has_matching_scores) + @pytest.mark.generate + @require_torch_sdpa + @slow + def test_eager_matches_sdpa_generate(self): + """Tests that generate has equivalent outputs with SDPA and eager attention implementations.""" + self._test_attention_implementation("sdpa") - def _check_outputs(self, output, main_input, config, use_cache=False, num_return_sequences=1): - # we can be sure what is batch size from main input but seq length depends on model type and whether input is text/audio/image - # so we infer actual text seq length from model_tester, same was as it is done in `test_modeling_common.py` tests` - batch_size = main_input.shape[0] + @pytest.mark.flash_attn_test + @require_flash_attn + @require_torch_gpu + @slow + def test_eager_matches_fa2_generate(self): + """Tests that generate has equivalent outputs with FA2 and eager attention implementations.""" + # TODO (@joao @raushan) -- this test is failing the output checks on most models, investigate. 
After fixing, + # check whether we still need the overwrites + self._test_attention_implementation("flash_attention_2") + + def _check_outputs(self, output, config, use_cache=False, num_return_sequences=1, num_beams=1): + input_batch_size = int(output.sequences.shape[0] / num_return_sequences) + internal_batch_size = ( + input_batch_size * num_beams if num_beams > 1 else input_batch_size * num_return_sequences + ) seq_length = getattr(self.model_tester, "seq_length", None) seq_length = getattr(self.model_tester, "encoder_seq_length", seq_length) seq_length = getattr(self.model_tester, "text_seq_length", seq_length) config = config.text_config if hasattr(config, "text_config") else config - num_sequences_in_output = batch_size * num_return_sequences gen_len = ( output.sequences.shape[-1] - 1 if config.is_encoder_decoder else output.sequences.shape[-1] - seq_length @@ -2129,19 +2190,21 @@ def _check_outputs(self, output, main_input, config, use_cache=False, num_return seq_length = self.model_tester.get_subsampled_output_lengths(seq_length) # scores - self._check_scores(num_sequences_in_output, output.scores, length=gen_len, config=config) + self._check_scores(internal_batch_size, output.scores, length=gen_len, config=config) # unprocessed logits - self._check_logits(num_sequences_in_output, output.logits, config=config) + self._check_logits(internal_batch_size, output.logits, config=config) # Attentions if self.has_attentions: if config.is_encoder_decoder: # encoder - self._check_encoder_attention_for_generate(output.encoder_attentions, batch_size, config, seq_length) + self._check_encoder_attention_for_generate( + output.encoder_attentions, input_batch_size, config, seq_length + ) # decoder self._check_attentions_for_generate( - num_sequences_in_output, + internal_batch_size, output.decoder_attentions, min_length=1, max_length=output.sequences.shape[-1], @@ -2153,7 +2216,7 @@ def _check_outputs(self, output, main_input, config, use_cache=False, num_return attentions = output.attentions if not use_cache else output.attentions[1:] min_length = seq_length if not use_cache else seq_length + 1 self._check_attentions_for_generate( - num_sequences_in_output, + internal_batch_size, attentions=attentions, min_length=min_length, max_length=output.sequences.shape[-1], @@ -2165,12 +2228,12 @@ def _check_outputs(self, output, main_input, config, use_cache=False, num_return if config.is_encoder_decoder: # encoder self._check_encoder_hidden_states_for_generate( - output.encoder_hidden_states, batch_size, config, seq_length + output.encoder_hidden_states, input_batch_size, config, seq_length ) # decoder self._check_hidden_states_for_generate( - num_sequences_in_output, + internal_batch_size, output.decoder_hidden_states, min_length=1, max_length=output.sequences.shape[-1], @@ -2182,7 +2245,7 @@ def _check_outputs(self, output, main_input, config, use_cache=False, num_return hidden_states = output.hidden_states if not use_cache else output.hidden_states[1:] min_length = seq_length if not use_cache else seq_length + 1 self._check_hidden_states_for_generate( - num_sequences_in_output, + internal_batch_size, hidden_states, min_length=min_length, max_length=output.sequences.shape[-1], @@ -2213,7 +2276,7 @@ def _check_outputs(self, output, main_input, config, use_cache=False, num_return past_key_values = output.past_key_values past_sequence_length = output.sequences.shape[-1] - 1 self._check_past_key_values_for_generate( - num_sequences_in_output, + internal_batch_size, past_key_values, 
seq_length=past_sequence_length, config=config, diff --git a/tests/models/bart/test_modeling_bart.py b/tests/models/bart/test_modeling_bart.py index eda51d21199f31..e4d0df141be2b9 100644 --- a/tests/models/bart/test_modeling_bart.py +++ b/tests/models/bart/test_modeling_bart.py @@ -1532,8 +1532,3 @@ def test_retain_grad_hidden_states_attentions(self): @unittest.skip def test_save_load_fast_init_from_base(self): pass - - @unittest.skip(reason="Generate needs input ids") - def test_inputs_embeds_matches_input_ids_with_generate(self): - # generate only works with input ids for bartforcausalLM - pass diff --git a/tests/models/bert/test_modeling_bert.py b/tests/models/bert/test_modeling_bert.py index aa9835d8cd67c1..25566027742507 100644 --- a/tests/models/bert/test_modeling_bert.py +++ b/tests/models/bert/test_modeling_bert.py @@ -511,11 +511,6 @@ def test_model_as_decoder(self): config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() self.model_tester.create_and_check_model_as_decoder(*config_and_inputs) - @unittest.skip(reason="Generate needs input ids") - def test_inputs_embeds_matches_input_ids_with_generate(self): - # generate only works with input ids for bertforcausalLM - pass - def test_model_as_decoder_with_default_input_mask(self): # This regression test was failing with PyTorch < 1.3 ( diff --git a/tests/models/chameleon/test_modeling_chameleon.py b/tests/models/chameleon/test_modeling_chameleon.py index aad26ef147e83e..2a8e7633ba40c5 100644 --- a/tests/models/chameleon/test_modeling_chameleon.py +++ b/tests/models/chameleon/test_modeling_chameleon.py @@ -16,17 +16,14 @@ import unittest -import pytest import requests from parameterized import parameterized from transformers import ChameleonConfig, is_torch_available, is_vision_available, set_seed from transformers.testing_utils import ( require_bitsandbytes, - require_flash_attn, require_read_token, require_torch, - require_torch_gpu, slow, torch_device, ) @@ -329,43 +326,6 @@ def test_model_rope_scaling(self, scaling_type): # The output should be different for long inputs self.assertFalse(torch.allclose(original_long_output, scaled_long_output, atol=1e-5)) - @require_flash_attn - @require_read_token - @require_torch_gpu - @require_bitsandbytes - @pytest.mark.flash_attn_test - @slow - def test_flash_attn_2_generate_padding_right(self): - """ - Overwritting the common test as the test is flaky on tiny models - """ - model = ChameleonForConditionalGeneration.from_pretrained( - "facebook/chameleon-7b", - load_in_4bit=True, - device_map={"": 0}, - ) - - processor = ChameleonProcessor.from_pretrained("facebook/chameleon-7b") - texts = ["hi", "Hello this is a very long sentence"] - - processor.tokenizer.padding_side = "right" - - inputs = processor(text=texts, return_tensors="pt", padding=True).to(0) - - output_native = model.generate(**inputs, max_new_tokens=20, do_sample=False) - output_native = processor.tokenizer.batch_decode(output_native) - - model = ChameleonForConditionalGeneration.from_pretrained( - "facebook/chameleon-7b", - load_in_4bit=True, - attn_implementation="flash_attention_2", - ) - - output_fa_2 = model.generate(**inputs, max_new_tokens=20, do_sample=False) - output_fa_2 = processor.tokenizer.batch_decode(output_fa_2) - - self.assertListEqual(output_native, output_fa_2) - @unittest.skip("Chameleon forces some token ids to be -inf!") def test_batching_equivalence(self): pass diff --git a/tests/models/gemma/test_modeling_gemma.py b/tests/models/gemma/test_modeling_gemma.py index 
a888bdcd3bc7be..e8483f8c7c7d32 100644 --- a/tests/models/gemma/test_modeling_gemma.py +++ b/tests/models/gemma/test_modeling_gemma.py @@ -319,9 +319,6 @@ class GemmaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixi # This is because we are hitting edge cases with the causal_mask buffer model_split_percents = [0.5, 0.6] - # used in `test_torch_compile` - _torch_compile_test_ckpt = "google/gemma-2b" - # used in `test_torch_compile_for_training` _torch_compile_train_cls = GemmaForCausalLM if is_torch_available() else None @@ -419,51 +416,6 @@ def test_save_load_fast_init_from_base(self): def test_past_key_values_format(self): pass - @require_flash_attn - @require_torch_gpu - @pytest.mark.flash_attn_test - @slow - def test_flash_attn_2_generate_use_cache(self): - import torch - - max_new_tokens = 30 - - for model_class in self.all_generative_model_classes: - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - dummy_input = inputs_dict[model_class.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - # make sure that all models have enough positions for generation - if hasattr(config, "max_position_embeddings"): - config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1 - - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - - dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) - # NOTE: Gemma apparently does not support right padding + use_cache with FA2. - dummy_attention_mask[:, -1] = 1 - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - # Just test that a large cache works as expected - _ = model.generate( - dummy_input, - attention_mask=dummy_attention_mask, - max_new_tokens=max_new_tokens, - do_sample=False, - use_cache=True, - ) - @require_flash_attn @require_torch_gpu @pytest.mark.flash_attn_test diff --git a/tests/models/gemma2/test_modeling_gemma2.py b/tests/models/gemma2/test_modeling_gemma2.py index 94670803daa998..7bca83f96d73ab 100644 --- a/tests/models/gemma2/test_modeling_gemma2.py +++ b/tests/models/gemma2/test_modeling_gemma2.py @@ -78,7 +78,6 @@ class Gemma2ModelTest(GemmaModelTest, unittest.TestCase): test_pruning = False _is_stateful = True model_split_percents = [0.5, 0.6] - _torch_compile_test_ckpt = "google/gemma-2-9b" def setUp(self): self.model_tester = Gemma2ModelTester(self) diff --git a/tests/models/glm/test_modeling_glm.py b/tests/models/glm/test_modeling_glm.py index 32bce7cbfa615e..b92c5db815b77a 100644 --- a/tests/models/glm/test_modeling_glm.py +++ b/tests/models/glm/test_modeling_glm.py @@ -28,7 +28,6 @@ require_flash_attn, require_torch, require_torch_accelerator, - require_torch_gpu, require_torch_sdpa, slow, torch_device, @@ -306,10 +305,6 @@ class GlmModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, test_headmasking = False test_pruning = False - # used in `test_torch_compile` - _torch_compile_test_ckpt = "THUDM/glm-4-9b" - _torch_compile_test_revision = "refs/pr/15" - def setUp(self): self.model_tester = GlmModelTester(self) self.config_tester = ConfigTester(self, config_class=GlmConfig, hidden_size=37) @@ -426,41 +421,6 @@ def test_custom_4d_attention_mask(self): torch.testing.assert_close(normalized_0, normalized_1, rtol=1e-3, atol=1e-3) - @require_flash_attn - 
@require_torch_gpu - @pytest.mark.flash_attn_test - @slow - def test_flash_attn_2_generate_padding_right(self): - """Overwrite the common test as the test is flaky on tiny models.""" - model = GlmForCausalLM.from_pretrained( - "THUDM/glm-4-9b", - device_map={"": 0}, - torch_dtype=torch.bfloat16, - revision="refs/pr/15", - ) - - tokenizer = AutoTokenizer.from_pretrained("THUDM/glm-4-9b", revision="refs/pr/15") - tokenizer.padding_side = "right" - - texts = ["hi", "Hello this is a very long sentence"] - inputs = tokenizer(texts, return_tensors="pt", padding=True).to(0) - - output_native = model.generate(**inputs, max_new_tokens=15, do_sample=False) - output_native = tokenizer.batch_decode(output_native) - - model = GlmForCausalLM.from_pretrained( - "THUDM/glm-4-9b", - device_map={"": 0}, - attn_implementation="flash_attention_2", - torch_dtype=torch.bfloat16, - revision="refs/pr/15", - ) - - output_fa_2 = model.generate(**inputs, max_new_tokens=15, do_sample=False) - output_fa_2 = tokenizer.batch_decode(output_fa_2) - - self.assertListEqual(output_native, output_fa_2) - @parameterized.expand([("float16",), ("bfloat16",), ("float32",)]) @require_torch_sdpa @slow diff --git a/tests/models/gptj/test_modeling_gptj.py b/tests/models/gptj/test_modeling_gptj.py index 6f6fba50dc123a..afc741cd502dec 100644 --- a/tests/models/gptj/test_modeling_gptj.py +++ b/tests/models/gptj/test_modeling_gptj.py @@ -17,14 +17,9 @@ import datetime import unittest -import pytest - -from transformers import BitsAndBytesConfig, GPTJConfig, is_torch_available +from transformers import GPTJConfig, is_torch_available from transformers.testing_utils import ( - require_bitsandbytes, - require_flash_attn, require_torch, - require_torch_gpu, slow, tooslow, torch_device, @@ -505,44 +500,6 @@ def test_model_from_pretrained(self): model = GPTJModel.from_pretrained(model_name, revision="float16", torch_dtype=torch.float16) self.assertIsNotNone(model) - @require_flash_attn - @require_torch_gpu - @require_bitsandbytes - @pytest.mark.flash_attn_test - @slow - def test_flash_attn_2_generate_padding_right(self): - """ - Overwritting the common test as the test is flaky on tiny models - """ - tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6b") - - texts = ["hi", "Hello this is a very long sentence"] - expected_outputs = [ - "hi<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>Q: I have a question about the new version of the game. 
I have a question about the", - "Hello this is a very long sentence.\n\nA:\n\nI think the best way to understand this is to think of it", - ] - - tokenizer.padding_side = "right" - tokenizer.pad_token = tokenizer.eos_token - - inputs = tokenizer(texts, return_tensors="pt", padding=True).to(0) - - quantization_config = BitsAndBytesConfig(load_in_4bit=True) - - model = GPTJForCausalLM.from_pretrained( - "EleutherAI/gpt-j-6b", - device_map={"": 0}, - attn_implementation="flash_attention_2", - revision="float16", - torch_dtype=torch.float16, - quantization_config=quantization_config, - ) - - output_fa_2 = model.generate(**inputs, max_new_tokens=20, do_sample=False) - output_fa_2 = tokenizer.batch_decode(output_fa_2) - - self.assertListEqual(expected_outputs, output_fa_2) - @require_torch class GPTJModelLanguageGenerationTest(unittest.TestCase): diff --git a/tests/models/granite/test_modeling_granite.py b/tests/models/granite/test_modeling_granite.py index 1bcb6641803c04..97b59f5aa50621 100644 --- a/tests/models/granite/test_modeling_granite.py +++ b/tests/models/granite/test_modeling_granite.py @@ -17,12 +17,10 @@ import tempfile import unittest -import pytest from parameterized import parameterized -from transformers import AutoTokenizer, GraniteConfig, is_torch_available, set_seed +from transformers import GraniteConfig, is_torch_available, set_seed from transformers.testing_utils import ( - require_bitsandbytes, require_flash_attn, require_read_token, require_torch, @@ -303,9 +301,6 @@ class GraniteModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi # This is because we are hitting edge cases with the causal_mask buffer model_split_percents = [0.5, 0.7, 0.8] - # used in `test_torch_compile` - _torch_compile_test_ckpt = "ibm/PowerLM-3b" - def setUp(self): self.model_tester = GraniteModelTester(self) self.config_tester = ConfigTester(self, config_class=GraniteConfig, hidden_size=37) @@ -423,46 +418,6 @@ def test_model_rope_scaling(self): with self.assertRaises(AssertionError): torch.testing.assert_close(yarn_sin_long, original_sin_long) - @require_flash_attn - @require_torch_gpu - @require_bitsandbytes - @pytest.mark.flash_attn_test - @require_read_token - @slow - def test_flash_attn_2_generate_padding_right(self): - """ - Overwritting the common test as the test is flaky on tiny models - """ - model = GraniteForCausalLM.from_pretrained( - "ibm/PowerLM-3b", - load_in_4bit=True, - device_map={"": 0}, - ) - - tokenizer = AutoTokenizer.from_pretrained("ibm/PowerLM-3b") - - texts = ["hi", "Hello this is a very long sentence"] - - tokenizer.padding_side = "right" - tokenizer.pad_token = tokenizer.eos_token - - inputs = tokenizer(texts, return_tensors="pt", padding=True).to(0) - - output_native = model.generate(**inputs, max_new_tokens=20, do_sample=False) - output_native = tokenizer.batch_decode(output_native) - - model = GraniteForCausalLM.from_pretrained( - "ibm/PowerLM-3b", - load_in_4bit=True, - device_map={"": 0}, - attn_implementation="flash_attention_2", - ) - - output_fa_2 = model.generate(**inputs, max_new_tokens=20, do_sample=False) - output_fa_2 = tokenizer.batch_decode(output_fa_2) - - self.assertListEqual(output_native, output_fa_2) - @require_flash_attn @require_torch_gpu @slow diff --git a/tests/models/granitemoe/test_modeling_granitemoe.py b/tests/models/granitemoe/test_modeling_granitemoe.py index 124ce0c3bb5ae6..f2f76b9fa75bf3 100644 --- a/tests/models/granitemoe/test_modeling_granitemoe.py +++ b/tests/models/granitemoe/test_modeling_granitemoe.py @@ -17,12 +17,10 
@@ import tempfile import unittest -import pytest from parameterized import parameterized from transformers import AutoTokenizer, GraniteMoeConfig, is_torch_available, set_seed from transformers.testing_utils import ( - require_bitsandbytes, require_flash_attn, require_read_token, require_torch, @@ -302,9 +300,6 @@ class GraniteMoeModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.Test # This is because we are hitting edge cases with the causal_mask buffer model_split_percents = [0.5, 0.7, 0.8] - # used in `test_torch_compile` - _torch_compile_test_ckpt = "ibm/PowerMoE-3b" - def setUp(self): self.model_tester = GraniteMoeModelTester(self) self.config_tester = ConfigTester(self, config_class=GraniteMoeConfig, hidden_size=37) @@ -422,46 +417,6 @@ def test_model_rope_scaling(self): with self.assertRaises(AssertionError): torch.testing.assert_close(yarn_sin_long, original_sin_long) - @require_flash_attn - @require_torch_gpu - @require_bitsandbytes - @pytest.mark.flash_attn_test - @require_read_token - @slow - def test_flash_attn_2_generate_padding_right(self): - """ - Overwritting the common test as the test is flaky on tiny models - """ - model = GraniteMoeForCausalLM.from_pretrained( - "ibm-granite/granitemoe-3b", - load_in_4bit=True, - device_map={"": 0}, - ) - - tokenizer = AutoTokenizer.from_pretrained("ibm-granite/granitemoe-3b") - - texts = ["hi", "Hello this is a very long sentence"] - - tokenizer.padding_side = "right" - tokenizer.pad_token = tokenizer.eos_token - - inputs = tokenizer(texts, return_tensors="pt", padding=True).to(0) - - output_native = model.generate(**inputs, max_new_tokens=20, do_sample=False) - output_native = tokenizer.batch_decode(output_native) - - model = GraniteMoeForCausalLM.from_pretrained( - "ibm-granite/granitemoe-3b", - load_in_4bit=True, - device_map={"": 0}, - attn_implementation="flash_attention_2", - ) - - output_fa_2 = model.generate(**inputs, max_new_tokens=20, do_sample=False) - output_fa_2 = tokenizer.batch_decode(output_fa_2) - - self.assertListEqual(output_native, output_fa_2) - @require_flash_attn @require_torch_gpu @slow diff --git a/tests/models/idefics/test_modeling_idefics.py b/tests/models/idefics/test_modeling_idefics.py index c2f0ef8ccd01d3..d19d10932bfcdc 100644 --- a/tests/models/idefics/test_modeling_idefics.py +++ b/tests/models/idefics/test_modeling_idefics.py @@ -770,13 +770,6 @@ def test_contrastive_generate_low_memory(self): def test_custom_4d_attention_mask(self): pass - @unittest.skip( - reason="IDEFICS has specific requirements for working with inputs embeds like passing also the ids and pixels" - ) - @parameterized.expand([(1,), (2,)]) - def test_generate_from_inputs_embeds_decoder_only(self, num_beams): - pass - @unittest.skip(reason="IDEFICS cannot compile due to dynamic control flow when checking inputs") def test_generate_compile_fullgraph(self): pass diff --git a/tests/models/idefics2/test_modeling_idefics2.py b/tests/models/idefics2/test_modeling_idefics2.py index 854b8b934578e0..042fecf4bd25f7 100644 --- a/tests/models/idefics2/test_modeling_idefics2.py +++ b/tests/models/idefics2/test_modeling_idefics2.py @@ -20,7 +20,6 @@ import unittest from io import BytesIO -import pytest import requests from transformers import ( @@ -420,50 +419,6 @@ def test_prompt_lookup_decoding_matches_greedy_search(self): def test_flash_attn_2_fp32_ln(self): pass - @pytest.mark.generate - def test_generate_from_inputs_embeds_decoder_only(self): - # overwrite because IDEFICS needs ids and embeds at the input to be not None - for 
model_class in self.all_generative_model_classes: - config, inputs_dict = self.prepare_config_and_inputs_for_generate() - - # Ignore: - # a) eos (to always output 20 tokens) and pad (so we don't try to infer the attn mask from the input_ids, - # which would cause a mismatch), - config.pad_token_id = config.eos_token_id = -1 - config.is_decoder = True - model = model_class(config).to(torch_device).eval() - input_ids = inputs_dict.pop("input_ids") - - # Traditional way of generating text - outputs_from_ids = model.generate( - input_ids, max_new_tokens=5, return_dict_in_generate=True, output_scores=True - ) - self.assertEqual(outputs_from_ids.sequences.shape, (input_ids.shape[0], input_ids.shape[1] + 5)) - - # Same thing, but from input embeddings (`input_ids` is passed so the prompt is present in the output) - inputs_embeds = model.get_input_embeddings()(input_ids) - outputs_from_embeds = model.generate( - input_ids, - inputs_embeds=inputs_embeds, - max_new_tokens=5, - return_dict_in_generate=True, - output_scores=True, - ) - self.assertListEqual(outputs_from_ids.sequences.tolist(), outputs_from_embeds.sequences.tolist()) - - # But if we pass different inputs_embeds, we should get different outputs (the output text may be the - # same, but the logits will almost surely be different) - random_embeds = torch.rand_like(inputs_embeds) - outputs_from_rand_embeds = model.generate( - input_ids, - inputs_embeds=random_embeds, - max_new_tokens=5, - return_dict_in_generate=True, - output_scores=True, - ) - for i in range(len(outputs_from_rand_embeds.scores)): - self.assertFalse(torch.allclose(outputs_from_embeds.scores[i], outputs_from_rand_embeds.scores[i])) - # We need to override as we need to prepare such that the image token is the last token def test_resize_tokens_embeddings(self): (original_config, inputs_dict) = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/tests/models/idefics3/test_modeling_idefics3.py b/tests/models/idefics3/test_modeling_idefics3.py index f0366e7b539a50..5dc352d22fe0c0 100644 --- a/tests/models/idefics3/test_modeling_idefics3.py +++ b/tests/models/idefics3/test_modeling_idefics3.py @@ -19,7 +19,6 @@ import unittest from io import BytesIO -import pytest import requests from transformers import ( @@ -180,10 +179,6 @@ def test_inputs_embeds(): def test_inputs_embeds_matches_input_ids(self): pass - @unittest.skip(reason="Model does not support padding right") - def test_flash_attn_2_generate_padding_right(self): - pass - @unittest.skip(reason="Model does not support padding right") def test_flash_attn_2_inference_padding_right(self): pass @@ -337,10 +332,6 @@ def setUp(self): def test_inputs_embeds(): pass - @unittest.skip(reason="Model does not support padding right") - def test_flash_attn_2_generate_padding_right(self): - pass - @unittest.skip(reason="Model does not support padding right") def test_flash_attn_2_inference_padding_right(self): pass @@ -367,50 +358,6 @@ def test_prompt_lookup_decoding_matches_greedy_search(self): def test_flash_attn_2_fp32_ln(self): pass - @pytest.mark.generate - def test_generate_from_inputs_embeds_decoder_only(self): - # overwrite because IDEFICS needs ids and embeds at the input to be not None - for model_class in self.all_generative_model_classes: - config, inputs_dict = self.prepare_config_and_inputs_for_generate() - - # Ignore: - # a) eos (to always output 20 tokens) and pad (so we don't try to infer the attn mask from the input_ids, - # which would cause a mismatch), - config.pad_token_id = config.eos_token_id = 
-1 - config.is_decoder = True - model = model_class(config).to(torch_device).eval() - input_ids = inputs_dict.pop("input_ids") - - # Traditional way of generating text - outputs_from_ids = model.generate( - input_ids, max_new_tokens=5, return_dict_in_generate=True, output_scores=True - ) - self.assertEqual(outputs_from_ids.sequences.shape, (input_ids.shape[0], input_ids.shape[1] + 5)) - - # Same thing, but from input embeddings (`input_ids` is passed so the prompt is present in the output) - inputs_embeds = model.get_input_embeddings()(input_ids) - outputs_from_embeds = model.generate( - input_ids, - inputs_embeds=inputs_embeds, - max_new_tokens=5, - return_dict_in_generate=True, - output_scores=True, - ) - self.assertListEqual(outputs_from_ids.sequences.tolist(), outputs_from_embeds.sequences.tolist()) - - # But if we pass different inputs_embeds, we should get different outputs (the output text may be the - # same, but the logits will almost surely be different) - random_embeds = torch.rand_like(inputs_embeds) - outputs_from_rand_embeds = model.generate( - input_ids, - inputs_embeds=random_embeds, - max_new_tokens=5, - return_dict_in_generate=True, - output_scores=True, - ) - for i in range(len(outputs_from_rand_embeds.scores)): - self.assertFalse(torch.allclose(outputs_from_embeds.scores[i], outputs_from_rand_embeds.scores[i])) - # We need to override as we need to prepare such that the image token is the last token def test_resize_tokens_embeddings(self): (original_config, inputs_dict) = self.model_tester.prepare_config_and_inputs_for_common() @@ -526,31 +473,6 @@ def test_resize_embeddings_untied(self): # Check that the model can still do a forward pass successfully (every parameter should be resized) model(**self._prepare_for_class(inputs_dict, model_class)) - def test_inputs_embeds_matches_input_ids_with_generate(self): - # overwrite because IDEFICS needs ids and embeds at the input to be not None - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - for model_class in self.all_model_classes: - model = model_class(config) - model.to(torch_device) - model.eval() - - inputs = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class)) - pad_token_id = config.pad_token_id if config.pad_token_id is not None else 1 - - wte = model.get_input_embeddings() - - input_ids = inputs["input_ids"] - # some models infer position ids/attn mask differently when input ids - # by check if pad_token let's make sure no padding is in input ids - not_pad_token_id = pad_token_id + 1 if max(0, pad_token_id - 1) == 0 else pad_token_id - 1 - input_ids[input_ids == pad_token_id] = not_pad_token_id - del inputs["input_ids"] - inputs_embeds = wte(input_ids) - out_ids = model.generate(input_ids=input_ids, **inputs, max_new_tokens=2) - out_embeds = model.generate(input_ids=input_ids, inputs_embeds=inputs_embeds, **inputs, max_new_tokens=2) - - self.assertTrue(torch.allclose(out_embeds, out_ids)) - @require_torch class Idefics3ForConditionalGenerationIntegrationTest(unittest.TestCase): diff --git a/tests/models/jamba/test_modeling_jamba.py b/tests/models/jamba/test_modeling_jamba.py index 251f293f722661..ef0b5831587be1 100644 --- a/tests/models/jamba/test_modeling_jamba.py +++ b/tests/models/jamba/test_modeling_jamba.py @@ -539,93 +539,6 @@ def test_flash_attn_2_fp32_ln(self): # with attention mask _ = model(dummy_input, attention_mask=dummy_attention_mask) - @require_flash_attn - @require_torch_gpu - @pytest.mark.flash_attn_test - @slow - def 
test_flash_attn_2_generate_padding_right(self): - r""" - Overriding the test_flash_attn_2_generate_padding_right test as the Jamba model, like Mixtral, doesn't support - right padding + use cache with FA2 - """ - import torch - - for model_class in self.all_generative_model_classes: - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True).to( - torch_device - ) - - dummy_input = torch.LongTensor([[0, 2, 3, 4], [0, 2, 3, 4]]).to(torch_device) - dummy_attention_mask = torch.LongTensor([[1, 1, 1, 1], [1, 1, 1, 0]]).to(torch_device) - - model.generate(dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False) - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - with self.assertRaises(ValueError): - _ = model.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False - ) - - @require_flash_attn - @require_torch_gpu - @pytest.mark.flash_attn_test - @slow - def test_flash_attn_2_generate_use_cache(self): - r""" - Overriding the test_flash_attn_2_generate_use_cache test as the Jamba model, like Mixtral, doesn't support - right padding + use cache with FA2 - """ - import torch - - max_new_tokens = 30 - - for model_class in self.all_generative_model_classes: - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - dummy_input = inputs_dict[model_class.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - # make sure that all models have enough positions for generation - if hasattr(config, "max_position_embeddings"): - config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1 - - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - - dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) - # NOTE: Jamba does not support right padding + use_cache with FA2. 
- dummy_attention_mask[:, -1] = 1 - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - # Just test that a large cache works as expected - _ = model.generate( - dummy_input, - attention_mask=dummy_attention_mask, - max_new_tokens=max_new_tokens, - do_sample=False, - use_cache=True, - ) - @require_flash_attn @require_torch_gpu @pytest.mark.flash_attn_test diff --git a/tests/models/jetmoe/test_modeling_jetmoe.py b/tests/models/jetmoe/test_modeling_jetmoe.py index a04d8bba741a23..dc510f0ff040bb 100644 --- a/tests/models/jetmoe/test_modeling_jetmoe.py +++ b/tests/models/jetmoe/test_modeling_jetmoe.py @@ -15,7 +15,6 @@ """Testing suite for the PyTorch JetMoe model.""" import gc -import tempfile import unittest import pytest @@ -377,85 +376,6 @@ def test_save_load_fast_init_from_base(self): def test_past_key_values_format(self): pass - @require_flash_attn - @require_torch_gpu - @pytest.mark.flash_attn_test - @slow - def test_flash_attn_2_generate_padding_right(self): - import torch - - for model_class in self.all_generative_model_classes: - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True).to( - torch_device - ) - - dummy_input = torch.LongTensor([[0, 2, 3, 4], [0, 2, 3, 4]]).to(torch_device) - dummy_attention_mask = torch.LongTensor([[1, 1, 1, 1], [1, 1, 1, 0]]).to(torch_device) - - model.generate(dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False) - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - with self.assertRaises(ValueError): - _ = model.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False - ) - - @require_flash_attn - @require_torch_gpu - @pytest.mark.flash_attn_test - @slow - def test_flash_attn_2_generate_use_cache(self): - import torch - - max_new_tokens = 30 - - for model_class in self.all_generative_model_classes: - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - dummy_input = inputs_dict[model_class.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - # make sure that all models have enough positions for generation - if hasattr(config, "max_position_embeddings"): - config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1 - - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - - dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) - # NOTE: JetMoe apparently does not support right padding + use_cache with FA2. 
- dummy_attention_mask[:, -1] = 1 - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - # Just test that a large cache works as expected - _ = model.generate( - dummy_input, - attention_mask=dummy_attention_mask, - max_new_tokens=max_new_tokens, - do_sample=False, - use_cache=True, - ) - @require_flash_attn @require_torch_gpu @pytest.mark.flash_attn_test diff --git a/tests/models/kosmos2/test_modeling_kosmos2.py b/tests/models/kosmos2/test_modeling_kosmos2.py index de6c0b15d661f9..0f0b595d3d2306 100644 --- a/tests/models/kosmos2/test_modeling_kosmos2.py +++ b/tests/models/kosmos2/test_modeling_kosmos2.py @@ -438,12 +438,6 @@ def check_same_values(layer_1, layer_2): # self.assertTrue(model.transformer.wte.weight.shape, model.lm_head.weight.shape) # self.assertTrue(check_same_values(model.transformer.wte, model.lm_head)) - @unittest.skip( - "KOSMOS-2 doesn't support inputs embeds. The test isn't skipped by checking ipnut args because KOSMOS-2 has `generate()` overwritten" - ) - def test_inputs_embeds_matches_input_ids_with_generate(self): - pass - @slow def test_model_from_pretrained(self): model_name = "microsoft/kosmos-2-patch14-224" diff --git a/tests/models/llama/test_modeling_llama.py b/tests/models/llama/test_modeling_llama.py index 824337d8bdda01..375ec1dd3e6f3a 100644 --- a/tests/models/llama/test_modeling_llama.py +++ b/tests/models/llama/test_modeling_llama.py @@ -26,7 +26,6 @@ from transformers.generation.configuration_utils import GenerationConfig from transformers.testing_utils import ( backend_empty_cache, - require_bitsandbytes, require_flash_attn, require_read_token, require_torch, @@ -316,9 +315,6 @@ class LlamaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixi # This is because we are hitting edge cases with the causal_mask buffer model_split_percents = [0.5, 0.7, 0.8] - # used in `test_torch_compile` - _torch_compile_test_ckpt = "meta-llama/Llama-2-7b-hf" - # used in `test_torch_compile_for_training` _torch_compile_train_cls = LlamaForCausalLM if is_torch_available() else None @@ -585,43 +581,6 @@ def _reinitialize_config(base_config, new_kwargs): with self.assertRaises(KeyError): config = _reinitialize_config(base_config, {"rope_scaling": {"rope_type": "linear"}}) # missing "factor" - @require_flash_attn - @require_torch_gpu - @require_bitsandbytes - @pytest.mark.flash_attn_test - @require_read_token - @slow - def test_flash_attn_2_generate_padding_right(self): - """ - Overwritting the common test as the test is flaky on tiny models - """ - model = LlamaForCausalLM.from_pretrained( - "meta-llama/Llama-2-7b-hf", - load_in_4bit=True, - device_map={"": 0}, - ) - - tokenizer = LlamaTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf") - - texts = ["hi", "Hello this is a very long sentence"] - - tokenizer.padding_side = "right" - tokenizer.pad_token = tokenizer.eos_token - - inputs = tokenizer(texts, return_tensors="pt", padding=True).to(0) - - output_native = model.generate(**inputs, max_new_tokens=20, do_sample=False) - output_native = tokenizer.batch_decode(output_native) - - model = LlamaForCausalLM.from_pretrained( - "meta-llama/Llama-2-7b-hf", load_in_4bit=True, device_map={"": 0}, attn_implementation="flash_attention_2" - ) - - output_fa_2 = model.generate(**inputs, max_new_tokens=20, do_sample=False) - output_fa_2 = tokenizer.batch_decode(output_fa_2) - - self.assertListEqual(output_native, output_fa_2) - @require_flash_attn 
@require_torch_gpu @slow diff --git a/tests/models/mamba2/test_modeling_mamba2.py b/tests/models/mamba2/test_modeling_mamba2.py index 1a8cf04774531f..9b3a9563b58ddc 100644 --- a/tests/models/mamba2/test_modeling_mamba2.py +++ b/tests/models/mamba2/test_modeling_mamba2.py @@ -204,8 +204,8 @@ def test_generate_without_input_ids(self): pass @unittest.skip(reason="To fix, Mamba 2 cache slicing test case is an edge case") - @parameterized.expand([(1,), (2,)]) - def test_generate_from_inputs_embeds_decoder_only(self, num_beams): + @parameterized.expand([("greedy", 1), ("beam search", 2)]) + def test_generate_from_inputs_embeds(self, _, num_beams): pass @unittest.skip(reason="To fix, Mamba 2 cache slicing test case is an edge case") @@ -276,12 +276,6 @@ def recursive_check(tuple_object, dict_object): dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True}) - @unittest.skip( - reason="Mamba2 does not support generating with input embeddings (custom cache_position computation)" - ) - def test_inputs_embeds_matches_input_ids_with_generate(self): - pass - @require_torch @slow diff --git a/tests/models/mimi/test_modeling_mimi.py b/tests/models/mimi/test_modeling_mimi.py index 074dceae155214..df0007d666a077 100644 --- a/tests/models/mimi/test_modeling_mimi.py +++ b/tests/models/mimi/test_modeling_mimi.py @@ -21,7 +21,6 @@ import numpy as np from datasets import Audio, load_dataset -from packaging import version from parameterized import parameterized from pytest import mark @@ -745,22 +744,6 @@ def test_flash_attn_2_inference_equivalence_right_padding(self): def test_sdpa_can_compile_dynamic(self): pass - # For now, Let's focus only on GPU for `torch.compile` - @slow - @require_torch_gpu - def test_torch_compile(self): - if version.parse(torch.__version__) < version.parse("2.3"): - self.skipTest(reason="This test requires torch >= 2.3 to run.") - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - n_iter = 3 - for model_class in self.all_model_classes: - model = model_class(config).to(torch_device) - model.forward = torch.compile(model.forward) - for i in range(n_iter): - _ = model(inputs_dict["input_values"].to(torch_device)) - @is_flaky() def test_batching_equivalence(self): super().test_batching_equivalence() diff --git a/tests/models/mistral/test_modeling_mistral.py b/tests/models/mistral/test_modeling_mistral.py index f2ee714bcdbafc..1538735ad78bd7 100644 --- a/tests/models/mistral/test_modeling_mistral.py +++ b/tests/models/mistral/test_modeling_mistral.py @@ -15,7 +15,6 @@ """Testing suite for the PyTorch Mistral model.""" import gc -import tempfile import unittest import pytest @@ -416,85 +415,6 @@ def test_save_load_fast_init_from_base(self): def test_past_key_values_format(self): pass - @require_flash_attn - @require_torch_gpu - @pytest.mark.flash_attn_test - @slow - def test_flash_attn_2_generate_padding_right(self): - import torch - - for model_class in self.all_generative_model_classes: - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True).to( - torch_device - ) - - dummy_input = torch.LongTensor([[0, 2, 3, 4], [0, 2, 3, 4]]).to(torch_device) - dummy_attention_mask = torch.LongTensor([[1, 1, 1, 1], [1, 1, 1, 
0]]).to(torch_device) - - model.generate(dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False) - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - with self.assertRaises(ValueError): - _ = model.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False - ) - - @require_flash_attn - @require_torch_gpu - @pytest.mark.flash_attn_test - @slow - def test_flash_attn_2_generate_use_cache(self): - import torch - - max_new_tokens = 30 - - for model_class in self.all_generative_model_classes: - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - dummy_input = inputs_dict[model_class.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - # make sure that all models have enough positions for generation - if hasattr(config, "max_position_embeddings"): - config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1 - - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - - dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) - # NOTE: Mistral apparently does not support right padding + use_cache with FA2. - dummy_attention_mask[:, -1] = 1 - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - # Just test that a large cache works as expected - _ = model.generate( - dummy_input, - attention_mask=dummy_attention_mask, - max_new_tokens=max_new_tokens, - do_sample=False, - use_cache=True, - ) - @require_flash_attn @require_torch_gpu @pytest.mark.flash_attn_test diff --git a/tests/models/mixtral/test_modeling_mixtral.py b/tests/models/mixtral/test_modeling_mixtral.py index b9b5faed851fe4..931bb1f17beccf 100644 --- a/tests/models/mixtral/test_modeling_mixtral.py +++ b/tests/models/mixtral/test_modeling_mixtral.py @@ -14,7 +14,6 @@ # limitations under the License. 
"""Testing suite for the PyTorch Mixtral model.""" -import tempfile import unittest import pytest @@ -415,85 +414,6 @@ def test_save_load_fast_init_from_base(self): def test_past_key_values_format(self): pass - @require_flash_attn - @require_torch_gpu - @pytest.mark.flash_attn_test - @slow - def test_flash_attn_2_generate_padding_right(self): - import torch - - for model_class in self.all_generative_model_classes: - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True).to( - torch_device - ) - - dummy_input = torch.LongTensor([[0, 2, 3, 4], [0, 2, 3, 4]]).to(torch_device) - dummy_attention_mask = torch.LongTensor([[1, 1, 1, 1], [1, 1, 1, 0]]).to(torch_device) - - model.generate(dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False) - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - with self.assertRaises(ValueError): - _ = model.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False - ) - - @require_flash_attn - @require_torch_gpu - @pytest.mark.flash_attn_test - @slow - def test_flash_attn_2_generate_use_cache(self): - import torch - - max_new_tokens = 30 - - for model_class in self.all_generative_model_classes: - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - dummy_input = inputs_dict[model_class.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - # make sure that all models have enough positions for generation - if hasattr(config, "max_position_embeddings"): - config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1 - - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - - dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) - # NOTE: Mixtral apparently does not support right padding + use_cache with FA2. 
- dummy_attention_mask[:, -1] = 1 - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - # Just test that a large cache works as expected - _ = model.generate( - dummy_input, - attention_mask=dummy_attention_mask, - max_new_tokens=max_new_tokens, - do_sample=False, - use_cache=True, - ) - @require_flash_attn @require_torch_gpu @pytest.mark.flash_attn_test diff --git a/tests/models/mllama/test_modeling_mllama.py b/tests/models/mllama/test_modeling_mllama.py index 3efa7b778fb75c..5174247b895eea 100644 --- a/tests/models/mllama/test_modeling_mllama.py +++ b/tests/models/mllama/test_modeling_mllama.py @@ -126,7 +126,6 @@ class MllamaForCausalLMModelTest(ModelTesterMixin, GenerationTesterMixin, unitte all_generative_model_classes = (MllamaForCausalLM,) if is_torch_available() else () test_pruning = False test_head_masking = False - _torch_compile_test_ckpt = "nltpt/Llama-3.2-11B-Vision" def setUp(self): self.model_tester = MllamaText2TextModelTester(self) diff --git a/tests/models/moshi/test_modeling_moshi.py b/tests/models/moshi/test_modeling_moshi.py index b77a6ff10364ca..7d4b855c10d8bf 100644 --- a/tests/models/moshi/test_modeling_moshi.py +++ b/tests/models/moshi/test_modeling_moshi.py @@ -560,7 +560,7 @@ def _get_input_ids_and_config(self, batch_size=2): return config, input_ids, attention_mask, inputs_dict def prepare_config_and_inputs_for_generate(self, batch_size=2): - config, filtered_inputs_dict = super().prepare_config_and_inputs_for_generate() + config, filtered_inputs_dict = super().prepare_config_and_inputs_for_generate(batch_size=batch_size) # Make sure we only return `input_ids`. # Note that audio_codes will still be generated internally, so the ability to test audio codes is still there. 
@@ -591,9 +591,11 @@ def _check_hidden_states_for_generate( [expected_shape] * len(iter_hidden_states), ) - def _check_outputs(self, output, input_ids, config, use_cache=False, num_return_sequences=1): + def _check_outputs(self, output, config, use_cache=False, num_return_sequences=1, num_beams=1): # Overwrite because the generate method actually alway uses `inputs_embeds` so `use_cache` is always `True` - super()._check_outputs(output, input_ids, config, use_cache=True, num_return_sequences=num_return_sequences) + super()._check_outputs( + output, config, use_cache=True, num_return_sequences=num_return_sequences, num_beams=num_beams + ) def _check_hidden_states_for_generate( self, batch_size, hidden_states, min_length, max_length, config, use_cache=False, num_beam_groups=1 @@ -655,59 +657,6 @@ def test_initialization(self): msg=f"Parameter {name} of model {model_class} seems not properly initialized", ) - @pytest.mark.generate - @parameterized.expand([(1,), (2,)]) - def test_generate_from_inputs_embeds_decoder_only(self, num_beams): - for model_class in self.all_generative_model_classes: - config, input_ids, _, inputs_dict = self._get_input_ids_and_config() - - model = model_class(config).to(torch_device).eval() - generation_kwargs = { - "return_dict_in_generate": True, - "output_scores": True, - "num_beams": num_beams, - "do_sample": False, - } - - # Traditional way of generating text - outputs_from_ids = model.generate(input_ids, max_new_tokens=5, **generation_kwargs, **inputs_dict) - self.assertEqual(outputs_from_ids.sequences.shape, (input_ids.shape[0], input_ids.shape[1] + 5)) - - # Same thing, but from input embeddings (`input_ids` is passed so the prompt is present in the output) - inputs_embeds = model.get_input_embeddings()(input_ids) - outputs_from_embeds = model.generate( - input_ids, - inputs_embeds=inputs_embeds, - max_new_tokens=5, - **generation_kwargs, - **inputs_dict, - ) - - # But if we pass different inputs_embeds, we should get different outputs (the output text may be the - # same, but the logits will almost surely be different) - random_embeds = torch.rand_like(inputs_embeds) - outputs_from_rand_embeds = model.generate( - input_ids, - inputs_embeds=random_embeds, - max_new_tokens=5, - **generation_kwargs, - **inputs_dict, - ) - for i in range(len(outputs_from_rand_embeds.scores)): - self.assertFalse(torch.allclose(outputs_from_embeds.scores[i], outputs_from_rand_embeds.scores[i])) - - # input_ids is not a required input -- if we don't pass it, the newly generated tokens will be the same - outputs_from_embeds_wo_ids = model.generate( - inputs_embeds=inputs_embeds, - max_new_tokens=5, - **generation_kwargs, - **inputs_dict, - ) - self.assertListEqual( - outputs_from_embeds.sequences[:, inputs_embeds.shape[1] :].tolist(), - outputs_from_embeds_wo_ids.sequences.tolist(), - ) - @unittest.skip(reason="Continuing from past key values is not straightforward as we're dealing with 3 inputs") def test_generate_continue_from_past_key_values(self): pass diff --git a/tests/models/mt5/test_modeling_mt5.py b/tests/models/mt5/test_modeling_mt5.py index 20412da2e1db06..1628d3a5893eaa 100644 --- a/tests/models/mt5/test_modeling_mt5.py +++ b/tests/models/mt5/test_modeling_mt5.py @@ -576,9 +576,6 @@ class MT5ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, # The small MT5 model needs higher percentages for CPU/MP tests model_split_percents = [0.5, 0.8, 0.9] - # used in `test_torch_compile` - _torch_compile_test_ckpt = "google/mt5-small" - def setUp(self): 
self.model_tester = MT5ModelTester(self) self.config_tester = ConfigTester(self, config_class=MT5Config, d_model=37) diff --git a/tests/models/musicgen/test_modeling_musicgen.py b/tests/models/musicgen/test_modeling_musicgen.py index 346ad60debe23f..963cace28d6e41 100644 --- a/tests/models/musicgen/test_modeling_musicgen.py +++ b/tests/models/musicgen/test_modeling_musicgen.py @@ -450,144 +450,6 @@ def test_flash_attn_2_inference_equivalence_right_padding(self): assert torch.allclose(logits_fa[:-1], logits[:-1], atol=4e-2, rtol=4e-2) - @require_flash_attn - @require_torch_gpu - @mark.flash_attn_test - @slow - # Copied from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_generate_left_padding - def test_flash_attn_2_generate_left_padding(self): - # Ignore copy - for model_class in self.greedy_sample_model_classes: - if not model_class._supports_flash_attn_2: - self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True).to( - torch_device - ) - - dummy_input = inputs_dict[model.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) - # make sure we do left padding - dummy_attention_mask[:, :-1] = 0 - dummy_attention_mask[:, -1:] = 1 - - out = model.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=8, do_sample=False - ) - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - out_fa = model.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=8, do_sample=False - ) - - self.assertTrue(torch.allclose(out, out_fa)) - - @require_flash_attn - @require_torch_gpu - @mark.flash_attn_test - @slow - # Copied from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_generate_padding_right - def test_flash_attn_2_generate_padding_right(self): - # Ignore copy - for model_class in self.greedy_sample_model_classes: - if not model_class._supports_flash_attn_2: - self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True).to( - torch_device - ) - - dummy_input = inputs_dict[model.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) - # make sure we do right padding - dummy_attention_mask[:, :-1] = 1 - dummy_attention_mask[:, -1:] = 0 - - out = model.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=8, do_sample=False - ) - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - out_fa = model.generate( - dummy_input, 
attention_mask=dummy_attention_mask, max_new_tokens=8, do_sample=False - ) - - self.assertTrue(torch.allclose(out, out_fa)) - - @require_flash_attn - @require_torch_gpu - @mark.flash_attn_test - @slow - # Copied from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_generate_use_cache - def test_flash_attn_2_generate_use_cache(self): - max_new_tokens = 30 - - # Ignore copy - for model_class in self.greedy_sample_model_classes: - if not model_class._supports_flash_attn_2: - self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - dummy_input = inputs_dict[model_class.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - # make sure that all models have enough positions for generation - if hasattr(config, "max_position_embeddings"): - config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1 - - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - - dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - # Just test that a large cache works as expected - _ = model.generate( - dummy_input, - attention_mask=dummy_attention_mask, - max_new_tokens=max_new_tokens, - do_sample=False, - use_cache=True, - ) - @parameterized.expand([("float16",), ("bfloat16",), ("float32",)]) @require_torch_sdpa @slow @@ -1585,149 +1447,6 @@ def test_flash_attn_2_inference_equivalence_right_padding(self): assert torch.allclose(logits_fa[:-1], logits[:-1], atol=4e-2, rtol=4e-2) - @require_flash_attn - @require_torch_gpu - @mark.flash_attn_test - @slow - # Adapted from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_generate_left_padding - def test_flash_attn_2_generate_left_padding(self): - # Ignore copy - for model_class in self.greedy_sample_model_classes: - if not model_class._supports_flash_attn_2: - self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True).to( - torch_device - ) - - dummy_input = inputs_dict[model.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - dummy_attention_mask = inputs_dict.get("attention_mask") - if dummy_attention_mask is None: - dummy_attention_mask = torch.ones_like(dummy_input) - - # make sure we do left padding - dummy_attention_mask[:, :-1] = 0 - dummy_attention_mask[:, -1:] = 1 - - out = model.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=8, do_sample=False - ) - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation={"decoder": "flash_attention_2", "audio_encoder": None, "text_encoder": None}, - low_cpu_mem_usage=True, - ).to(torch_device) - - out_fa = model.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=8, do_sample=False - ) - - self.assertTrue(torch.allclose(out, out_fa)) - - 
@require_flash_attn - @require_torch_gpu - @mark.flash_attn_test - @slow - # Adapted from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_generate_padding_right - def test_flash_attn_2_generate_padding_right(self): - # Ignore copy - for model_class in self.greedy_sample_model_classes: - if not model_class._supports_flash_attn_2: - self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True).to( - torch_device - ) - - dummy_input = inputs_dict[model.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - dummy_attention_mask = inputs_dict.get("attention_mask") - if dummy_attention_mask is None: - dummy_attention_mask = torch.ones_like(dummy_input) - # make sure we do right padding - dummy_attention_mask[:, :-1] = 1 - dummy_attention_mask[:, -1:] = 0 - - out = model.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=8, do_sample=False - ) - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation={"decoder": "flash_attention_2", "audio_encoder": None, "text_encoder": None}, - low_cpu_mem_usage=True, - ).to(torch_device) - - out_fa = model.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=8, do_sample=False - ) - - self.assertTrue(torch.allclose(out, out_fa)) - - @require_flash_attn - @require_torch_gpu - @mark.flash_attn_test - @slow - # Adapted from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_generate_use_cache - def test_flash_attn_2_generate_use_cache(self): - max_new_tokens = 30 - - # Ignore copy - for model_class in self.greedy_sample_model_classes: - if not model_class._supports_flash_attn_2: - self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - dummy_input = inputs_dict[model_class.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - # make sure that all models have enough positions for generation - if hasattr(config, "max_position_embeddings"): - config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1 - - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - - dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation={"decoder": "flash_attention_2", "audio_encoder": None, "text_encoder": None}, - low_cpu_mem_usage=True, - ).to(torch_device) - - # Just test that a large cache works as expected - _ = model.generate( - dummy_input, - attention_mask=dummy_attention_mask, - max_new_tokens=max_new_tokens, - do_sample=False, - use_cache=True, - ) - @require_torch_sdpa def test_sdpa_can_dispatch_composite_models(self): if not self.has_attentions: diff --git a/tests/models/musicgen_melody/test_modeling_musicgen_melody.py b/tests/models/musicgen_melody/test_modeling_musicgen_melody.py index f3b6be0ac652eb..957db9f23b0f21 100644 --- 
a/tests/models/musicgen_melody/test_modeling_musicgen_melody.py +++ b/tests/models/musicgen_melody/test_modeling_musicgen_melody.py @@ -1437,149 +1437,6 @@ def test_flash_attn_2_inference_equivalence_right_padding(self): assert torch.allclose(logits_fa[:-1], logits[:-1], atol=4e-2, rtol=4e-2) - @require_flash_attn - @require_torch_gpu - @mark.flash_attn_test - @slow - # Adapted from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_generate_left_padding - def test_flash_attn_2_generate_left_padding(self): - # Ignore copy - for model_class in self.greedy_sample_model_classes: - if not model_class._supports_flash_attn_2: - self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True).to( - torch_device - ) - - dummy_input = inputs_dict[model.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - dummy_attention_mask = inputs_dict.get("attention_mask") - if dummy_attention_mask is None: - dummy_attention_mask = torch.ones_like(dummy_input) - - # make sure we do left padding - dummy_attention_mask[:, :-1] = 0 - dummy_attention_mask[:, -1:] = 1 - - out = model.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=8, do_sample=False - ) - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation={"decoder": "flash_attention_2", "audio_encoder": None, "text_encoder": None}, - low_cpu_mem_usage=True, - ).to(torch_device) - - out_fa = model.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=8, do_sample=False - ) - - self.assertTrue(torch.allclose(out, out_fa)) - - @require_flash_attn - @require_torch_gpu - @mark.flash_attn_test - @slow - # Adapted from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_generate_padding_right - def test_flash_attn_2_generate_padding_right(self): - # Ignore copy - for model_class in self.greedy_sample_model_classes: - if not model_class._supports_flash_attn_2: - self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True).to( - torch_device - ) - - dummy_input = inputs_dict[model.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - dummy_attention_mask = inputs_dict.get("attention_mask") - if dummy_attention_mask is None: - dummy_attention_mask = torch.ones_like(dummy_input) - # make sure we do right padding - dummy_attention_mask[:, :-1] = 1 - dummy_attention_mask[:, -1:] = 0 - - out = model.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=8, do_sample=False - ) - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation={"decoder": "flash_attention_2", "audio_encoder": None, "text_encoder": None}, - low_cpu_mem_usage=True, - ).to(torch_device) - - out_fa = model.generate( - dummy_input, 
attention_mask=dummy_attention_mask, max_new_tokens=8, do_sample=False - ) - - self.assertTrue(torch.allclose(out, out_fa)) - - @require_flash_attn - @require_torch_gpu - @mark.flash_attn_test - @slow - # Adapted from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_generate_use_cache - def test_flash_attn_2_generate_use_cache(self): - max_new_tokens = 30 - - # Ignore copy - for model_class in self.greedy_sample_model_classes: - if not model_class._supports_flash_attn_2: - self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - dummy_input = inputs_dict[model_class.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - # make sure that all models have enough positions for generation - if hasattr(config, "max_position_embeddings"): - config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1 - - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - - dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation={"decoder": "flash_attention_2", "audio_encoder": None, "text_encoder": None}, - low_cpu_mem_usage=True, - ).to(torch_device) - - # Just test that a large cache works as expected - _ = model.generate( - dummy_input, - attention_mask=dummy_attention_mask, - max_new_tokens=max_new_tokens, - do_sample=False, - use_cache=True, - ) - @require_torch_sdpa def test_sdpa_can_dispatch_composite_models(self): if not self.has_attentions: diff --git a/tests/models/nemotron/test_modeling_nemotron.py b/tests/models/nemotron/test_modeling_nemotron.py index 13adfe1e579489..37a581a33866ce 100644 --- a/tests/models/nemotron/test_modeling_nemotron.py +++ b/tests/models/nemotron/test_modeling_nemotron.py @@ -92,8 +92,6 @@ class NemotronModelTest(GemmaModelTest): test_pruning = False fx_compatible = False - # used in `test_torch_compile` - _torch_compile_test_ckpt = "nvidia/nemotron-3-8b-base-4k-hf" # used in `test_torch_compile_for_training` _torch_compile_train_cls = NemotronForCausalLM if is_torch_available() else None diff --git a/tests/models/paligemma/test_modeling_paligemma.py b/tests/models/paligemma/test_modeling_paligemma.py index 95ae59dfc08fca..1d96b9c338fef0 100644 --- a/tests/models/paligemma/test_modeling_paligemma.py +++ b/tests/models/paligemma/test_modeling_paligemma.py @@ -346,10 +346,6 @@ def test_save_load_low_cpu_mem_usage_no_safetensors(self): def test_generate_from_inputs_embeds_with_static_cache(self): pass - @unittest.skip(reason="TODO (@joao): fix me -- failing to produce similar results") - def test_static_cache_matches_dynamic(self): - pass - @unittest.skip("FlashAttention only support fp16 and bf16 data type") def test_flash_attn_2_fp32_ln(self): pass diff --git a/tests/models/phi/test_modeling_phi.py b/tests/models/phi/test_modeling_phi.py index c17f69a499866b..eae6789bef252e 100644 --- a/tests/models/phi/test_modeling_phi.py +++ b/tests/models/phi/test_modeling_phi.py @@ -17,15 +17,11 @@ import unittest -import pytest from parameterized import parameterized from transformers import PhiConfig, is_torch_available, set_seed from transformers.testing_utils import ( - require_bitsandbytes, - require_flash_attn, require_torch, - require_torch_gpu, slow, torch_device, ) @@ -468,43 
+464,6 @@ def test_model_rope_scaling(self): torch.testing.assert_close(ntk_sin_long, original_sin_long) self.assertTrue((ntk_scaling_rope.inv_freq <= original_rope.inv_freq).all()) - @require_flash_attn - @require_torch_gpu - @require_bitsandbytes - @pytest.mark.flash_attn_test - @slow - # Copied from tests.models.llama.test_modeling_llama.LlamaModelTest.test_flash_attn_2_generate_padding_right with LlamaForCausalLM->PhiForCausalLM,LlamaTokenizer->AutoTokenizer,meta-llama/Llama-2-7b-hf->microsoft/phi-1 - def test_flash_attn_2_generate_padding_right(self): - """ - Overwritting the common test as the test is flaky on tiny models - """ - model = PhiForCausalLM.from_pretrained( - "microsoft/phi-1", - load_in_4bit=True, - device_map={"": 0}, - ) - - tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-1") - - texts = ["hi", "Hello this is a very long sentence"] - - tokenizer.padding_side = "right" - tokenizer.pad_token = tokenizer.eos_token - - inputs = tokenizer(texts, return_tensors="pt", padding=True).to(0) - - output_native = model.generate(**inputs, max_new_tokens=20, do_sample=False) - output_native = tokenizer.batch_decode(output_native) - - model = PhiForCausalLM.from_pretrained( - "microsoft/phi-1", load_in_4bit=True, device_map={"": 0}, attn_implementation="flash_attention_2" - ) - - output_fa_2 = model.generate(**inputs, max_new_tokens=20, do_sample=False) - output_fa_2 = tokenizer.batch_decode(output_fa_2) - - self.assertListEqual(output_native, output_fa_2) - @slow @require_torch diff --git a/tests/models/qwen2/test_modeling_qwen2.py b/tests/models/qwen2/test_modeling_qwen2.py index 4e57f8e0f002fb..f51dc2e0a5e26f 100644 --- a/tests/models/qwen2/test_modeling_qwen2.py +++ b/tests/models/qwen2/test_modeling_qwen2.py @@ -15,7 +15,6 @@ """Testing suite for the PyTorch Qwen2 model.""" import gc -import tempfile import unittest import pytest @@ -428,85 +427,6 @@ def test_save_load_fast_init_from_base(self): def test_past_key_values_format(self): pass - @require_flash_attn - @require_torch_gpu - @pytest.mark.flash_attn_test - @slow - def test_flash_attn_2_generate_padding_right(self): - import torch - - for model_class in self.all_generative_model_classes: - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True).to( - torch_device - ) - - dummy_input = torch.LongTensor([[0, 2, 3, 4], [0, 2, 3, 4]]).to(torch_device) - dummy_attention_mask = torch.LongTensor([[1, 1, 1, 1], [1, 1, 1, 0]]).to(torch_device) - - model.generate(dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False) - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - with self.assertRaises(ValueError): - _ = model.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False - ) - - @require_flash_attn - @require_torch_gpu - @pytest.mark.flash_attn_test - @slow - def test_flash_attn_2_generate_use_cache(self): - import torch - - max_new_tokens = 30 - - for model_class in self.all_generative_model_classes: - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - dummy_input = inputs_dict[model_class.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - 
dummy_input = dummy_input.to(torch.float16) - - # make sure that all models have enough positions for generation - if hasattr(config, "max_position_embeddings"): - config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1 - - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - - dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) - # NOTE: Qwen2 apparently does not support right padding + use_cache with FA2. - dummy_attention_mask[:, -1] = 1 - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - # Just test that a large cache works as expected - _ = model.generate( - dummy_input, - attention_mask=dummy_attention_mask, - max_new_tokens=max_new_tokens, - do_sample=False, - use_cache=True, - ) - @require_flash_attn @require_torch_gpu @pytest.mark.flash_attn_test diff --git a/tests/models/qwen2_moe/test_modeling_qwen2_moe.py b/tests/models/qwen2_moe/test_modeling_qwen2_moe.py index c545e882faeeb3..abc7b57919b083 100644 --- a/tests/models/qwen2_moe/test_modeling_qwen2_moe.py +++ b/tests/models/qwen2_moe/test_modeling_qwen2_moe.py @@ -15,7 +15,6 @@ """Testing suite for the PyTorch Qwen2MoE model.""" import gc -import tempfile import unittest import pytest @@ -453,85 +452,6 @@ def test_save_load_fast_init_from_base(self): def test_past_key_values_format(self): pass - @require_flash_attn - @require_torch_gpu - @pytest.mark.flash_attn_test - @slow - def test_flash_attn_2_generate_padding_right(self): - import torch - - for model_class in self.all_generative_model_classes: - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True).to( - torch_device - ) - - dummy_input = torch.LongTensor([[0, 2, 3, 4], [0, 2, 3, 4]]).to(torch_device) - dummy_attention_mask = torch.LongTensor([[1, 1, 1, 1], [1, 1, 1, 0]]).to(torch_device) - - model.generate(dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False) - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - with self.assertRaises(ValueError): - _ = model.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False - ) - - @require_flash_attn - @require_torch_gpu - @pytest.mark.flash_attn_test - @slow - def test_flash_attn_2_generate_use_cache(self): - import torch - - max_new_tokens = 30 - - for model_class in self.all_generative_model_classes: - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - dummy_input = inputs_dict[model_class.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - # make sure that all models have enough positions for generation - if hasattr(config, "max_position_embeddings"): - config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1 - - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - - dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) - # NOTE: Qwen2Moe 
apparently does not support right padding + use_cache with FA2. - dummy_attention_mask[:, -1] = 1 - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - # Just test that a large cache works as expected - _ = model.generate( - dummy_input, - attention_mask=dummy_attention_mask, - max_new_tokens=max_new_tokens, - do_sample=False, - use_cache=True, - ) - @require_flash_attn @require_torch_gpu @pytest.mark.flash_attn_test diff --git a/tests/models/qwen2_vl/test_modeling_qwen2_vl.py b/tests/models/qwen2_vl/test_modeling_qwen2_vl.py index e1cd715f8f1d34..a3272853a78427 100644 --- a/tests/models/qwen2_vl/test_modeling_qwen2_vl.py +++ b/tests/models/qwen2_vl/test_modeling_qwen2_vl.py @@ -301,10 +301,6 @@ def test_training_gradient_checkpointing_use_reentrant_false(self): def test_feed_forward_chunking(self): pass - @unittest.skip(reason="Generate needs input ids") - def test_inputs_embeds_matches_input_ids_with_generate(self): - pass - @unittest.skip(reason="CPU offload is not yet supported") def test_cpu_offload(self): pass diff --git a/tests/models/recurrent_gemma/test_modeling_recurrent_gemma.py b/tests/models/recurrent_gemma/test_modeling_recurrent_gemma.py index 542955f9fa4511..985115d7707b6e 100644 --- a/tests/models/recurrent_gemma/test_modeling_recurrent_gemma.py +++ b/tests/models/recurrent_gemma/test_modeling_recurrent_gemma.py @@ -420,10 +420,6 @@ def _check_hidden_states_for_generate( def test_initialization(self): pass - @unittest.skip(reason="RecurrentGemma does not support generating with input embeddings (missing position_ids)") - def test_inputs_embeds_matches_input_ids_with_generate(self): - pass - @require_torch_accelerator @slow diff --git a/tests/models/starcoder2/test_modeling_starcoder2.py b/tests/models/starcoder2/test_modeling_starcoder2.py index 32d28143d72ffa..df743f132c1140 100644 --- a/tests/models/starcoder2/test_modeling_starcoder2.py +++ b/tests/models/starcoder2/test_modeling_starcoder2.py @@ -14,7 +14,6 @@ # limitations under the License. 
"""Testing suite for the PyTorch Starcoder2 model.""" -import tempfile import unittest import pytest @@ -404,85 +403,6 @@ def test_save_load_fast_init_from_base(self): def test_past_key_values_format(self): pass - @require_flash_attn - @require_torch_gpu - @pytest.mark.flash_attn_test - @slow - def test_flash_attn_2_generate_padding_right(self): - import torch - - for model_class in self.all_generative_model_classes: - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True).to( - torch_device - ) - - dummy_input = torch.LongTensor([[0, 2, 3, 4], [0, 2, 3, 4]]).to(torch_device) - dummy_attention_mask = torch.LongTensor([[1, 1, 1, 1], [1, 1, 1, 0]]).to(torch_device) - - model.generate(dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False) - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - with self.assertRaises(ValueError): - _ = model.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False - ) - - @require_flash_attn - @require_torch_gpu - @pytest.mark.flash_attn_test - @slow - def test_flash_attn_2_generate_use_cache(self): - import torch - - max_new_tokens = 30 - - for model_class in self.all_generative_model_classes: - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - dummy_input = inputs_dict[model_class.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - # make sure that all models have enough positions for generation - if hasattr(config, "max_position_embeddings"): - config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1 - - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - - dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) - # NOTE: Starcoder2 apparently does not support right padding + use_cache with FA2. 
- dummy_attention_mask[:, -1] = 1 - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - # Just test that a large cache works as expected - _ = model.generate( - dummy_input, - attention_mask=dummy_attention_mask, - max_new_tokens=max_new_tokens, - do_sample=False, - use_cache=True, - ) - @require_flash_attn @require_torch_gpu @pytest.mark.flash_attn_test diff --git a/tests/models/t5/test_modeling_t5.py b/tests/models/t5/test_modeling_t5.py index 68dd5a52b3d69b..b03416390766d0 100644 --- a/tests/models/t5/test_modeling_t5.py +++ b/tests/models/t5/test_modeling_t5.py @@ -580,9 +580,6 @@ class T5ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, # The small T5 model needs higher percentages for CPU/MP tests model_split_percents = [0.5, 0.8, 0.9] - # used in `test_torch_compile` - _torch_compile_test_ckpt = "google-t5/t5-small" - def setUp(self): self.model_tester = T5ModelTester(self) self.config_tester = ConfigTester(self, config_class=T5Config, d_model=37) diff --git a/tests/models/umt5/test_modeling_umt5.py b/tests/models/umt5/test_modeling_umt5.py index ec4c1d019b6d17..377668851c5815 100644 --- a/tests/models/umt5/test_modeling_umt5.py +++ b/tests/models/umt5/test_modeling_umt5.py @@ -317,9 +317,6 @@ class UMT5ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin # The small UMT5 model needs higher percentages for CPU/MP tests model_split_percents = [0.5, 0.8, 0.9] - # used in `test_torch_compile` - _torch_compile_test_ckpt = "google/umt5-small" - def setUp(self): self.model_tester = UMT5ModelTester(self) diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py index b24c577a16e575..12aedaca8cf986 100644 --- a/tests/models/whisper/test_modeling_whisper.py +++ b/tests/models/whisper/test_modeling_whisper.py @@ -1574,59 +1574,6 @@ def test_generate_output_type(self, return_dict_in_generate): ) assert isinstance(pred_ids, expected_output_type) - @require_flash_attn - @require_torch_gpu - @pytest.mark.flash_attn_test - @slow - def test_flash_attn_2_generate_reuse_cache(self): - max_new_tokens = 2 - for model_class in self.all_generative_model_classes: - if not model_class._supports_flash_attn_2: - self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - dummy_input = inputs_dict[model_class.main_input_name][..., :10] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - # make sure that all models have enough positions for generation - if hasattr(config, "max_position_embeddings"): - config.max_position_embeddings = dummy_input.shape[1] * 2 + max_new_tokens * 2 + 1 - - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - # run generate once to get filled cache - output = model.generate( - dummy_input, - max_new_tokens=max_new_tokens, - do_sample=False, - use_cache=True, - return_dict_in_generate=True, - ) - past_key_values = output.past_key_values - - # Try to continue generation from where we left, given that we have more than 1 new token to process - # e.g. 
this can happen in speculative decoding when feeding candidate tokens back to target model - _ = model.generate( - dummy_input, - decoder_input_ids=output.sequences, - max_new_tokens=max_new_tokens, - do_sample=False, - use_cache=True, - past_key_values=past_key_values, - ) - def test_labels_sequence_max_length_correct(self): config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() @@ -3961,11 +3908,6 @@ def test_generate_without_input_ids(self): # generate only works with input ids for whisper pass - @unittest.skip(reason="Generate needs input ids") - def test_inputs_embeds_matches_input_ids_with_generate(self): - # generate only works with input ids for whisper - pass - @unittest.skip(reason="Decoder can't keep attention grads") def test_retain_grad_hidden_states_attentions(self): return @@ -3974,18 +3916,6 @@ def test_retain_grad_hidden_states_attentions(self): def test_save_load_fast_init_from_base(self): pass - @unittest.skip( - reason="FA2 testing suite needs to be refactored to be compatible with WhisperDecoder for that test" - ) - def test_flash_attn_2_generate_reuse_cache(self): - pass - - @unittest.skip( - "Duplicated test with WhisperModelTest + the FA2 testing suite needs to be refactored to be compatible with WhisperDecoder for that test" - ) - def test_flash_attn_2_generate_padding_right(self): - pass - @unittest.skip( "Duplicated test with WhisperModelTest + the FA2 testing suite needs to be refactored to be compatible with WhisperDecoder for that test" ) diff --git a/tests/models/zamba/test_modeling_zamba.py b/tests/models/zamba/test_modeling_zamba.py index c0a8020bedd76a..a6dd516f98a412 100644 --- a/tests/models/zamba/test_modeling_zamba.py +++ b/tests/models/zamba/test_modeling_zamba.py @@ -542,93 +542,6 @@ def test_flash_attn_2_fp32_ln(self): # with attention mask _ = model(dummy_input, attention_mask=dummy_attention_mask) - @require_flash_attn - @require_torch_gpu - @pytest.mark.flash_attn_test - @slow - def test_flash_attn_2_generate_padding_right(self): - r""" - Overriding the test_flash_attn_2_generate_padding_right test as the Zamba model, like Mixtral, doesn't support - right padding + use cache with FA2 - """ - import torch - - for model_class in self.all_generative_model_classes: - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True).to( - torch_device - ) - - dummy_input = torch.LongTensor([[0, 2, 3, 4], [0, 2, 3, 4]]).to(torch_device) - dummy_attention_mask = torch.LongTensor([[1, 1, 1, 1], [1, 1, 1, 0]]).to(torch_device) - - model.generate(dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False) - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - with self.assertRaises(ValueError): - _ = model.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False - ) - - @require_flash_attn - @require_torch_gpu - @pytest.mark.flash_attn_test - @slow - def test_flash_attn_2_generate_use_cache(self): - r""" - Overriding the test_flash_attn_2_generate_use_cache test as the Zamba model, like Mixtral, doesn't support - right padding + use cache with FA2 - """ - import torch - - max_new_tokens = 30 - - for model_class in 
self.all_generative_model_classes: - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - dummy_input = inputs_dict[model_class.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - # make sure that all models have enough positions for generation - if hasattr(config, "max_position_embeddings"): - config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1 - - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - - dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) - # NOTE: Zamba does not support right padding + use_cache with FA2. - dummy_attention_mask[:, -1] = 1 - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - # Just test that a large cache works as expected - _ = model.generate( - dummy_input, - attention_mask=dummy_attention_mask, - max_new_tokens=max_new_tokens, - do_sample=False, - use_cache=True, - ) - @require_flash_attn @require_torch_gpu @pytest.mark.flash_attn_test diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index d88b0dc5f02f83..e2719d8cf1b600 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -22,7 +22,6 @@ import random import re import tempfile -import time import warnings from collections import defaultdict from contextlib import contextmanager @@ -37,10 +36,7 @@ from transformers import ( AutoModel, AutoModelForCausalLM, - AutoModelForSeq2SeqLM, AutoModelForSequenceClassification, - AutoTokenizer, - GenerationConfig, PretrainedConfig, PreTrainedModel, is_torch_available, @@ -86,7 +82,6 @@ require_deepspeed, require_flash_attn, require_non_xpu, - require_read_token, require_safetensors, require_torch, require_torch_accelerator, @@ -3000,71 +2995,6 @@ def test_inputs_embeds_matches_input_ids(self): )[0] self.assertTrue(torch.allclose(out_embeds, out_ids)) - def test_inputs_embeds_matches_input_ids_with_generate(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - for model_class in self.all_generative_model_classes: - if model_class.__name__ not in [ - *get_values(MODEL_FOR_CAUSAL_LM_MAPPING_NAMES), - *get_values(MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES), - ]: - continue - - model = model_class(config) - model.to(torch_device) - model.eval() - - model_forward_args = inspect.signature(model.forward).parameters - if any(argument not in model_forward_args for argument in ["inputs_embeds", "position_ids"]): - self.skipTest(reason="This model doesn't use `inputs_embeds` or `position_ids`.") - has_inputs_embeds_forwarding = "inputs_embeds" in set( - inspect.signature(model.prepare_inputs_for_generation).parameters.keys() - ) - if not has_inputs_embeds_forwarding: - self.skipTest(reason="This model doesn't support `inputs_embeds` passed to `generate`.") - inputs = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class)) - pad_token_id = config.pad_token_id if config.pad_token_id is not None else 1 - - # VLMs can't generate with embeds and pixels at the same time. 
We expect the user to pass merged - # embeds already - if model_class.__name__ in get_values(MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES): - inputs.pop("pixel_values", None) - inputs.pop("pixel_values_videos", None) - inputs.pop("pixel_values_images", None) - - wte = model.get_input_embeddings() - if not self.is_encoder_decoder: - input_ids = inputs["input_ids"] - # some models infer position ids/attn mask differently when input ids - # by check if pad_token let's make sure no padding is in input ids - not_pad_token_id = pad_token_id + 1 if max(0, pad_token_id - 1) == 0 else pad_token_id - 1 - input_ids[input_ids == pad_token_id] = not_pad_token_id - del inputs["input_ids"] - inputs_embeds = wte(input_ids) - out_ids = model.generate(input_ids=input_ids, **inputs, max_new_tokens=2)[:, -2:] - out_embeds = model.generate(inputs_embeds=inputs_embeds, **inputs, max_new_tokens=2) - else: - encoder_input_ids = inputs["input_ids"] - decoder_input_ids = inputs.get("decoder_input_ids", encoder_input_ids) - encoder_input_ids[encoder_input_ids == pad_token_id] = max(0, pad_token_id + 1) - decoder_input_ids[decoder_input_ids == pad_token_id] = max(0, pad_token_id + 1) - del inputs["input_ids"] - inputs.pop("decoder_input_ids", None) - inputs_embeds = wte(encoder_input_ids) - decoder_inputs_embeds = wte(decoder_input_ids) - out_ids = model.generate( - input_ids=encoder_input_ids, decoder_input_ids=decoder_input_ids, **inputs, max_new_tokens=2 - )[:, -2:] - out_embeds = model.generate( - inputs_embeds=inputs_embeds, - decoder_inputs_embeds=decoder_inputs_embeds, - **inputs, - max_new_tokens=2, - ) - # NOTE: this test changes the order of FP ops, there may be tiny differences in the output - number_of_different_tokens = (out_ids != out_embeds).sum() - max_differences = int(out_ids.shape[0] * out_ids.shape[1] * 0.1) - self.assertTrue(number_of_different_tokens <= max_differences) # accept up to 10% mismatch - @require_non_xpu @require_torch_multi_gpu def test_multi_gpu_data_parallel_forward(self): @@ -3857,102 +3787,6 @@ def test_flash_attn_2_inference_equivalence_right_padding(self): assert torch.allclose(logits_fa[:-1], logits[:-1], atol=4e-2, rtol=4e-2) - @require_flash_attn - @require_torch_gpu - @mark.flash_attn_test - @slow - @is_flaky() - def test_flash_attn_2_generate_left_padding(self): - if not self.has_attentions: - self.skipTest(reason="Model architecture does not support attentions") - - for model_class in self.all_generative_model_classes: - if not model_class._supports_flash_attn_2: - self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True).to( - torch_device - ) - - dummy_input = inputs_dict[model.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) - # make sure we do left padding - dummy_attention_mask[:, :-1] = 0 - dummy_attention_mask[:, -1:] = 1 - - out = model.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False - ) - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - 
).to(torch_device) - - out_fa = model.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False - ) - - self.assertTrue(torch.allclose(out, out_fa)) - - @require_flash_attn - @require_torch_gpu - @mark.flash_attn_test - @is_flaky() - @slow - def test_flash_attn_2_generate_padding_right(self): - if not self.has_attentions: - self.skipTest(reason="Model architecture does not support attentions") - - for model_class in self.all_generative_model_classes: - if not model_class._supports_flash_attn_2: - self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True).to( - torch_device - ) - - dummy_input = inputs_dict[model.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) - # make sure we do right padding - dummy_attention_mask[:, :-1] = 1 - dummy_attention_mask[:, -1:] = 0 - - out = model.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False - ) - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - out_fa = model.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False - ) - - self.assertTrue(torch.allclose(out, out_fa)) - def test_attn_implementation_composite_models(self): """ Tests if composite models can receive a dict object as attn_implementation, where each key should be @@ -4525,65 +4359,6 @@ def test_sdpa_matches_eager_sliding_window(self): torch.allclose(res_eager[attention_mask == 1], res_sdpa[attention_mask == 1], rtol=1e-4, atol=1e-4) ) - @require_flash_attn - @require_torch_gpu - @mark.flash_attn_test - @slow - def test_flash_attn_2_generate_use_cache(self): - if not self.has_attentions: - self.skipTest(reason="Model architecture does not support attentions") - - max_new_tokens = 30 - - for model_class in self.all_generative_model_classes: - if not model_class._supports_flash_attn_2: - self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - dummy_input = inputs_dict[model_class.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - # make sure that all models have enough positions for generation - if hasattr(config, "max_position_embeddings"): - config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1 - - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - - dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - # Just test that a large cache works as expected - _ = model.generate( - dummy_input, - attention_mask=dummy_attention_mask, - max_new_tokens=max_new_tokens, - do_sample=False, - use_cache=True, - ) 
- - # Generate with one batch only to test generation when attention mask will be None - # when real inputs are used, because there is no padding. See issue #32237 for more - dummy_input = dummy_input[:1, ...] - dummy_attention_mask = torch.ones_like(dummy_attention_mask[:1, ...]) - _ = model.generate( - dummy_input, - attention_mask=dummy_attention_mask, - max_new_tokens=max_new_tokens, - do_sample=False, - use_cache=True, - ) - @require_flash_attn @require_torch_gpu @mark.flash_attn_test @@ -4640,62 +4415,6 @@ def test_flash_attn_2_can_dispatch_composite_models(self): if not has_fa2: raise ValueError("The FA2 model should have FA2 layers") - @require_flash_attn - @require_torch_gpu - @mark.flash_attn_test - @slow - def test_flash_attn_2_generate_reuse_cache(self): - if not self.has_attentions: - self.skipTest(reason="Model architecture does not support attentions") - - max_new_tokens = 2 - for model_class in self.all_generative_model_classes: - if not model_class._supports_flash_attn_2: - self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - dummy_input = inputs_dict[model_class.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - # make sure that all models have enough positions for generation - if hasattr(config, "max_position_embeddings"): - config.max_position_embeddings = dummy_input.shape[1] * 2 + max_new_tokens * 2 + 1 - - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - # run generate once to get filled cache - output = model.generate( - dummy_input, - max_new_tokens=max_new_tokens, - do_sample=False, - use_cache=True, - return_dict_in_generate=True, - ) - past_key_values = output.past_key_values - - # Try to continue generation from where we left, given that we have more than 1 new token to process - # e.g. this can happen in speculative decoding when feeding candidate tokens back to target model - dummy_input_updated = torch.cat([dummy_input, output.sequences], dim=-1) - _ = model.generate( - dummy_input_updated, - max_new_tokens=max_new_tokens, - do_sample=False, - use_cache=True, - past_key_values=past_key_values, - ) - @require_flash_attn @require_torch_gpu @require_bitsandbytes @@ -4999,82 +4718,6 @@ def test_custom_4d_attention_mask(self): normalized_1 = F.softmax(out_shared_prefix_last_tokens) torch.testing.assert_close(normalized_0, normalized_1, rtol=1e-3, atol=1e-4) - def test_static_cache_matches_dynamic(self): - """ - Tests that generating with static cache give almost same results as with dynamic cache. - This test does not compile the model and check only logits similarity for numerical precision - errors. 
- """ - if len(self.all_generative_model_classes) == 0: - self.skipTest( - reason="Model architecture has no generative classes, and thus not necessarily supporting 4D masks" - ) - for model_class in self.all_generative_model_classes: - if not model_class._supports_static_cache: - self.skipTest(f"{model_class.__name__} does not support static cache") - - if not model_class._supports_cache_class: - self.skipTest(f"{model_class.__name__} does not support cache class") - - config, inputs = self.model_tester.prepare_config_and_inputs_for_common() - if getattr(config, "sliding_window", 0) is not None and getattr(config, "sliding_window", 0) > 0: - self.skipTest(f"{model_class.__name__} with sliding window attention is not supported by this test") - - model = model_class(config).to(device=torch_device, dtype=torch.float32) - model.eval() - - dynamic_out = model.generate( - **inputs, do_sample=False, max_new_tokens=10, output_logits=True, return_dict_in_generate=True - ) - static_out = model.generate( - **inputs, - do_sample=False, - max_new_tokens=10, - cache_implementation="static", - output_logits=True, - return_dict_in_generate=True, - ) - self.assertTrue(torch.allclose(dynamic_out.logits[0], static_out.logits[0], rtol=1e-3, atol=1e-4)) - - # For now, Let's focus only on GPU for `torch.compile` - @slow - @require_torch_accelerator - @require_read_token - def test_torch_compile(self): - if version.parse(torch.__version__) < version.parse("2.3"): - self.skipTest(reason="This test requires torch >= 2.3 to run.") - torch.compiler.reset() - if not hasattr(self, "_torch_compile_test_ckpt"): - self.skipTest(f"{self.__class__.__name__} doesn't have the attribute `_torch_compile_test_ckpt`.") - ckpt = self._torch_compile_test_ckpt - revision = "main" if not hasattr(self, "_torch_compile_test_revision") else self._torch_compile_test_revision - - os.environ["TOKENIZERS_PARALLELISM"] = "false" - - batch_size = 1 - n_iter = 3 - - tokenizer = AutoTokenizer.from_pretrained(ckpt) - if self.is_encoder_decoder: - model = AutoModelForSeq2SeqLM.from_pretrained(ckpt, torch_dtype=torch.float16, revision=revision).to( - torch_device - ) - else: - model = AutoModelForCausalLM.from_pretrained(ckpt, torch_dtype=torch.float16, revision=revision).to( - torch_device - ) - - model.generation_config.max_new_tokens = 4 - - model.generation_config.cache_implementation = "static" - model.forward = torch.compile(model.forward, mode="reduce-overhead") - - input_text = "Why dogs are cute?" - input_ids = tokenizer([input_text] * batch_size, return_tensors="pt").to(torch_device) - - for i in range(n_iter): - _ = model.generate(**input_ids, do_sample=False) - @slow @require_torch_gpu def test_torch_compile_for_training(self): @@ -5118,74 +4761,6 @@ def test_torch_compile_for_training(self): for name, param in model._orig_mod.named_parameters(): torch.testing.assert_close(param.grad.detach().cpu(), params[name], rtol=1e-4, atol=1e-4) - @slow - @require_torch_gpu # Testing cuda graphs. - @require_read_token - def test_compile_cuda_graph_time(self): - if version.parse(torch.__version__) < version.parse("2.3"): - self.skipTest(reason="This test requires torch >= 2.3 to run.") - - # TODO felix: All models supporting `StaticCache` or `torch.compile` should be tested. - # At the moment, only llama, gemma and gemma2 are tested here! 
- if not hasattr(self, "_torch_compile_test_ckpt"): - self.skipTest(f"{self.__class__.__name__} doesn't have the attribute `_torch_compile_test_ckpt`.") - ckpt = self._torch_compile_test_ckpt - revision = "main" if not hasattr(self, "_torch_compile_test_revision") else self._torch_compile_test_revision - - os.environ["TOKENIZERS_PARALLELISM"] = "false" - - tokenizer = AutoTokenizer.from_pretrained(ckpt) - if self.is_encoder_decoder: - model = AutoModelForSeq2SeqLM.from_pretrained(ckpt, torch_dtype=torch.float16, revision=revision).to( - torch_device - ) - else: - model = AutoModelForCausalLM.from_pretrained(ckpt, torch_dtype=torch.float16, revision=revision).to( - torch_device - ) - - cache_implementation = "static" - if model.config.model_type == "gemma2": - cache_implementation = "hybrid" - - new_tokens = 50 - gen_config = GenerationConfig( - max_new_tokens=new_tokens, - min_new_tokens=new_tokens, - use_cache=True, - pad_token_id=tokenizer.pad_token_id, - num_beams=1, - do_sample=False, - eos_token_id=None, # This is required for min_new_tokens to actually have an effect. - ) - model.generation_config.eos_token_id = None # greedy_search falls back on this eos_token_id that we need to set to None as well for min_new_tokens to have an effect. - - model.forward = torch.compile(model.forward, mode="reduce-overhead", fullgraph=True) - - inp = tokenizer("Why cats are cute?", return_tensors="pt").to(torch_device) - - # First run: the first run warms up each graph, which does things like CuBlas or Triton benchmarking - start = time.perf_counter() - _ = model.generate(**inp, generation_config=gen_config, cache_implementation=cache_implementation) - end = time.perf_counter() - graph_warmup_time = end - start - - # Second run: CUDA Graph recording, and replays it - start = time.perf_counter() - _ = model.generate(**inp, generation_config=gen_config, cache_implementation=cache_implementation) - end = time.perf_counter() - record_time = end - start - - # Finally: we hit the optimized, CUDA Graph replay path - start = time.perf_counter() - _ = model.generate(**inp, generation_config=gen_config, cache_implementation=cache_implementation) - end = time.perf_counter() - opt_time = end - start - - # For the recording step, we expect only two cuda graphs and this step should be much faster than the first. 
- self.assertTrue(record_time < 0.15 * graph_warmup_time) - self.assertTrue(opt_time < record_time) - def test_forward_with_num_logits_to_keep(self): for model_class in self.all_generative_model_classes: if "num_logits_to_keep" not in set(inspect.signature(model_class.forward).parameters.keys()): From 241d79026f1030124dbb957de936b3d617b621f2 Mon Sep 17 00:00:00 2001 From: Pablo Montalvo <39954772+molbap@users.noreply.github.com> Date: Wed, 30 Oct 2024 14:17:20 +0100 Subject: [PATCH 154/385] fix pixtral processor (#34486) * fix pixtral processor * test out full length batches + remove undue ValueError * fix up processing * fix tests * fix * last fixup * style * [run-slow] pixtral * [run-slow] pixtral * fix config key * skip torchscript tests * [run-slow] pixtral * add missing key * [run-slow] pixtral * fix docs * [run-slow] pixtral * fix wrong url for integration test * [run-slow] pixtral * pixtralVisionModel does not have a lm head * [run-slow] pixtral --- .../models/pixtral/configuration_pixtral.py | 4 ++ .../models/pixtral/modeling_pixtral.py | 2 +- .../models/pixtral/processing_pixtral.py | 15 +++---- tests/models/pixtral/test_modeling_pixtral.py | 41 +------------------ .../models/pixtral/test_processor_pixtral.py | 21 +++++++++- 5 files changed, 35 insertions(+), 48 deletions(-) diff --git a/src/transformers/models/pixtral/configuration_pixtral.py b/src/transformers/models/pixtral/configuration_pixtral.py index 32325a929411ba..14db51b947e664 100644 --- a/src/transformers/models/pixtral/configuration_pixtral.py +++ b/src/transformers/models/pixtral/configuration_pixtral.py @@ -52,6 +52,8 @@ class PixtralVisionConfig(PretrainedConfig): Dropout probability for the attention layers. rope_theta (`float`, *optional*, defaults to 10000.0): The base period of the RoPE embeddings. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. 
Example: @@ -82,6 +84,7 @@ def __init__( hidden_act="gelu", attention_dropout=0.0, rope_theta=10000.0, + initializer_range=0.02, **kwargs, ): super().__init__(**kwargs) @@ -97,3 +100,4 @@ def __init__( self.hidden_act = hidden_act self.rope_theta = rope_theta self.head_dim = hidden_size // num_attention_heads + self.initializer_range = initializer_range diff --git a/src/transformers/models/pixtral/modeling_pixtral.py b/src/transformers/models/pixtral/modeling_pixtral.py index 06b9701a75661a..b65fbd634ba789 100644 --- a/src/transformers/models/pixtral/modeling_pixtral.py +++ b/src/transformers/models/pixtral/modeling_pixtral.py @@ -407,7 +407,7 @@ def _init_weights(self, module): std = ( self.config.initializer_range if hasattr(self.config, "initializer_range") - else self.config.text_config.initializer_range + else self.config.initializer_range ) if isinstance(module, (nn.Linear, nn.Conv2d)): diff --git a/src/transformers/models/pixtral/processing_pixtral.py b/src/transformers/models/pixtral/processing_pixtral.py index 70d28fb7b79c93..5913e8688d00be 100644 --- a/src/transformers/models/pixtral/processing_pixtral.py +++ b/src/transformers/models/pixtral/processing_pixtral.py @@ -206,14 +206,15 @@ def __call__( if is_image_or_image_url(images): images = [[images]] elif isinstance(images, list) and is_image_or_image_url(images[0]): - images = [images] - elif ( - not isinstance(images, list) - and not isinstance(images[0], list) - and not is_image_or_image_url(images[0][0]) - ): + if isinstance(text, list): + images = [[im] for im in images] + else: + images = [images] + elif isinstance(images, list) and isinstance(images[0], list) and is_image_or_image_url(images[0][0]): + pass + else: raise ValueError( - "Invalid input images. Please provide a single image or a list of images or a list of list of images." + "Invalid input images. Please provide a single image, a list of images, or a list of lists of images." ) images = [[load_image(im) for im in sample] for sample in images] image_inputs = self.image_processor(images, patch_size=self.patch_size, **output_kwargs["images_kwargs"]) diff --git a/tests/models/pixtral/test_modeling_pixtral.py b/tests/models/pixtral/test_modeling_pixtral.py index 9a128f6ad28823..0c36cb5a4e0554 100644 --- a/tests/models/pixtral/test_modeling_pixtral.py +++ b/tests/models/pixtral/test_modeling_pixtral.py @@ -14,22 +14,16 @@ # limitations under the License. 
"""Testing suite for the PyTorch Pixtral model.""" -import gc import unittest -import requests - from transformers import ( - AutoProcessor, PixtralVisionConfig, PixtralVisionModel, is_torch_available, is_vision_available, ) from transformers.testing_utils import ( - require_bitsandbytes, require_torch, - slow, torch_device, ) @@ -43,7 +37,7 @@ is_torch_greater_or_equal_than_2_0 = False if is_vision_available(): - from PIL import Image + pass class PixtralVisionModelTester: @@ -148,6 +142,7 @@ class PixtralVisionModelModelTest(ModelTesterMixin, unittest.TestCase): all_model_classes = (PixtralVisionModel,) if is_torch_available() else () test_pruning = False test_head_masking = False + test_torchscript = False def setUp(self): self.model_tester = PixtralVisionModelTester(self) @@ -258,35 +253,3 @@ def test_disk_offload_safetensors(self): @unittest.skip(reason="Not supported yet") def test_determinism(self): pass - - -@require_torch -class PixtralVisionModelIntegrationTest(unittest.TestCase): - def setUp(self): - self.processor = AutoProcessor.from_pretrained("hf-internal-testing/pixtral-12b") - - def tearDown(self): - gc.collect() - torch.cuda.empty_cache() - - @slow - @require_bitsandbytes - def test_small_model_integration_test(self): - # Let' s make sure we test the preprocessing to replace what is used - model = PixtralVisionModel.from_pretrained("hf-internal-testing/pixtral-12b", load_in_4bit=True) - - prompt = "[INST][IMG]\nWhat are the things I should be cautious about when I visit this place?[/INST]" - image_file = "https://pixtral-vl.github.io/static/images/view.jpg" - raw_image = Image.open(requests.get(image_file, stream=True).raw) - inputs = self.processor(prompt, raw_image, return_tensors="pt") - - EXPECTED_INPUT_IDS = torch.tensor([[1, 32000, 28705, 13, 11123, 28747, 1824, 460, 272, 1722,315, 1023, 347, 13831, 925, 684, 739, 315, 3251, 456,1633, 28804, 13, 4816, 8048, 12738, 28747]]) # fmt: skip - self.assertTrue(torch.equal(inputs["input_ids"], EXPECTED_INPUT_IDS)) - - output = model.generate(**inputs, max_new_tokens=20) - EXPECTED_DECODED_TEXT = "\nUSER: What are the things I should be cautious about when I visit this place?\nASSISTANT: When visiting this place, there are a few things one should be cautious about. Firstly," # fmt: skip - - self.assertEqual( - self.processor.decode(output[0], skip_special_tokens=True), - EXPECTED_DECODED_TEXT, - ) diff --git a/tests/models/pixtral/test_processor_pixtral.py b/tests/models/pixtral/test_processor_pixtral.py index 8cdbf93c6476b8..c3496dff3cdf81 100644 --- a/tests/models/pixtral/test_processor_pixtral.py +++ b/tests/models/pixtral/test_processor_pixtral.py @@ -171,7 +171,7 @@ def test_processor_with_multiple_images_single_list(self): input_ids[0].tolist(), # Equivalent to ["USER: [IMG][IMG][IMG_BREAK][IMG][IMG][IMG_END][IMG][IMG][IMG_BREAK][IMG][IMG][IMG_END]\nWhat's the difference between these two images? ASSISTANT:"] [21510, 1058, 1032, 10, 10, 12, 10, 10, 13, 10, 10, 12, 10, 10, 13, 1010, 7493, 1681, 1278, 6592, 2396, 2576, 2295, 8061, 1063, 1349, 4290, 16002, 41150, 1058] - ) + ) # fmt: on # Test passing in a url @@ -246,6 +246,25 @@ def test_processor_with_multiple_images_multiple_lists(self): ) # fmt: on + def test_processor_returns_full_length_batches(self): + # to avoid https://github.com/huggingface/transformers/issues/34204 + processor = self.processor_class.from_pretrained(self.tmpdirname) + prompt_string = [ + "USER: [IMG]\nWhat's the content of the image? 
ASSISTANT:", + ] * 5 + processor.tokenizer.pad_token = "" + image_inputs = [self.image_0] * 5 + + # Make small for checking image token expansion + processor.image_processor.size = {"longest_edge": 30} + processor.image_processor.patch_size = {"height": 2, "width": 2} + + # Test passing in an image + inputs_image = processor(text=prompt_string, images=image_inputs, return_tensors="pt", padding=True) + self.assertIn("input_ids", inputs_image) + self.assertTrue(len(inputs_image["input_ids"]) == 5) + self.assertTrue(len(inputs_image["pixel_values"]) == 5) + # Override as PixtralProcessor needs nested images to work properly with batched inputs @require_vision def prepare_image_inputs(self, batch_size: Optional[int] = None): From eab6c491d439e83d5e31c660df6f7e36592eb0a2 Mon Sep 17 00:00:00 2001 From: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Date: Wed, 30 Oct 2024 14:54:10 +0100 Subject: [PATCH 155/385] Use torch 2.5 in scheduled CI (#34465) * torch 2.5 * try --------- Co-authored-by: ydshieh --- docker/transformers-all-latest-gpu/Dockerfile | 2 +- docker/transformers-pytorch-gpu/Dockerfile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/transformers-all-latest-gpu/Dockerfile b/docker/transformers-all-latest-gpu/Dockerfile index 7ad4e96d62cde7..b597f5a73fb5be 100644 --- a/docker/transformers-all-latest-gpu/Dockerfile +++ b/docker/transformers-all-latest-gpu/Dockerfile @@ -9,7 +9,7 @@ SHELL ["sh", "-lc"] # The following `ARG` are mainly used to specify the versions explicitly & directly in this docker file, and not meant # to be used as arguments for docker build (so far). -ARG PYTORCH='2.4.0' +ARG PYTORCH='2.5.1' # (not always a valid torch version) ARG INTEL_TORCH_EXT='2.3.0' # Example: `cu102`, `cu113`, etc. diff --git a/docker/transformers-pytorch-gpu/Dockerfile b/docker/transformers-pytorch-gpu/Dockerfile index 62578ad0f3610f..f22d77b9372d7e 100644 --- a/docker/transformers-pytorch-gpu/Dockerfile +++ b/docker/transformers-pytorch-gpu/Dockerfile @@ -11,7 +11,7 @@ ARG REF=main RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF # If set to nothing, will install the latest version -ARG PYTORCH='2.4.0' +ARG PYTORCH='2.5.1' ARG TORCH_VISION='' ARG TORCH_AUDIO='' # Example: `cu102`, `cu113`, etc. 
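Before moving on, the processor change in PATCH 154/385 above is easiest to see with a small sketch of the case the new `test_processor_returns_full_length_batches` test covers: a batch of prompts paired with one image per prompt. The checkpoint id, prompt wording, batch size, and pad-token handling below are illustrative, not taken from the patch:

```python
import requests
from PIL import Image
from transformers import AutoProcessor

# Illustrative checkpoint id; any Pixtral checkpoint with a saved processor should behave the same way.
processor = AutoProcessor.from_pretrained("mistral-community/pixtral-12b")
if processor.tokenizer.pad_token is None:
    processor.tokenizer.pad_token = processor.tokenizer.eos_token  # padding requires a pad token

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

# One prompt per sample and one image per prompt: with the fix above, each image is wrapped
# as its own nested list, so the output keeps the full batch size instead of collapsing it.
prompts = ["USER: [IMG]\nWhat's the content of the image? ASSISTANT:"] * 3
inputs = processor(text=prompts, images=[image] * 3, padding=True, return_tensors="pt")

print(inputs["input_ids"].shape)    # (3, sequence_length)
print(len(inputs["pixel_values"]))  # 3
```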
From 5251fe6271bec670f71a6c1a86f4a2049fb03a90 Mon Sep 17 00:00:00 2001 From: Vladislav Bronzov <58587565+VladOS95-cyber@users.noreply.github.com> Date: Wed, 30 Oct 2024 16:52:17 +0100 Subject: [PATCH 156/385] Add GGUF for Mamba (#34200) * add mamba architecture for gguf * add logic for weights conversion, some fixes and refactoring * add lm_head layers, unit test refactoring * more fixes for tests * remove lm_head creation * remove unused comments --- docs/source/en/gguf.md | 1 + src/transformers/integrations/ggml.py | 25 +++++++++ .../modeling_gguf_pytorch_utils.py | 13 +++++ tests/quantization/ggml/test_ggml.py | 56 ++++++++++++++++++- 4 files changed, 93 insertions(+), 2 deletions(-) diff --git a/docs/source/en/gguf.md b/docs/source/en/gguf.md index 20531b990bc341..2da721b28986af 100644 --- a/docs/source/en/gguf.md +++ b/docs/source/en/gguf.md @@ -86,6 +86,7 @@ For now the supported model architectures are the architectures that have been v - GPT2 - Starcoder2 - T5 +- Mamba ## Example usage diff --git a/src/transformers/integrations/ggml.py b/src/transformers/integrations/ggml.py index 4a2740fcb30e1c..f4545f2698c017 100644 --- a/src/transformers/integrations/ggml.py +++ b/src/transformers/integrations/ggml.py @@ -235,6 +235,19 @@ "output.weight": "lm_head.weight", "output_norm": "model.norm", }, + "mamba": { + "token_embd": "backbone.embeddings", + "blk": "backbone.layers", + "ssm_a": "mixer.A_log", + "ssm_conv1d": "mixer.conv1d", + "ssm_in": "mixer.in_proj", + "ssm_out": "mixer.out_proj", + "ssm_x": "mixer.x_proj", + "ssm_dt": "mixer.dt_proj", + "attn_norm": "norm", + "output_norm": "backbone.norm_f", + "output.weight": "lm_head.weight", + }, } @@ -373,6 +386,17 @@ "attention.head_count_kv": "num_key_value_heads", "attention.layer_norm_epsilon": "norm_epsilon", }, + "mamba": { + "vocab_size": "vocab_size", + "context_length": "max_position_embeddings", + "embedding_length": "hidden_size", + "attention.layer_norm_rms_epsilon": "layer_norm_epsilon", + "block_count": "num_hidden_layers", + "ssm.conv_kernel": "conv_kernel", + "ssm.state_size": "state_size", + "ssm.time_step_rank": "time_step_rank", + "ssm.inner_size": "intermediate_size", + }, } GGUF_TOKENIZER_MAPPING = { @@ -768,6 +792,7 @@ def converted(self) -> Tokenizer: "gpt2": GGUFGPTConverter, "starcoder2": GGUFGPTConverter, "t5": GGUFT5Converter, + "mamba": GGUFGPTConverter, } diff --git a/src/transformers/modeling_gguf_pytorch_utils.py b/src/transformers/modeling_gguf_pytorch_utils.py index 171b2f4d15b122..c784ca0eb4ca2c 100644 --- a/src/transformers/modeling_gguf_pytorch_utils.py +++ b/src/transformers/modeling_gguf_pytorch_utils.py @@ -220,6 +220,19 @@ def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False): name = "lm_head.weight" parsed_parameters["tensors"][name] = torch.from_numpy(np.copy(weights)) continue + if architecture == "mamba": + if "ssm_d" in name and "bias" not in name and "weight" not in name: + # ssm_d has conflicts with ssm_dt in name checking + # we have to explicitly check that name is exactly ssm_d + name = name.replace("ssm_d", "mixer.D") + if "ssm_conv1d.weight" in name: + # for compatibility tensor ssm_conv1d must be (5120, 1, 4]) dim, + # quantized one is (5120, 4) + weights = np.expand_dims(weights, axis=1) + if "ssm_a" in name: + # Original exponential implementation + # https://github.com/ggerganov/llama.cpp/blob/master/convert_hf_to_gguf.py#L2975-L2977 + weights = np.log(-weights) for tensor_name in tensor_key_mapping: if tensor_name.format(bid=bid) in name: diff --git 
a/tests/quantization/ggml/test_ggml.py b/tests/quantization/ggml/test_ggml.py index ddc791e96a6489..da1af9bff8df90 100644 --- a/tests/quantization/ggml/test_ggml.py +++ b/tests/quantization/ggml/test_ggml.py @@ -59,6 +59,8 @@ class GgufIntegrationTests(unittest.TestCase): starcoder2_model_id = "QuantFactory/starcoder2-3b-GGUF" starcoder2_fp16_model_id = "brittlewis12/starcoder2-3b-GGUF" starcoder2_original_model_id = "bigcode/starcoder2-3b" + mamba_original_model_id = "state-spaces/mamba-2.8b-hf" + mamba_model_id = "jpodivin/mamba-2.8b-hf-GGUF" # standard quants q4_0_gguf_model_id = "tinyllama-1.1b-chat-v1.0.Q4_0.gguf" @@ -102,6 +104,8 @@ class GgufIntegrationTests(unittest.TestCase): q6_k_gpt2_xl_model_id = "gpt2-xl.Q6_K.gguf" q6_k_starcoder2_model_id = "starcoder2-3b.Q6_K.gguf" fp16_starcoder2_gguf_model_id = "starcoder2-3b.fp16.gguf" + q6_k_mamba_model_id = "ggml-model-Q6_K.gguf" + fp16_mamba_model_id = "ggml-model-f16.gguf" example_text = "Hello" @@ -573,6 +577,8 @@ def test_gpt2_weights_conversion_fp16(self): if layer_name in quantized_state_dict: self.assertTrue(original_params.shape == quantized_state_dict[layer_name].shape) torch.testing.assert_close(original_params, quantized_state_dict[layer_name]) + else: + raise ValueError(f"Layer {layer_name} is not presented in GGUF model") def test_gpt2_xl_Q6_K(self): tokenizer = AutoTokenizer.from_pretrained(self.gpt2_xl_model_id, gguf_file=self.q6_k_gpt2_xl_model_id) @@ -639,6 +645,8 @@ def test_falcon7b_weights_conversion_fp16(self): if layer_name in quantized_state_dict: self.assertTrue(original_params.shape == quantized_state_dict[layer_name].shape) torch.testing.assert_close(original_params, quantized_state_dict[layer_name]) + else: + raise ValueError(f"Layer {layer_name} is not presented in GGUF model") def test_stablelm_q4_k_m(self): model = AutoModelForCausalLM.from_pretrained( @@ -708,6 +716,8 @@ def test_stablelm_weights_conversion_fp16(self): if layer_name in converted_state_dict: self.assertTrue(original_params.shape == converted_state_dict[layer_name].shape) torch.testing.assert_close(original_params, converted_state_dict[layer_name]) + else: + raise ValueError(f"Layer {layer_name} is not presented in GGUF model") def test_starcoder2_weights_conversion_fp16(self): original_model = AutoModelForCausalLM.from_pretrained( @@ -727,10 +737,11 @@ def test_starcoder2_weights_conversion_fp16(self): original_state_dict = original_model.state_dict() for layer_name, original_params in original_state_dict.items(): - if layer_name in converted_state_dict and layer_name != "lm_head.weight": - # quantized models do not contain "lm_head.weight" layer + if layer_name in converted_state_dict: self.assertTrue(original_params.shape == converted_state_dict[layer_name].shape) torch.testing.assert_close(original_params, converted_state_dict[layer_name]) + else: + raise ValueError(f"Layer {layer_name} is not presented in GGUF model") def test_starcoder2_q6_k(self): example_function_text = "def print_hello_world():" @@ -748,6 +759,47 @@ def test_starcoder2_q6_k(self): EXPECTED_TEXT = 'def print_hello_world():\n print("Hello World")\n\ndef print' self.assertEqual(tokenizer.decode(out[0], skip_special_tokens=True), EXPECTED_TEXT) + def test_mamba_weights_conversion_fp16(self): + original_model = AutoModelForCausalLM.from_pretrained( + self.mamba_original_model_id, + torch_dtype=torch.float16, + ) + + converted_model = AutoModelForCausalLM.from_pretrained( + self.mamba_model_id, + gguf_file=self.fp16_mamba_model_id, + torch_dtype=torch.float16, + ) + + 
converted_state_dict = converted_model.state_dict() + original_state_dict = original_model.state_dict() + + for layer_name, original_params in original_state_dict.items(): + if layer_name in converted_state_dict: + self.assertTrue(original_params.shape == converted_state_dict[layer_name].shape) + if "mixer.A_log" in layer_name: + # we should increase tolerance after exponential reversing + # and performing np.log(-weights) operation as numbers are slightly different + torch.testing.assert_close(original_params, converted_state_dict[layer_name], atol=1e-3, rtol=1e-3) + else: + torch.testing.assert_close(original_params, converted_state_dict[layer_name]) + else: + raise ValueError(f"Layer {layer_name} is not presented in GGUF model") + + def test_mamba_q6_k(self): + model = AutoModelForCausalLM.from_pretrained( + self.mamba_model_id, + gguf_file=self.q6_k_mamba_model_id, + torch_dtype=torch.float16, + ) + + tokenizer = AutoTokenizer.from_pretrained(self.mamba_model_id, gguf_file=self.q6_k_mamba_model_id) + text = tokenizer(self.example_text, return_tensors="pt")["input_ids"] + out = model.generate(text, max_new_tokens=10) + + EXPECTED_TEXT = "Hello,I answerthe question.\n\nA" + self.assertEqual(tokenizer.decode(out[0], skip_special_tokens=True), EXPECTED_TEXT) + def test_tokenization_xnli(self): import tqdm from datasets import load_dataset From 9f06fb05059a973048f5865e7e385c9db5d6daa4 Mon Sep 17 00:00:00 2001 From: fzyzcjy <5236035+fzyzcjy@users.noreply.github.com> Date: Wed, 30 Oct 2024 23:55:16 +0800 Subject: [PATCH 157/385] Fix super tiny extra space typo (#34440) Update training_args.py --- src/transformers/training_args.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py index 3e5c6cc2f37428..0653c8a2cb7bf0 100644 --- a/src/transformers/training_args.py +++ b/src/transformers/training_args.py @@ -1030,7 +1030,7 @@ class TrainingArguments: use_cpu: bool = field( default=False, metadata={ - "help": " Whether or not to use cpu. If set to False, we will use cuda/tpu/mps/npu device if available." + "help": "Whether or not to use cpu. If set to False, we will use cuda/tpu/mps/npu device if available." 
}, ) use_mps_device: bool = field( From 48872fd6ae336fbde6fac7706910a9a4bc48210e Mon Sep 17 00:00:00 2001 From: Yoni Gozlan <74535834+yonigozlan@users.noreply.github.com> Date: Wed, 30 Oct 2024 13:49:47 -0400 Subject: [PATCH 158/385] Add Image Processor Fast RT-DETR (#34354) * add fast image processor rtdetr * add gpu/cpu test and fix docstring * remove prints * add to doc * nit docstring * avoid iterating over images/annotations several times * change torch typing * Add image processor fast documentation --- .../source/en/main_classes/image_processor.md | 43 + docs/source/en/model_doc/rt_detr.md | 8 +- src/transformers/__init__.py | 4 +- .../image_processing_utils_fast.py | 67 +- .../models/auto/image_processing_auto.py | 2 +- .../models/detr/image_processing_detr_fast.py | 208 ++--- src/transformers/models/rt_detr/__init__.py | 2 + .../rt_detr/image_processing_rt_detr.py | 17 +- .../rt_detr/image_processing_rt_detr_fast.py | 798 ++++++++++++++++++ .../utils/dummy_vision_objects.py | 7 + .../models/detr/test_image_processing_detr.py | 4 +- .../rt_detr/test_image_processing_rt_detr.py | 424 ++++++---- 12 files changed, 1259 insertions(+), 325 deletions(-) create mode 100644 src/transformers/models/rt_detr/image_processing_rt_detr_fast.py diff --git a/docs/source/en/main_classes/image_processor.md b/docs/source/en/main_classes/image_processor.md index 59a78e68214d6d..320916f1ce9421 100644 --- a/docs/source/en/main_classes/image_processor.md +++ b/docs/source/en/main_classes/image_processor.md @@ -18,6 +18,49 @@ rendered properly in your Markdown viewer. An image processor is in charge of preparing input features for vision models and post processing their outputs. This includes transformations such as resizing, normalization, and conversion to PyTorch, TensorFlow, Flax and Numpy tensors. It may also include model specific post-processing such as converting logits to segmentation masks. +Fast image processors are available for a few models and more will be added in the future. They are based on the [torchvision](https://pytorch.org/vision/stable/index.html) library and provide a significant speed-up, especially when processing on GPU. +They have the same API as the base image processors and can be used as drop-in replacements. +To use a fast image processor, you need to install the `torchvision` library, and set the `use_fast` argument to `True` when instantiating the image processor: + +```python +from transformers import AutoImageProcessor + +processor = AutoImageProcessor.from_pretrained("facebook/detr-resnet-50", use_fast=True) +``` + +When using a fast image processor, you can also set the `device` argument to specify the device on which the processing should be done. By default, the processing is done on the same device as the inputs if the inputs are tensors, or on the CPU otherwise. + +```python +from torchvision.io import read_image +from transformers import DetrImageProcessorFast + +images = read_image("image.jpg") +processor = DetrImageProcessorFast.from_pretrained("facebook/detr-resnet-50") +images_processed = processor(images, return_tensors="pt", device="cuda") +``` + +Here are some speed comparisons between the base and fast image processors for the `DETR` and `RT-DETR` models, and how they impact overall inference time: + +
+<!-- Benchmark plots comparing the base and fast image processors for DETR and RT-DETR are embedded here in the rendered documentation. -->
+
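The plots referenced above are embedded as images in the rendered page. A rough way to reproduce the comparison locally is sketched below; the image path, batch size, and single-run timing are illustrative and are not the harness used for the published numbers:

```python
import time

import torch
from PIL import Image
from transformers import DetrImageProcessor, DetrImageProcessorFast

image = Image.open("image.jpg")  # any local test image
batch = [image] * 8
device = "cuda" if torch.cuda.is_available() else "cpu"

slow_processor = DetrImageProcessor.from_pretrained("facebook/detr-resnet-50")
fast_processor = DetrImageProcessorFast.from_pretrained("facebook/detr-resnet-50")

start = time.perf_counter()
_ = slow_processor(batch, return_tensors="pt")
print(f"base image processor: {time.perf_counter() - start:.3f}s")

start = time.perf_counter()
_ = fast_processor(batch, return_tensors="pt", device=device)
if device == "cuda":
    torch.cuda.synchronize()  # GPU processing is asynchronous; wait before stopping the timer
print(f"fast image processor: {time.perf_counter() - start:.3f}s")
```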
+ +These benchmarks were run on an [AWS EC2 g5.2xlarge instance](https://aws.amazon.com/ec2/instance-types/g5/), utilizing an NVIDIA A10G Tensor Core GPU. + ## ImageProcessingMixin diff --git a/docs/source/en/model_doc/rt_detr.md b/docs/source/en/model_doc/rt_detr.md index 5540266c6215de..8ad220dc4bd113 100644 --- a/docs/source/en/model_doc/rt_detr.md +++ b/docs/source/en/model_doc/rt_detr.md @@ -46,7 +46,7 @@ Initially, an image is processed using a pre-trained convolutional neural networ >>> from PIL import Image >>> from transformers import RTDetrForObjectDetection, RTDetrImageProcessor ->>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg' +>>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg' >>> image = Image.open(requests.get(url, stream=True).raw) >>> image_processor = RTDetrImageProcessor.from_pretrained("PekingU/rtdetr_r50vd") @@ -95,6 +95,12 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h - preprocess - post_process_object_detection +## RTDetrImageProcessorFast + +[[autodoc]] RTDetrImageProcessorFast + - preprocess + - post_process_object_detection + ## RTDetrModel [[autodoc]] RTDetrModel diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index cc8b07395024a8..e6789c77fb825a 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -1228,7 +1228,7 @@ _import_structure["models.poolformer"].extend(["PoolFormerFeatureExtractor", "PoolFormerImageProcessor"]) _import_structure["models.pvt"].extend(["PvtImageProcessor"]) _import_structure["models.qwen2_vl"].extend(["Qwen2VLImageProcessor"]) - _import_structure["models.rt_detr"].extend(["RTDetrImageProcessor"]) + _import_structure["models.rt_detr"].extend(["RTDetrImageProcessor", "RTDetrImageProcessorFast"]) _import_structure["models.sam"].extend(["SamImageProcessor"]) _import_structure["models.segformer"].extend(["SegformerFeatureExtractor", "SegformerImageProcessor"]) _import_structure["models.seggpt"].extend(["SegGptImageProcessor"]) @@ -6152,7 +6152,7 @@ ) from .models.pvt import PvtImageProcessor from .models.qwen2_vl import Qwen2VLImageProcessor - from .models.rt_detr import RTDetrImageProcessor + from .models.rt_detr import RTDetrImageProcessor, RTDetrImageProcessorFast from .models.sam import SamImageProcessor from .models.segformer import SegformerFeatureExtractor, SegformerImageProcessor from .models.seggpt import SegGptImageProcessor diff --git a/src/transformers/image_processing_utils_fast.py b/src/transformers/image_processing_utils_fast.py index d1a08132d73d89..3c1be325b7eb30 100644 --- a/src/transformers/image_processing_utils_fast.py +++ b/src/transformers/image_processing_utils_fast.py @@ -15,14 +15,18 @@ import functools from dataclasses import dataclass +from typing import Any, Iterable, List, Optional, Tuple from .image_processing_utils import BaseImageProcessor -from .utils.import_utils import is_torchvision_available +from .utils.import_utils import is_torch_available, is_torchvision_available if is_torchvision_available(): from torchvision.transforms import Compose +if is_torch_available(): + import torch + @dataclass(frozen=True) class SizeDict: @@ -66,3 +70,64 @@ def to_dict(self): encoder_dict = super().to_dict() encoder_dict.pop("_transform_params", None) return encoder_dict + + +def get_image_size_for_max_height_width( + image_size: Tuple[int, int], + max_height: int, + max_width: int, +) -> Tuple[int, int]: + """ + Computes the output image size given the input image and the maximum allowed height and 
width. Keep aspect ratio. + Important, even if image_height < max_height and image_width < max_width, the image will be resized + to at least one of the edges be equal to max_height or max_width. + + For example: + - input_size: (100, 200), max_height: 50, max_width: 50 -> output_size: (25, 50) + - input_size: (100, 200), max_height: 200, max_width: 500 -> output_size: (200, 400) + + Args: + image_size (`Tuple[int, int]`): + The image to resize. + max_height (`int`): + The maximum allowed height. + max_width (`int`): + The maximum allowed width. + """ + height, width = image_size + height_scale = max_height / height + width_scale = max_width / width + min_scale = min(height_scale, width_scale) + new_height = int(height * min_scale) + new_width = int(width * min_scale) + return new_height, new_width + + +def safe_squeeze(tensor: "torch.Tensor", axis: Optional[int] = None) -> "torch.Tensor": + """ + Squeezes a tensor, but only if the axis specified has dim 1. + """ + if axis is None: + return tensor.squeeze() + + try: + return tensor.squeeze(axis=axis) + except ValueError: + return tensor + + +def max_across_indices(values: Iterable[Any]) -> List[Any]: + """ + Return the maximum value across all indices of an iterable of values. + """ + return [max(values_i) for values_i in zip(*values)] + + +def get_max_height_width(images: List["torch.Tensor"]) -> Tuple[int]: + """ + Get the maximum height and width across all images in a batch. + """ + + _, max_height, max_width = max_across_indices([img.shape for img in images]) + + return (max_height, max_width) diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py index d181afeb2d4d0d..5698abe15c8029 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -123,7 +123,7 @@ ("qwen2_vl", ("Qwen2VLImageProcessor",)), ("regnet", ("ConvNextImageProcessor",)), ("resnet", ("ConvNextImageProcessor",)), - ("rt_detr", "RTDetrImageProcessor"), + ("rt_detr", ("RTDetrImageProcessor", "RTDetrImageProcessorFast")), ("sam", ("SamImageProcessor",)), ("segformer", ("SegformerImageProcessor",)), ("seggpt", ("SegGptImageProcessor",)), diff --git a/src/transformers/models/detr/image_processing_detr_fast.py b/src/transformers/models/detr/image_processing_detr_fast.py index 0fa1d0ffd9dba9..eadde59e55e475 100644 --- a/src/transformers/models/detr/image_processing_detr_fast.py +++ b/src/transformers/models/detr/image_processing_detr_fast.py @@ -21,7 +21,13 @@ from typing import Any, Dict, List, Optional, Set, Tuple, Union from ...image_processing_utils import BatchFeature, get_size_dict -from ...image_processing_utils_fast import BaseImageProcessorFast, SizeDict +from ...image_processing_utils_fast import ( + BaseImageProcessorFast, + SizeDict, + get_image_size_for_max_height_width, + get_max_height_width, + safe_squeeze, +) from ...image_transforms import ( center_to_corners_format, corners_to_center_format, @@ -55,7 +61,6 @@ compute_segments, convert_segmentation_to_rle, get_size_with_aspect_ratio, - max_across_indices, remove_low_and_no_objects, ) @@ -85,60 +90,6 @@ SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC) -def get_image_size_for_max_height_width( - image_size: Tuple[int, int], - max_height: int, - max_width: int, -) -> Tuple[int, int]: - """ - Computes the output image size given the input image and the maximum allowed height and width. Keep aspect ratio. 
- Important, even if image_height < max_height and image_width < max_width, the image will be resized - to at least one of the edges be equal to max_height or max_width. - - For example: - - input_size: (100, 200), max_height: 50, max_width: 50 -> output_size: (25, 50) - - input_size: (100, 200), max_height: 200, max_width: 500 -> output_size: (200, 400) - - Args: - image_size (`Tuple[int, int]`): - The image to resize. - max_height (`int`): - The maximum allowed height. - max_width (`int`): - The maximum allowed width. - """ - height, width = image_size - height_scale = max_height / height - width_scale = max_width / width - min_scale = min(height_scale, width_scale) - new_height = int(height * min_scale) - new_width = int(width * min_scale) - return new_height, new_width - - -def safe_squeeze(tensor: torch.Tensor, axis: Optional[int] = None) -> torch.Tensor: - """ - Squeezes a tensor, but only if the axis specified has dim 1. - """ - if axis is None: - return tensor.squeeze() - - try: - return tensor.squeeze(axis=axis) - except ValueError: - return tensor - - -def get_max_height_width(images: List[torch.Tensor]) -> Tuple[int]: - """ - Get the maximum height and width across all images in a batch. - """ - - _, max_height, max_width = max_across_indices([img.shape for img in images]) - - return (max_height, max_width) - - # inspired by https://github.com/facebookresearch/detr/blob/master/datasets/coco.py#L33 def convert_coco_poly_to_mask(segmentations, height: int, width: int, device: torch.device) -> torch.Tensor: """ @@ -191,18 +142,21 @@ def prepare_coco_detection_annotation( # Get all COCO annotations for the given image. annotations = target["annotations"] - annotations = [obj for obj in annotations if "iscrowd" not in obj or obj["iscrowd"] == 0] + classes = [] + area = [] + boxes = [] + keypoints = [] + for obj in annotations: + if "iscrowd" not in obj or obj["iscrowd"] == 0: + classes.append(obj["category_id"]) + area.append(obj["area"]) + boxes.append(obj["bbox"]) + if "keypoints" in obj: + keypoints.append(obj["keypoints"]) - classes = [obj["category_id"] for obj in annotations] classes = torch.as_tensor(classes, dtype=torch.int64, device=image.device) - - # for conversion to coco api - area = torch.as_tensor([obj["area"] for obj in annotations], dtype=torch.float32, device=image.device) - iscrowd = torch.as_tensor( - [obj["iscrowd"] if "iscrowd" in obj else 0 for obj in annotations], dtype=torch.int64, device=image.device - ) - - boxes = [obj["bbox"] for obj in annotations] + area = torch.as_tensor(area, dtype=torch.float32, device=image.device) + iscrowd = torch.zeros_like(classes, dtype=torch.int64, device=image.device) # guard against no boxes via resizing boxes = torch.as_tensor(boxes, dtype=torch.float32, device=image.device).reshape(-1, 4) boxes[:, 2:] += boxes[:, :2] @@ -211,19 +165,16 @@ def prepare_coco_detection_annotation( keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0]) - new_target = {} - new_target["image_id"] = image_id - new_target["class_labels"] = classes[keep] - new_target["boxes"] = boxes[keep] - new_target["area"] = area[keep] - new_target["iscrowd"] = iscrowd[keep] - new_target["orig_size"] = torch.as_tensor( - [int(image_height), int(image_width)], dtype=torch.int64, device=image.device - ) + new_target = { + "image_id": image_id, + "class_labels": classes[keep], + "boxes": boxes[keep], + "area": area[keep], + "iscrowd": iscrowd[keep], + "orig_size": torch.as_tensor([int(image_height), int(image_width)], dtype=torch.int64, 
device=image.device), + } - if annotations and "keypoints" in annotations[0]: - keypoints = [obj["keypoints"] for obj in annotations] - # Converting the filtered keypoints list to a numpy array + if keypoints: keypoints = torch.as_tensor(keypoints, dtype=torch.float32, device=image.device) # Apply the keep mask here to filter the relevant annotations keypoints = keypoints[keep] @@ -911,84 +862,81 @@ def preprocess( if input_data_format == ChannelDimension.LAST: images = [image.permute(2, 0, 1).contiguous() for image in images] - # prepare (COCO annotations as a list of Dict -> DETR target as a single Dict per image) - if annotations is not None: - prepared_images = [] - prepared_annotations = [] - for image, target in zip(images, annotations): - target = self.prepare_annotation( + if do_rescale and do_normalize: + # fused rescale and normalize + new_mean = torch.tensor(image_mean, device=images[0].device) * (1.0 / rescale_factor) + new_std = torch.tensor(image_std, device=images[0].device) * (1.0 / rescale_factor) + + processed_images = [] + processed_annotations = [] + pixel_masks = [] # Initialize pixel_masks here + for image, annotation in zip(images, annotations if annotations is not None else [None] * len(images)): + # prepare (COCO annotations as a list of Dict -> DETR target as a single Dict per image) + if annotations is not None: + annotation = self.prepare_annotation( image, - target, + annotation, format, return_segmentation_masks=return_segmentation_masks, masks_path=masks_path, input_data_format=input_data_format, ) - prepared_images.append(image) - prepared_annotations.append(target) - images = prepared_images - annotations = prepared_annotations - del prepared_images, prepared_annotations - - if do_resize: - if isinstance(resample, (PILImageResampling, int)): - interpolation = pil_torch_interpolation_mapping[resample] - else: - interpolation = resample - resized_images = [self.resize(image, size=size, interpolation=interpolation) for image in images] - if annotations is not None: - for i, (image, target) in enumerate(zip(resized_images, annotations)): - annotations[i] = self.resize_annotation( - target, - orig_size=images[i].size()[-2:], - target_size=image.size()[-2:], + + if do_resize: + interpolation = ( + pil_torch_interpolation_mapping[resample] + if isinstance(resample, (PILImageResampling, int)) + else resample + ) + resized_image = self.resize(image, size=size, interpolation=interpolation) + if annotations is not None: + annotation = self.resize_annotation( + annotation, + orig_size=image.size()[-2:], + target_size=resized_image.size()[-2:], ) - images = resized_images - del resized_images + image = resized_image - if do_rescale and do_normalize: - # fused rescale and normalize - new_mean = torch.tensor(image_mean, device=images[0].device) * (1.0 / rescale_factor) - new_std = torch.tensor(image_std, device=images[0].device) * (1.0 / rescale_factor) - images = [F.normalize(image.to(dtype=torch.float32), new_mean, new_std) for image in images] - elif do_rescale: - images = [image * rescale_factor for image in images] - elif do_normalize: - images = [F.normalize(image, image_mean, image_std) for image in images] - - if do_convert_annotations and annotations is not None: - annotations = [ - self.normalize_annotation(annotation, get_image_size(image, input_data_format)) - for annotation, image in zip(annotations, images) - ] + if do_rescale and do_normalize: + # fused rescale and normalize + image = F.normalize(image.to(dtype=torch.float32), new_mean, new_std) + elif 
do_rescale: + image = image * rescale_factor + elif do_normalize: + image = F.normalize(image, image_mean, image_std) + + if do_convert_annotations and annotations is not None: + annotation = self.normalize_annotation(annotation, get_image_size(image, input_data_format)) + + processed_images.append(image) + processed_annotations.append(annotation) + images = processed_images + annotations = processed_annotations if annotations is not None else None if do_pad: - # Pads images and returns their mask: {'pixel_values': ..., 'pixel_mask': ...} + # depends on all resized image shapes so we need another loop if pad_size is not None: padded_size = (pad_size["height"], pad_size["width"]) else: padded_size = get_max_height_width(images) - annotation_list = annotations if annotations is not None else [None] * len(images) padded_images = [] - pixel_masks = [] padded_annotations = [] - for image, annotation in zip(images, annotation_list): + for image, annotation in zip(images, annotations if annotations is not None else [None] * len(images)): + # Pads images and returns their mask: {'pixel_values': ..., 'pixel_mask': ...} if padded_size == image.size()[-2:]: padded_images.append(image) pixel_masks.append(torch.ones(padded_size, dtype=torch.int64, device=image.device)) padded_annotations.append(annotation) continue - padded_image, pixel_mask, padded_annotation = self.pad( + image, pixel_mask, annotation = self.pad( image, padded_size, annotation=annotation, update_bboxes=do_convert_annotations ) - padded_images.append(padded_image) + padded_images.append(image) + padded_annotations.append(annotation) pixel_masks.append(pixel_mask) - padded_annotations.append(padded_annotation) images = padded_images - if annotations is not None: - annotations = padded_annotations - del padded_images, padded_annotations + annotations = padded_annotations if annotations is not None else None data.update({"pixel_mask": torch.stack(pixel_masks, dim=0)}) data.update({"pixel_values": torch.stack(images, dim=0)}) diff --git a/src/transformers/models/rt_detr/__init__.py b/src/transformers/models/rt_detr/__init__.py index 94a428c66685a6..52453f38b2c4f4 100644 --- a/src/transformers/models/rt_detr/__init__.py +++ b/src/transformers/models/rt_detr/__init__.py @@ -26,6 +26,7 @@ pass else: _import_structure["image_processing_rt_detr"] = ["RTDetrImageProcessor"] + _import_structure["image_processing_rt_detr_fast"] = ["RTDetrImageProcessorFast"] try: if not is_torch_available(): @@ -55,6 +56,7 @@ pass else: from .image_processing_rt_detr import RTDetrImageProcessor + from .image_processing_rt_detr_fast import RTDetrImageProcessorFast try: if not is_torch_available(): diff --git a/src/transformers/models/rt_detr/image_processing_rt_detr.py b/src/transformers/models/rt_detr/image_processing_rt_detr.py index 44b2702aa634bc..eead5b18693d2f 100644 --- a/src/transformers/models/rt_detr/image_processing_rt_detr.py +++ b/src/transformers/models/rt_detr/image_processing_rt_detr.py @@ -1062,10 +1062,8 @@ def post_process_object_detection( raise ValueError( "Make sure that you pass in as many target sizes as the batch dimension of the logits" ) - if isinstance(target_sizes, List): - img_h = torch.Tensor([i[0] for i in target_sizes]) - img_w = torch.Tensor([i[1] for i in target_sizes]) + img_h, img_w = torch.as_tensor(target_sizes).unbind(1) else: img_h, img_w = target_sizes.unbind(1) scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device) @@ -1089,10 +1087,13 @@ def post_process_object_detection( boxes = 
torch.gather(boxes, dim=1, index=index.unsqueeze(-1).tile(1, 1, boxes.shape[-1])) results = [] - for s, l, b in zip(scores, labels, boxes): - score = s[s > threshold] - label = l[s > threshold] - box = b[s > threshold] - results.append({"scores": score, "labels": label, "boxes": box}) + for score, label, box in zip(scores, labels, boxes): + results.append( + { + "scores": score[score > threshold], + "labels": label[score > threshold], + "boxes": box[score > threshold], + } + ) return results diff --git a/src/transformers/models/rt_detr/image_processing_rt_detr_fast.py b/src/transformers/models/rt_detr/image_processing_rt_detr_fast.py new file mode 100644 index 00000000000000..9f63b5b7ced467 --- /dev/null +++ b/src/transformers/models/rt_detr/image_processing_rt_detr_fast.py @@ -0,0 +1,798 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Fast Image processor class for RT-DETR.""" + +import functools +import pathlib +from typing import Any, Dict, List, Optional, Tuple, Union + +from ...image_processing_utils import BatchFeature, get_size_dict +from ...image_processing_utils_fast import ( + BaseImageProcessorFast, + SizeDict, + get_image_size_for_max_height_width, + get_max_height_width, + safe_squeeze, +) +from ...image_transforms import ( + center_to_corners_format, + corners_to_center_format, +) +from ...image_utils import ( + IMAGENET_DEFAULT_MEAN, + IMAGENET_DEFAULT_STD, + AnnotationFormat, + AnnotationType, + ChannelDimension, + ImageInput, + ImageType, + PILImageResampling, + get_image_size, + get_image_type, + infer_channel_dimension_format, + make_list_of_images, + pil_torch_interpolation_mapping, + validate_annotations, +) +from ...utils import ( + TensorType, + filter_out_non_signature_kwargs, + is_torch_available, + is_torchvision_available, + is_torchvision_v2_available, + logging, + requires_backends, +) +from .image_processing_rt_detr import ( + get_size_with_aspect_ratio, +) + + +if is_torch_available(): + import torch + + +if is_torchvision_available(): + from ...image_utils import pil_torch_interpolation_mapping + + if is_torchvision_v2_available(): + from torchvision.transforms.v2 import functional as F + else: + from torchvision.transforms import functional as F + + +logger = logging.get_logger(__name__) + +SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION,) + + +def prepare_coco_detection_annotation( + image, + target, + return_segmentation_masks: bool = False, + input_data_format: Optional[Union[ChannelDimension, str]] = None, +): + """ + Convert the target in COCO format into the format expected by RT-DETR. + """ + image_height, image_width = image.size()[-2:] + + image_id = target["image_id"] + image_id = torch.as_tensor([image_id], dtype=torch.int64, device=image.device) + + # Get all COCO annotations for the given image. 
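+    # The incoming `target` follows the COCO detection layout, e.g. (illustrative values):
+    #   {"image_id": 42, "annotations": [{"category_id": 1, "bbox": [x, y, w, h],
+    #    "area": 1035.0, "iscrowd": 0, "keypoints": [...]}, ...]}
+    # Crowd instances (iscrowd == 1) are skipped in the loop below.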
+ annotations = target["annotations"] + classes = [] + area = [] + boxes = [] + keypoints = [] + for obj in annotations: + if "iscrowd" not in obj or obj["iscrowd"] == 0: + classes.append(obj["category_id"]) + area.append(obj["area"]) + boxes.append(obj["bbox"]) + if "keypoints" in obj: + keypoints.append(obj["keypoints"]) + + classes = torch.as_tensor(classes, dtype=torch.int64, device=image.device) + area = torch.as_tensor(area, dtype=torch.float32, device=image.device) + iscrowd = torch.zeros_like(classes, dtype=torch.int64, device=image.device) + # guard against no boxes via resizing + boxes = torch.as_tensor(boxes, dtype=torch.float32, device=image.device).reshape(-1, 4) + boxes[:, 2:] += boxes[:, :2] + boxes[:, 0::2] = boxes[:, 0::2].clip(min=0, max=image_width) + boxes[:, 1::2] = boxes[:, 1::2].clip(min=0, max=image_height) + + keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0]) + + new_target = { + "image_id": image_id, + "class_labels": classes[keep], + "boxes": boxes[keep], + "area": area[keep], + "iscrowd": iscrowd[keep], + "orig_size": torch.as_tensor([int(image_height), int(image_width)], dtype=torch.int64, device=image.device), + } + + if keypoints: + keypoints = torch.as_tensor(keypoints, dtype=torch.float32, device=image.device) + # Apply the keep mask here to filter the relevant annotations + keypoints = keypoints[keep] + num_keypoints = keypoints.shape[0] + keypoints = keypoints.reshape((-1, 3)) if num_keypoints else keypoints + new_target["keypoints"] = keypoints + + return new_target + + +class RTDetrImageProcessorFast(BaseImageProcessorFast): + r""" + Constructs a fast RT-DETR DETR image processor. + + Args: + format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`): + Data format of the annotations. One of "coco_detection" or "coco_panoptic". + do_resize (`bool`, *optional*, defaults to `True`): + Controls whether to resize the image's (height, width) dimensions to the specified `size`. Can be + overridden by the `do_resize` parameter in the `preprocess` method. + size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 800, "longest_edge": 1333}`): + Size of the image's `(height, width)` dimensions after resizing. Can be overridden by the `size` parameter + in the `preprocess` method. Available options are: + - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`. + Do NOT keep the aspect ratio. + - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting + the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge + less or equal to `longest_edge`. + - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the + aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to + `max_width`. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): + Resampling filter to use if resizing the image. + do_rescale (`bool`, *optional*, defaults to `True`): + Controls whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the + `do_rescale` parameter in the `preprocess` method. + rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): + Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the + `preprocess` method. 
+ do_normalize (`bool`, *optional*, defaults to `False`): + Controls whether to normalize the image. Can be overridden by the `do_normalize` parameter in the + `preprocess` method. + image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_MEAN`): + Mean values to use when normalizing the image. Can be a single value or a list of values, one for each + channel. Can be overridden by the `image_mean` parameter in the `preprocess` method. + image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_STD`): + Standard deviation values to use when normalizing the image. Can be a single value or a list of values, one + for each channel. Can be overridden by the `image_std` parameter in the `preprocess` method. + do_convert_annotations (`bool`, *optional*, defaults to `True`): + Controls whether to convert the annotations to the format expected by the DETR model. Converts the + bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`. + Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method. + do_pad (`bool`, *optional*, defaults to `False`): + Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess` + method. If `True`, padding will be applied to the bottom and right of the image with zeros. + If `pad_size` is provided, the image will be padded to the specified dimensions. + Otherwise, the image will be padded to the maximum height and width of the batch. + pad_size (`Dict[str, int]`, *optional*): + The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size + provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest + height and width in the batch. + """ + + model_input_names = ["pixel_values", "pixel_mask"] + + def __init__( + self, + format: Union[str, AnnotationFormat] = AnnotationFormat.COCO_DETECTION, + do_resize: bool = True, + size: Dict[str, int] = None, + resample: Union[PILImageResampling, F.InterpolationMode] = PILImageResampling.BILINEAR, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, + do_normalize: bool = False, + image_mean: Union[float, List[float]] = None, + image_std: Union[float, List[float]] = None, + do_convert_annotations: bool = True, + do_pad: bool = False, + pad_size: Optional[Dict[str, int]] = None, + **kwargs, + ) -> None: + size = size if size is not None else {"height": 640, "width": 640} + size = get_size_dict(size, default_to_square=False) + + if do_convert_annotations is None: + do_convert_annotations = do_normalize + + super().__init__(**kwargs) + self.format = format + self.do_resize = do_resize + self.size = size + self.resample = resample + self.do_rescale = do_rescale + self.rescale_factor = rescale_factor + self.do_normalize = do_normalize + self.do_convert_annotations = do_convert_annotations + self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN + self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD + self.do_pad = do_pad + self.pad_size = pad_size + + def prepare_annotation( + self, + image: torch.Tensor, + target: Dict, + format: Optional[AnnotationFormat] = None, + return_segmentation_masks: bool = None, + masks_path: Optional[Union[str, pathlib.Path]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> Dict: + """ + Prepare an annotation for feeding into RTDETR model. 
+ """ + format = format if format is not None else self.format + + if format == AnnotationFormat.COCO_DETECTION: + return_segmentation_masks = False if return_segmentation_masks is None else return_segmentation_masks + target = prepare_coco_detection_annotation( + image, target, return_segmentation_masks, input_data_format=input_data_format + ) + else: + raise ValueError(f"Format {format} is not supported.") + return target + + # Copied from transformers.models.detr.image_processing_detr_fast.DetrImageProcessorFast.resize + def resize( + self, + image: torch.Tensor, + size: SizeDict, + interpolation: F.InterpolationMode = F.InterpolationMode.BILINEAR, + **kwargs, + ) -> torch.Tensor: + """ + Resize the image to the given size. Size can be `min_size` (scalar) or `(height, width)` tuple. If size is an + int, smaller edge of the image will be matched to this number. + + Args: + image (`torch.Tensor`): + Image to resize. + size (`SizeDict`): + Size of the image's `(height, width)` dimensions after resizing. Available options are: + - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`. + Do NOT keep the aspect ratio. + - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting + the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge + less or equal to `longest_edge`. + - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the + aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to + `max_width`. + interpolation (`InterpolationMode`, *optional*, defaults to `InterpolationMode.BILINEAR`): + Resampling filter to use if resizing the image. + """ + if size.shortest_edge and size.longest_edge: + # Resize the image so that the shortest edge or the longest edge is of the given size + # while maintaining the aspect ratio of the original image. + new_size = get_size_with_aspect_ratio( + image.size()[-2:], + size["shortest_edge"], + size["longest_edge"], + ) + elif size.max_height and size.max_width: + new_size = get_image_size_for_max_height_width(image.size()[-2:], size["max_height"], size["max_width"]) + elif size.height and size.width: + new_size = (size["height"], size["width"]) + else: + raise ValueError( + "Size must contain 'height' and 'width' keys or 'shortest_edge' and 'longest_edge' keys. Got" + f" {size.keys()}." + ) + + image = F.resize( + image, + size=new_size, + interpolation=interpolation, + **kwargs, + ) + return image + + # Copied from transformers.models.detr.image_processing_detr_fast.DetrImageProcessorFast.resize_annotation + def resize_annotation( + self, + annotation: Dict[str, Any], + orig_size: Tuple[int, int], + target_size: Tuple[int, int], + threshold: float = 0.5, + interpolation: F.InterpolationMode = F.InterpolationMode.NEAREST, + ): + """ + Resizes an annotation to a target size. + + Args: + annotation (`Dict[str, Any]`): + The annotation dictionary. + orig_size (`Tuple[int, int]`): + The original size of the input image. + target_size (`Tuple[int, int]`): + The target size of the image, as returned by the preprocessing `resize` step. + threshold (`float`, *optional*, defaults to 0.5): + The threshold used to binarize the segmentation masks. + resample (`InterpolationMode`, defaults to `InterpolationMode.NEAREST`): + The resampling filter to use when resizing the masks. 
+ """ + ratio_height, ratio_width = [target / orig for target, orig in zip(target_size, orig_size)] + + new_annotation = {} + new_annotation["size"] = target_size + + for key, value in annotation.items(): + if key == "boxes": + boxes = value + scaled_boxes = boxes * torch.as_tensor( + [ratio_width, ratio_height, ratio_width, ratio_height], dtype=torch.float32, device=boxes.device + ) + new_annotation["boxes"] = scaled_boxes + elif key == "area": + area = value + scaled_area = area * (ratio_width * ratio_height) + new_annotation["area"] = scaled_area + elif key == "masks": + masks = value[:, None] + masks = [F.resize(mask, target_size, interpolation=interpolation) for mask in masks] + masks = torch.stack(masks).to(torch.float32) + masks = masks[:, 0] > threshold + new_annotation["masks"] = masks + elif key == "size": + new_annotation["size"] = target_size + else: + new_annotation[key] = value + + return new_annotation + + # Copied from transformers.models.detr.image_processing_detr_fast.DetrImageProcessorFast.normalize_annotation + def normalize_annotation(self, annotation: Dict, image_size: Tuple[int, int]) -> Dict: + image_height, image_width = image_size + norm_annotation = {} + for key, value in annotation.items(): + if key == "boxes": + boxes = value + boxes = corners_to_center_format(boxes) + boxes /= torch.as_tensor( + [image_width, image_height, image_width, image_height], dtype=torch.float32, device=boxes.device + ) + norm_annotation[key] = boxes + else: + norm_annotation[key] = value + return norm_annotation + + # Copied from transformers.models.detr.image_processing_detr_fast.DetrImageProcessorFast._update_annotation_for_padded_image + def _update_annotation_for_padded_image( + self, + annotation: Dict, + input_image_size: Tuple[int, int], + output_image_size: Tuple[int, int], + padding, + update_bboxes, + ) -> Dict: + """ + Update the annotation for a padded image. + """ + new_annotation = {} + new_annotation["size"] = output_image_size + ratio_height, ratio_width = (input / output for output, input in zip(output_image_size, input_image_size)) + + for key, value in annotation.items(): + if key == "masks": + masks = value + masks = F.pad( + masks, + padding, + fill=0, + ) + masks = safe_squeeze(masks, 1) + new_annotation["masks"] = masks + elif key == "boxes" and update_bboxes: + boxes = value + boxes *= torch.as_tensor([ratio_width, ratio_height, ratio_width, ratio_height], device=boxes.device) + new_annotation["boxes"] = boxes + elif key == "size": + new_annotation["size"] = output_image_size + else: + new_annotation[key] = value + return new_annotation + + # Copied from transformers.models.detr.image_processing_detr_fast.DetrImageProcessorFast.pad + def pad( + self, + image: torch.Tensor, + padded_size: Tuple[int, int], + annotation: Optional[Dict[str, Any]] = None, + update_bboxes: bool = True, + fill: int = 0, + ): + original_size = image.size()[-2:] + padding_bottom = padded_size[0] - original_size[0] + padding_right = padded_size[1] - original_size[1] + if padding_bottom < 0 or padding_right < 0: + raise ValueError( + f"Padding dimensions are negative. Please make sure that the padded size is larger than the " + f"original size. Got padded size: {padded_size}, original size: {original_size}." 
+ ) + if original_size != padded_size: + padding = [0, 0, padding_right, padding_bottom] + image = F.pad(image, padding, fill=fill) + if annotation is not None: + annotation = self._update_annotation_for_padded_image( + annotation, original_size, padded_size, padding, update_bboxes + ) + + # Make a pixel mask for the image, where 1 indicates a valid pixel and 0 indicates padding. + pixel_mask = torch.zeros(padded_size, dtype=torch.int64, device=image.device) + pixel_mask[: original_size[0], : original_size[1]] = 1 + + return image, pixel_mask, annotation + + @functools.lru_cache(maxsize=1) + # Copied from transformers.models.detr.image_processing_detr_fast.DetrImageProcessorFast._validate_input_arguments + def _validate_input_arguments( + self, + do_rescale: bool, + rescale_factor: float, + do_normalize: bool, + image_mean: Union[float, List[float]], + image_std: Union[float, List[float]], + do_resize: bool, + size: Dict[str, int], + resample: "PILImageResampling", + data_format: Union[str, ChannelDimension], + return_tensors: Union[TensorType, str], + ): + if return_tensors != "pt": + raise ValueError("Only returning PyTorch tensors is currently supported.") + + if data_format != ChannelDimension.FIRST: + raise ValueError("Only channel first data format is currently supported.") + + if do_resize and None in (size, resample): + raise ValueError("Size and resample must be specified if do_resize is True.") + + if do_rescale and rescale_factor is None: + raise ValueError("Rescale factor must be specified if do_rescale is True.") + + if do_normalize and None in (image_mean, image_std): + raise ValueError("Image mean and standard deviation must be specified if do_normalize is True.") + + @filter_out_non_signature_kwargs(extra=["device"]) + def preprocess( + self, + images: ImageInput, + annotations: Optional[Union[AnnotationType, List[AnnotationType]]] = None, + return_segmentation_masks: bool = None, + masks_path: Optional[Union[str, pathlib.Path]] = None, + do_resize: Optional[bool] = None, + size: Optional[Dict[str, int]] = None, + resample: Optional[Union[PILImageResampling, F.InterpolationMode]] = None, + do_rescale: Optional[bool] = None, + rescale_factor: Optional[Union[int, float]] = None, + do_normalize: Optional[bool] = None, + do_convert_annotations: Optional[bool] = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_pad: Optional[bool] = None, + format: Optional[Union[str, AnnotationFormat]] = None, + return_tensors: Optional[Union[TensorType, str]] = None, + data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + pad_size: Optional[Dict[str, int]] = None, + **kwargs, + ) -> BatchFeature: + """ + Preprocess an image or a batch of images so that it can be used by the model. + + Args: + images (`ImageInput`): + Image or batch of images to preprocess. Expects a single or batch of images with pixel values ranging + from 0 to 255. If passing in images with pixel values between 0 and 1, set `do_rescale=False`. + annotations (`AnnotationType` or `List[AnnotationType]`, *optional*): + List of annotations associated with the image or batch of images. If annotation is for object + detection, the annotations should be a dictionary with the following keys: + - "image_id" (`int`): The image id. + - "annotations" (`List[Dict]`): List of annotations for an image. Each annotation should be a + dictionary. 
An image can have no annotations, in which case the list should be empty. + If annotation is for segmentation, the annotations should be a dictionary with the following keys: + - "image_id" (`int`): The image id. + - "segments_info" (`List[Dict]`): List of segments for an image. Each segment should be a dictionary. + An image can have no segments, in which case the list should be empty. + - "file_name" (`str`): The file name of the image. + return_segmentation_masks (`bool`, *optional*, defaults to self.return_segmentation_masks): + Whether to return segmentation masks. + masks_path (`str` or `pathlib.Path`, *optional*): + Path to the directory containing the segmentation masks. + do_resize (`bool`, *optional*, defaults to self.do_resize): + Whether to resize the image. + size (`Dict[str, int]`, *optional*, defaults to self.size): + Size of the image's `(height, width)` dimensions after resizing. Available options are: + - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`. + Do NOT keep the aspect ratio. + - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting + the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge + less or equal to `longest_edge`. + - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the + aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to + `max_width`. + resample (`PILImageResampling` or `InterpolationMode`, *optional*, defaults to self.resample): + Resampling filter to use when resizing the image. + do_rescale (`bool`, *optional*, defaults to self.do_rescale): + Whether to rescale the image. + rescale_factor (`float`, *optional*, defaults to self.rescale_factor): + Rescale factor to use when rescaling the image. + do_normalize (`bool`, *optional*, defaults to self.do_normalize): + Whether to normalize the image. + do_convert_annotations (`bool`, *optional*, defaults to self.do_convert_annotations): + Whether to convert the annotations to the format expected by the model. Converts the bounding + boxes from the format `(top_left_x, top_left_y, width, height)` to `(center_x, center_y, width, height)` + and in relative coordinates. + image_mean (`float` or `List[float]`, *optional*, defaults to self.image_mean): + Mean to use when normalizing the image. + image_std (`float` or `List[float]`, *optional*, defaults to self.image_std): + Standard deviation to use when normalizing the image. + do_pad (`bool`, *optional*, defaults to self.do_pad): + Whether to pad the image. If `True`, padding will be applied to the bottom and right of + the image with zeros. If `pad_size` is provided, the image will be padded to the specified + dimensions. Otherwise, the image will be padded to the maximum height and width of the batch. + format (`str` or `AnnotationFormat`, *optional*, defaults to self.format): + Format of the annotations. + return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors): + Type of tensors to return. If `None`, will return the list of images. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. 
+ - Unset: Use the channel dimension format of the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + pad_size (`Dict[str, int]`, *optional*): + The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size + provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest + height and width in the batch. + """ + do_resize = self.do_resize if do_resize is None else do_resize + size = self.size if size is None else size + size = get_size_dict(size=size, default_to_square=True) + resample = self.resample if resample is None else resample + do_rescale = self.do_rescale if do_rescale is None else do_rescale + rescale_factor = self.rescale_factor if rescale_factor is None else rescale_factor + do_normalize = self.do_normalize if do_normalize is None else do_normalize + image_mean = self.image_mean if image_mean is None else image_mean + image_std = self.image_std if image_std is None else image_std + do_convert_annotations = ( + self.do_convert_annotations if do_convert_annotations is None else do_convert_annotations + ) + do_pad = self.do_pad if do_pad is None else do_pad + pad_size = self.pad_size if pad_size is None else pad_size + format = self.format if format is None else format + return_tensors = "pt" if return_tensors is None else return_tensors + device = kwargs.pop("device", None) + + # Make hashable for cache + size = SizeDict(**size) + image_mean = tuple(image_mean) if isinstance(image_mean, list) else image_mean + image_std = tuple(image_std) if isinstance(image_std, list) else image_std + + images = make_list_of_images(images) + image_type = get_image_type(images[0]) + + if image_type not in [ImageType.PIL, ImageType.TORCH, ImageType.NUMPY]: + raise ValueError(f"Unsupported input image type {image_type}") + + self._validate_input_arguments( + do_rescale=do_rescale, + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + do_resize=do_resize, + size=size, + resample=resample, + return_tensors=return_tensors, + data_format=data_format, + ) + + if annotations is not None and isinstance(annotations, dict): + annotations = [annotations] + + if annotations is not None and len(images) != len(annotations): + raise ValueError( + f"The number of images ({len(images)}) and annotations ({len(annotations)}) do not match." + ) + + format = AnnotationFormat(format) + if annotations is not None: + validate_annotations(format, SUPPORTED_ANNOTATION_FORMATS, annotations) + + data = {} + if image_type == ImageType.PIL: + images = [F.pil_to_tensor(image) for image in images] + elif image_type == ImageType.NUMPY: + # not using F.to_tensor as it doesn't handle (C, H, W) numpy arrays + images = [torch.from_numpy(image).contiguous() for image in images] + + if device is not None: + images = [image.to(device) for image in images] + + # We assume that all images have the same channel dimension format. 
+ if input_data_format is None: + input_data_format = infer_channel_dimension_format(images[0]) + if input_data_format == ChannelDimension.LAST: + images = [image.permute(2, 0, 1).contiguous() for image in images] + + if do_rescale and do_normalize: + # fused rescale and normalize + new_mean = torch.tensor(image_mean, device=images[0].device) * (1.0 / rescale_factor) + new_std = torch.tensor(image_std, device=images[0].device) * (1.0 / rescale_factor) + + processed_images = [] + processed_annotations = [] + pixel_masks = [] # Initialize pixel_masks here + for image, annotation in zip(images, annotations if annotations is not None else [None] * len(images)): + # prepare (COCO annotations as a list of Dict -> DETR target as a single Dict per image) + if annotations is not None: + annotation = self.prepare_annotation( + image, + annotation, + format, + return_segmentation_masks=return_segmentation_masks, + masks_path=masks_path, + input_data_format=input_data_format, + ) + + if do_resize: + interpolation = ( + pil_torch_interpolation_mapping[resample] + if isinstance(resample, (PILImageResampling, int)) + else resample + ) + resized_image = self.resize(image, size=size, interpolation=interpolation) + if annotations is not None: + annotation = self.resize_annotation( + annotation, + orig_size=image.size()[-2:], + target_size=resized_image.size()[-2:], + ) + image = resized_image + + if do_rescale and do_normalize: + # fused rescale and normalize + image = F.normalize(image.to(dtype=torch.float32), new_mean, new_std) + elif do_rescale: + image = image * rescale_factor + elif do_normalize: + image = F.normalize(image, image_mean, image_std) + + if do_convert_annotations and annotations is not None: + annotation = self.normalize_annotation(annotation, get_image_size(image, input_data_format)) + + processed_images.append(image) + processed_annotations.append(annotation) + images = processed_images + annotations = processed_annotations if annotations is not None else None + + if do_pad: + # depends on all resized image shapes so we need another loop + if pad_size is not None: + padded_size = (pad_size["height"], pad_size["width"]) + else: + padded_size = get_max_height_width(images) + + padded_images = [] + padded_annotations = [] + for image, annotation in zip(images, annotations if annotations is not None else [None] * len(images)): + # Pads images and returns their mask: {'pixel_values': ..., 'pixel_mask': ...} + if padded_size == image.size()[-2:]: + padded_images.append(image) + pixel_masks.append(torch.ones(padded_size, dtype=torch.int64, device=image.device)) + padded_annotations.append(annotation) + continue + image, pixel_mask, annotation = self.pad( + image, padded_size, annotation=annotation, update_bboxes=do_convert_annotations + ) + padded_images.append(image) + padded_annotations.append(annotation) + pixel_masks.append(pixel_mask) + images = padded_images + annotations = padded_annotations if annotations is not None else None + data.update({"pixel_mask": torch.stack(pixel_masks, dim=0)}) + + data.update({"pixel_values": torch.stack(images, dim=0)}) + encoded_inputs = BatchFeature(data, tensor_type=return_tensors) + if annotations is not None: + encoded_inputs["labels"] = [ + BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations + ] + return encoded_inputs + + # Copied from transformers.models.rt_detr.image_processing_rt_detr.RTDetrImageProcessor.post_process_object_detection + def post_process_object_detection( + self, + outputs, + threshold: float = 0.5, 
+ target_sizes: Union[TensorType, List[Tuple]] = None, + use_focal_loss: bool = True, + ): + """ + Converts the raw output of [`DetrForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y, + bottom_right_x, bottom_right_y) format. Only supports PyTorch. + + Args: + outputs ([`DetrObjectDetectionOutput`]): + Raw outputs of the model. + threshold (`float`, *optional*, defaults to 0.5): + Score threshold to keep object detection predictions. + target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`, *optional*): + Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size + `(height, width)` of each image in the batch. If unset, predictions will not be resized. + use_focal_loss (`bool` defaults to `True`): + Variable informing if the focal loss was used to predict the outputs. If `True`, a sigmoid is applied + to compute the scores of each detection, otherwise, a softmax function is used. + + Returns: + `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image + in the batch as predicted by the model. + """ + requires_backends(self, ["torch"]) + out_logits, out_bbox = outputs.logits, outputs.pred_boxes + # convert from relative cxcywh to absolute xyxy + boxes = center_to_corners_format(out_bbox) + if target_sizes is not None: + if len(out_logits) != len(target_sizes): + raise ValueError( + "Make sure that you pass in as many target sizes as the batch dimension of the logits" + ) + if isinstance(target_sizes, List): + img_h, img_w = torch.as_tensor(target_sizes).unbind(1) + else: + img_h, img_w = target_sizes.unbind(1) + scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device) + boxes = boxes * scale_fct[:, None, :] + + num_top_queries = out_logits.shape[1] + num_classes = out_logits.shape[2] + + if use_focal_loss: + scores = torch.nn.functional.sigmoid(out_logits) + scores, index = torch.topk(scores.flatten(1), num_top_queries, axis=-1) + labels = index % num_classes + index = index // num_classes + boxes = boxes.gather(dim=1, index=index.unsqueeze(-1).repeat(1, 1, boxes.shape[-1])) + else: + scores = torch.nn.functional.softmax(out_logits)[:, :, :-1] + scores, labels = scores.max(dim=-1) + if scores.shape[1] > num_top_queries: + scores, index = torch.topk(scores, num_top_queries, dim=-1) + labels = torch.gather(labels, dim=1, index=index) + boxes = torch.gather(boxes, dim=1, index=index.unsqueeze(-1).tile(1, 1, boxes.shape[-1])) + + results = [] + for score, label, box in zip(scores, labels, boxes): + results.append( + { + "scores": score[score > threshold], + "labels": label[score > threshold], + "boxes": box[score > threshold], + } + ) + + return results diff --git a/src/transformers/utils/dummy_vision_objects.py b/src/transformers/utils/dummy_vision_objects.py index d7f87717ca834a..19cf02a4e85826 100644 --- a/src/transformers/utils/dummy_vision_objects.py +++ b/src/transformers/utils/dummy_vision_objects.py @@ -569,6 +569,13 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["vision"]) +class RTDetrImageProcessorFast(metaclass=DummyObject): + _backends = ["vision"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["vision"]) + + class SamImageProcessor(metaclass=DummyObject): _backends = ["vision"] diff --git a/tests/models/detr/test_image_processing_detr.py b/tests/models/detr/test_image_processing_detr.py index 976b306115b68a..f91c520873668f 100644 --- a/tests/models/detr/test_image_processing_detr.py +++ 
b/tests/models/detr/test_image_processing_detr.py @@ -677,7 +677,7 @@ def test_fast_processor_equivalence_cpu_gpu_coco_detection_annotations(self): target = {"image_id": 39769, "annotations": target} - processor = self.image_processor_list[1].from_pretrained("facebook/detr-resnet-50") + processor = self.image_processor_list[1]() # 1. run processor on CPU encoding_cpu = processor(images=image, annotations=target, return_tensors="pt", device="cpu") # 2. run processor on GPU @@ -734,7 +734,7 @@ def test_fast_processor_equivalence_cpu_gpu_coco_panoptic_annotations(self): masks_path = pathlib.Path("./tests/fixtures/tests_samples/COCO/coco_panoptic") - processor = self.image_processor_list[1].from_pretrained("facebook/detr-resnet-50-panoptic") + processor = self.image_processor_list[1](format="coco_panoptic") # 1. run processor on CPU encoding_cpu = processor( images=image, annotations=target, masks_path=masks_path, return_tensors="pt", device="cpu" diff --git a/tests/models/rt_detr/test_image_processing_rt_detr.py b/tests/models/rt_detr/test_image_processing_rt_detr.py index 2a38664d433fea..e7bfbae3f9c27a 100644 --- a/tests/models/rt_detr/test_image_processing_rt_detr.py +++ b/tests/models/rt_detr/test_image_processing_rt_detr.py @@ -16,8 +16,8 @@ import requests -from transformers.testing_utils import require_torch, require_vision, slow -from transformers.utils import is_torch_available, is_vision_available +from transformers.testing_utils import require_torch, require_torch_gpu, require_vision, slow +from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs @@ -25,7 +25,7 @@ if is_vision_available(): from PIL import Image - from transformers import RTDetrImageProcessor + from transformers import RTDetrImageProcessor, RTDetrImageProcessorFast if is_torch_available(): import torch @@ -91,6 +91,7 @@ def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=F @require_vision class RtDetrImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): image_processing_class = RTDetrImageProcessor if is_vision_available() else None + fast_image_processing_class = RTDetrImageProcessorFast if is_torchvision_available() else None def setUp(self): super().setUp() @@ -101,17 +102,19 @@ def image_processor_dict(self): return self.image_processor_tester.prepare_image_processor_dict() def test_image_processor_properties(self): - image_processing = self.image_processing_class(**self.image_processor_dict) - self.assertTrue(hasattr(image_processing, "do_resize")) - self.assertTrue(hasattr(image_processing, "size")) - self.assertTrue(hasattr(image_processing, "resample")) - self.assertTrue(hasattr(image_processing, "do_rescale")) - self.assertTrue(hasattr(image_processing, "rescale_factor")) - self.assertTrue(hasattr(image_processing, "return_tensors")) + for image_processing_class in self.image_processor_list: + image_processing = image_processing_class(**self.image_processor_dict) + self.assertTrue(hasattr(image_processing, "do_resize")) + self.assertTrue(hasattr(image_processing, "size")) + self.assertTrue(hasattr(image_processing, "resample")) + self.assertTrue(hasattr(image_processing, "do_rescale")) + self.assertTrue(hasattr(image_processing, "rescale_factor")) + self.assertTrue(hasattr(image_processing, "return_tensors")) def test_image_processor_from_dict_with_kwargs(self): - image_processor = 
self.image_processing_class.from_dict(self.image_processor_dict) - self.assertEqual(image_processor.size, {"height": 640, "width": 640}) + for image_processing_class in self.image_processor_list: + image_processor = image_processing_class.from_dict(self.image_processor_dict) + self.assertEqual(image_processor.size, {"height": 640, "width": 640}) def test_valid_coco_detection_annotations(self): # prepare image and target @@ -121,32 +124,33 @@ def test_valid_coco_detection_annotations(self): params = {"image_id": 39769, "annotations": target} - # encode them - image_processing = RTDetrImageProcessor.from_pretrained("PekingU/rtdetr_r50vd") + for image_processing_class in self.image_processor_list: + # encode them + image_processing = image_processing_class.from_pretrained("PekingU/rtdetr_r50vd") - # legal encodings (single image) - _ = image_processing(images=image, annotations=params, return_tensors="pt") - _ = image_processing(images=image, annotations=[params], return_tensors="pt") + # legal encodings (single image) + _ = image_processing(images=image, annotations=params, return_tensors="pt") + _ = image_processing(images=image, annotations=[params], return_tensors="pt") - # legal encodings (batch of one image) - _ = image_processing(images=[image], annotations=params, return_tensors="pt") - _ = image_processing(images=[image], annotations=[params], return_tensors="pt") + # legal encodings (batch of one image) + _ = image_processing(images=[image], annotations=params, return_tensors="pt") + _ = image_processing(images=[image], annotations=[params], return_tensors="pt") - # legal encoding (batch of more than one image) - n = 5 - _ = image_processing(images=[image] * n, annotations=[params] * n, return_tensors="pt") + # legal encoding (batch of more than one image) + n = 5 + _ = image_processing(images=[image] * n, annotations=[params] * n, return_tensors="pt") - # example of an illegal encoding (missing the 'image_id' key) - with self.assertRaises(ValueError) as e: - image_processing(images=image, annotations={"annotations": target}, return_tensors="pt") + # example of an illegal encoding (missing the 'image_id' key) + with self.assertRaises(ValueError) as e: + image_processing(images=image, annotations={"annotations": target}, return_tensors="pt") - self.assertTrue(str(e.exception).startswith("Invalid COCO detection annotations")) + self.assertTrue(str(e.exception).startswith("Invalid COCO detection annotations")) - # example of an illegal encoding (unequal lengths of images and annotations) - with self.assertRaises(ValueError) as e: - image_processing(images=[image] * n, annotations=[params] * (n - 1), return_tensors="pt") + # example of an illegal encoding (unequal lengths of images and annotations) + with self.assertRaises(ValueError) as e: + image_processing(images=[image] * n, annotations=[params] * (n - 1), return_tensors="pt") - self.assertTrue(str(e.exception) == "The number of images (5) and annotations (4) do not match.") + self.assertTrue(str(e.exception) == "The number of images (5) and annotations (4) do not match.") @slow def test_call_pytorch_with_coco_detection_annotations(self): @@ -157,55 +161,57 @@ def test_call_pytorch_with_coco_detection_annotations(self): target = {"image_id": 39769, "annotations": target} - # encode them - image_processing = RTDetrImageProcessor.from_pretrained("PekingU/rtdetr_r50vd") - encoding = image_processing(images=image, annotations=target, return_tensors="pt") - - # verify pixel values - expected_shape = torch.Size([1, 3, 640, 640]) - 
self.assertEqual(encoding["pixel_values"].shape, expected_shape) - - expected_slice = torch.tensor([0.5490, 0.5647, 0.5725]) - self.assertTrue(torch.allclose(encoding["pixel_values"][0, 0, 0, :3], expected_slice, atol=1e-4)) - - # verify area - expected_area = torch.tensor([2827.9883, 5403.4761, 235036.7344, 402070.2188, 71068.8281, 79601.2812]) - self.assertTrue(torch.allclose(encoding["labels"][0]["area"], expected_area)) - # verify boxes - expected_boxes_shape = torch.Size([6, 4]) - self.assertEqual(encoding["labels"][0]["boxes"].shape, expected_boxes_shape) - expected_boxes_slice = torch.tensor([0.5503, 0.2765, 0.0604, 0.2215]) - self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"][0], expected_boxes_slice, atol=1e-3)) - # verify image_id - expected_image_id = torch.tensor([39769]) - self.assertTrue(torch.allclose(encoding["labels"][0]["image_id"], expected_image_id)) - # verify is_crowd - expected_is_crowd = torch.tensor([0, 0, 0, 0, 0, 0]) - self.assertTrue(torch.allclose(encoding["labels"][0]["iscrowd"], expected_is_crowd)) - # verify class_labels - expected_class_labels = torch.tensor([75, 75, 63, 65, 17, 17]) - self.assertTrue(torch.allclose(encoding["labels"][0]["class_labels"], expected_class_labels)) - # verify orig_size - expected_orig_size = torch.tensor([480, 640]) - self.assertTrue(torch.allclose(encoding["labels"][0]["orig_size"], expected_orig_size)) - # verify size - expected_size = torch.tensor([640, 640]) - self.assertTrue(torch.allclose(encoding["labels"][0]["size"], expected_size)) + for image_processing_class in self.image_processor_list: + # encode them + image_processing = image_processing_class.from_pretrained("PekingU/rtdetr_r50vd") + encoding = image_processing(images=image, annotations=target, return_tensors="pt") + + # verify pixel values + expected_shape = torch.Size([1, 3, 640, 640]) + self.assertEqual(encoding["pixel_values"].shape, expected_shape) + + expected_slice = torch.tensor([0.5490, 0.5647, 0.5725]) + self.assertTrue(torch.allclose(encoding["pixel_values"][0, 0, 0, :3], expected_slice, atol=1e-4)) + + # verify area + expected_area = torch.tensor([2827.9883, 5403.4761, 235036.7344, 402070.2188, 71068.8281, 79601.2812]) + self.assertTrue(torch.allclose(encoding["labels"][0]["area"], expected_area)) + # verify boxes + expected_boxes_shape = torch.Size([6, 4]) + self.assertEqual(encoding["labels"][0]["boxes"].shape, expected_boxes_shape) + expected_boxes_slice = torch.tensor([0.5503, 0.2765, 0.0604, 0.2215]) + self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"][0], expected_boxes_slice, atol=1e-3)) + # verify image_id + expected_image_id = torch.tensor([39769]) + self.assertTrue(torch.allclose(encoding["labels"][0]["image_id"], expected_image_id)) + # verify is_crowd + expected_is_crowd = torch.tensor([0, 0, 0, 0, 0, 0]) + self.assertTrue(torch.allclose(encoding["labels"][0]["iscrowd"], expected_is_crowd)) + # verify class_labels + expected_class_labels = torch.tensor([75, 75, 63, 65, 17, 17]) + self.assertTrue(torch.allclose(encoding["labels"][0]["class_labels"], expected_class_labels)) + # verify orig_size + expected_orig_size = torch.tensor([480, 640]) + self.assertTrue(torch.allclose(encoding["labels"][0]["orig_size"], expected_orig_size)) + # verify size + expected_size = torch.tensor([640, 640]) + self.assertTrue(torch.allclose(encoding["labels"][0]["size"], expected_size)) @slow def test_image_processor_outputs(self): image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") - image_processing = 
self.image_processing_class(**self.image_processor_dict) - encoding = image_processing(images=image, return_tensors="pt") + for image_processing_class in self.image_processor_list: + image_processing = image_processing_class(**self.image_processor_dict) + encoding = image_processing(images=image, return_tensors="pt") - # verify pixel values: shape - expected_shape = torch.Size([1, 3, 640, 640]) - self.assertEqual(encoding["pixel_values"].shape, expected_shape) + # verify pixel values: shape + expected_shape = torch.Size([1, 3, 640, 640]) + self.assertEqual(encoding["pixel_values"].shape, expected_shape) - # verify pixel values: output values - expected_slice = torch.tensor([0.5490196347236633, 0.5647059082984924, 0.572549045085907]) - self.assertTrue(torch.allclose(encoding["pixel_values"][0, 0, 0, :3], expected_slice, atol=1e-5)) + # verify pixel values: output values + expected_slice = torch.tensor([0.5490196347236633, 0.5647059082984924, 0.572549045085907]) + self.assertTrue(torch.allclose(encoding["pixel_values"][0, 0, 0, :3], expected_slice, atol=1e-5)) def test_multiple_images_processor_outputs(self): images_urls = [ @@ -224,31 +230,32 @@ def test_multiple_images_processor_outputs(self): image = Image.open(requests.get(url, stream=True).raw) images.append(image) - # apply image processing - image_processing = self.image_processing_class(**self.image_processor_dict) - encoding = image_processing(images=images, return_tensors="pt") - - # verify if pixel_values is part of the encoding - self.assertIn("pixel_values", encoding) - - # verify pixel values: shape - expected_shape = torch.Size([8, 3, 640, 640]) - self.assertEqual(encoding["pixel_values"].shape, expected_shape) - - # verify pixel values: output values - expected_slices = torch.tensor( - [ - [0.5333333611488342, 0.5568627715110779, 0.5647059082984924], - [0.5372549295425415, 0.4705882668495178, 0.4274510145187378], - [0.3960784673690796, 0.35686275362968445, 0.3686274588108063], - [0.20784315466880798, 0.1882353127002716, 0.15294118225574493], - [0.364705890417099, 0.364705890417099, 0.3686274588108063], - [0.8078432083129883, 0.8078432083129883, 0.8078432083129883], - [0.4431372880935669, 0.4431372880935669, 0.4431372880935669], - [0.19607844948768616, 0.21176472306251526, 0.3607843220233917], - ] - ) - self.assertTrue(torch.allclose(encoding["pixel_values"][:, 1, 0, :3], expected_slices, atol=1e-5)) + for image_processing_class in self.image_processor_list: + # apply image processing + image_processing = image_processing_class(**self.image_processor_dict) + encoding = image_processing(images=images, return_tensors="pt") + + # verify if pixel_values is part of the encoding + self.assertIn("pixel_values", encoding) + + # verify pixel values: shape + expected_shape = torch.Size([8, 3, 640, 640]) + self.assertEqual(encoding["pixel_values"].shape, expected_shape) + + # verify pixel values: output values + expected_slices = torch.tensor( + [ + [0.5333333611488342, 0.5568627715110779, 0.5647059082984924], + [0.5372549295425415, 0.4705882668495178, 0.4274510145187378], + [0.3960784673690796, 0.35686275362968445, 0.3686274588108063], + [0.20784315466880798, 0.1882353127002716, 0.15294118225574493], + [0.364705890417099, 0.364705890417099, 0.3686274588108063], + [0.8078432083129883, 0.8078432083129883, 0.8078432083129883], + [0.4431372880935669, 0.4431372880935669, 0.4431372880935669], + [0.19607844948768616, 0.21176472306251526, 0.3607843220233917], + ] + ) + self.assertTrue(torch.allclose(encoding["pixel_values"][:, 1, 0, :3], 
expected_slices, atol=1e-5)) @slow def test_batched_coco_detection_annotations(self): @@ -277,89 +284,146 @@ def test_batched_coco_detection_annotations(self): images = [image_0, image_1] annotations = [annotations_0, annotations_1] - image_processing = RTDetrImageProcessor() - encoding = image_processing( - images=images, - annotations=annotations, - return_segmentation_masks=True, - return_tensors="pt", # do_convert_annotations=True - ) + for image_processing_class in self.image_processor_list: + image_processing = image_processing_class() + encoding = image_processing( + images=images, + annotations=annotations, + return_segmentation_masks=True, + return_tensors="pt", # do_convert_annotations=True + ) + + # Check the pixel values have been padded + postprocessed_height, postprocessed_width = 640, 640 + expected_shape = torch.Size([2, 3, postprocessed_height, postprocessed_width]) + self.assertEqual(encoding["pixel_values"].shape, expected_shape) + + # Check the bounding boxes have been adjusted for padded images + self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4])) + self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4])) + expected_boxes_0 = torch.tensor( + [ + [0.6879, 0.4609, 0.0755, 0.3691], + [0.2118, 0.3359, 0.2601, 0.1566], + [0.5011, 0.5000, 0.9979, 1.0000], + [0.5010, 0.5020, 0.9979, 0.9959], + [0.3284, 0.5944, 0.5884, 0.8112], + [0.8394, 0.5445, 0.3213, 0.9110], + ] + ) + expected_boxes_1 = torch.tensor( + [ + [0.5503, 0.2765, 0.0604, 0.2215], + [0.1695, 0.2016, 0.2080, 0.0940], + [0.5006, 0.4933, 0.9977, 0.9865], + [0.5008, 0.5002, 0.9983, 0.9955], + [0.2627, 0.5456, 0.4707, 0.8646], + [0.7715, 0.4115, 0.4570, 0.7161], + ] + ) + self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1e-3)) + self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1e-3)) + + # Check if do_convert_annotations=False, then the annotations are not converted to centre_x, centre_y, width, height + # format and not in the range [0, 1] + encoding = image_processing( + images=images, + annotations=annotations, + return_segmentation_masks=True, + do_convert_annotations=False, + return_tensors="pt", + ) + self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4])) + self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4])) + # Convert to absolute coordinates + unnormalized_boxes_0 = torch.vstack( + [ + expected_boxes_0[:, 0] * postprocessed_width, + expected_boxes_0[:, 1] * postprocessed_height, + expected_boxes_0[:, 2] * postprocessed_width, + expected_boxes_0[:, 3] * postprocessed_height, + ] + ).T + unnormalized_boxes_1 = torch.vstack( + [ + expected_boxes_1[:, 0] * postprocessed_width, + expected_boxes_1[:, 1] * postprocessed_height, + expected_boxes_1[:, 2] * postprocessed_width, + expected_boxes_1[:, 3] * postprocessed_height, + ] + ).T + # Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max + expected_boxes_0 = torch.vstack( + [ + unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2, + unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2, + unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2, + unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2, + ] + ).T + expected_boxes_1 = torch.vstack( + [ + unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2, + unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2, + unnormalized_boxes_1[:, 0] + unnormalized_boxes_1[:, 2] / 2, + unnormalized_boxes_1[:, 1] + 
unnormalized_boxes_1[:, 3] / 2, + ] + ).T + self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1)) + self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1)) - # Check the pixel values have been padded - postprocessed_height, postprocessed_width = 640, 640 - expected_shape = torch.Size([2, 3, postprocessed_height, postprocessed_width]) - self.assertEqual(encoding["pixel_values"].shape, expected_shape) - - # Check the bounding boxes have been adjusted for padded images - self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4])) - self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4])) - expected_boxes_0 = torch.tensor( - [ - [0.6879, 0.4609, 0.0755, 0.3691], - [0.2118, 0.3359, 0.2601, 0.1566], - [0.5011, 0.5000, 0.9979, 1.0000], - [0.5010, 0.5020, 0.9979, 0.9959], - [0.3284, 0.5944, 0.5884, 0.8112], - [0.8394, 0.5445, 0.3213, 0.9110], - ] + @slow + @require_torch_gpu + # Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_fast_processor_equivalence_cpu_gpu_coco_detection_annotations + def test_fast_processor_equivalence_cpu_gpu_coco_detection_annotations(self): + # prepare image and target + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + with open("./tests/fixtures/tests_samples/COCO/coco_annotations.txt", "r") as f: + target = json.loads(f.read()) + + target = {"image_id": 39769, "annotations": target} + + processor = self.image_processor_list[1]() + # 1. run processor on CPU + encoding_cpu = processor(images=image, annotations=target, return_tensors="pt", device="cpu") + # 2. run processor on GPU + encoding_gpu = processor(images=image, annotations=target, return_tensors="pt", device="cuda") + + # verify pixel values + self.assertEqual(encoding_cpu["pixel_values"].shape, encoding_gpu["pixel_values"].shape) + self.assertTrue( + torch.allclose( + encoding_cpu["pixel_values"][0, 0, 0, :3], + encoding_gpu["pixel_values"][0, 0, 0, :3].to("cpu"), + atol=1e-4, + ) ) - expected_boxes_1 = torch.tensor( - [ - [0.5503, 0.2765, 0.0604, 0.2215], - [0.1695, 0.2016, 0.2080, 0.0940], - [0.5006, 0.4933, 0.9977, 0.9865], - [0.5008, 0.5002, 0.9983, 0.9955], - [0.2627, 0.5456, 0.4707, 0.8646], - [0.7715, 0.4115, 0.4570, 0.7161], - ] + # verify area + self.assertTrue(torch.allclose(encoding_cpu["labels"][0]["area"], encoding_gpu["labels"][0]["area"].to("cpu"))) + # verify boxes + self.assertEqual(encoding_cpu["labels"][0]["boxes"].shape, encoding_gpu["labels"][0]["boxes"].shape) + self.assertTrue( + torch.allclose( + encoding_cpu["labels"][0]["boxes"][0], encoding_gpu["labels"][0]["boxes"][0].to("cpu"), atol=1e-3 + ) ) - self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1e-3)) - self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1e-3)) - - # Check if do_convert_annotations=False, then the annotations are not converted to centre_x, centre_y, width, height - # format and not in the range [0, 1] - encoding = image_processing( - images=images, - annotations=annotations, - return_segmentation_masks=True, - do_convert_annotations=False, - return_tensors="pt", + # verify image_id + self.assertTrue( + torch.allclose(encoding_cpu["labels"][0]["image_id"], encoding_gpu["labels"][0]["image_id"].to("cpu")) ) - self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4])) - self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4])) - # Convert to absolute 
coordinates - unnormalized_boxes_0 = torch.vstack( - [ - expected_boxes_0[:, 0] * postprocessed_width, - expected_boxes_0[:, 1] * postprocessed_height, - expected_boxes_0[:, 2] * postprocessed_width, - expected_boxes_0[:, 3] * postprocessed_height, - ] - ).T - unnormalized_boxes_1 = torch.vstack( - [ - expected_boxes_1[:, 0] * postprocessed_width, - expected_boxes_1[:, 1] * postprocessed_height, - expected_boxes_1[:, 2] * postprocessed_width, - expected_boxes_1[:, 3] * postprocessed_height, - ] - ).T - # Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max - expected_boxes_0 = torch.vstack( - [ - unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2, - unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2, - unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2, - unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2, - ] - ).T - expected_boxes_1 = torch.vstack( - [ - unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2, - unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2, - unnormalized_boxes_1[:, 0] + unnormalized_boxes_1[:, 2] / 2, - unnormalized_boxes_1[:, 1] + unnormalized_boxes_1[:, 3] / 2, - ] - ).T - self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1)) - self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1)) + # verify is_crowd + self.assertTrue( + torch.allclose(encoding_cpu["labels"][0]["iscrowd"], encoding_gpu["labels"][0]["iscrowd"].to("cpu")) + ) + # verify class_labels + self.assertTrue( + torch.allclose( + encoding_cpu["labels"][0]["class_labels"], encoding_gpu["labels"][0]["class_labels"].to("cpu") + ) + ) + # verify orig_size + self.assertTrue( + torch.allclose(encoding_cpu["labels"][0]["orig_size"], encoding_gpu["labels"][0]["orig_size"].to("cpu")) + ) + # verify size + self.assertTrue(torch.allclose(encoding_cpu["labels"][0]["size"], encoding_gpu["labels"][0]["size"].to("cpu"))) From 405b56269812056d9593869e22b7b264d806cb1e Mon Sep 17 00:00:00 2001 From: anshumangahlot Date: Thu, 31 Oct 2024 01:07:39 +0530 Subject: [PATCH 159/385] UPDATE Documentation for #TRANSLATING.md Documentation into Multiple Languages.(Changes made) (#34226) * Update TRANSLATING.md * Apply suggestions from code review Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> * Update TRANSLATING.md --------- Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --- docs/TRANSLATING.md | 81 ++++++++++++++++++++++++++------------------- 1 file changed, 47 insertions(+), 34 deletions(-) diff --git a/docs/TRANSLATING.md b/docs/TRANSLATING.md index 49747821f476f0..64dced450987dc 100644 --- a/docs/TRANSLATING.md +++ b/docs/TRANSLATING.md @@ -1,57 +1,70 @@ -### Translating the Transformers documentation into your language +# Translating the Transformers documentation into your language -As part of our mission to democratize machine learning, we'd love to make the Transformers library available in many more languages! Follow the steps below if you want to help translate the documentation into your language 🙏. +As part of our mission to democratize machine learning, we aim to make the Transformers library available in many more languages! Follow the steps below to help translate the documentation into your language. -**🗞️ Open an issue** +## Open an Issue -To get started, navigate to the [Issues](https://github.com/huggingface/transformers/issues) page of this repo and check if anyone else has opened an issue for your language. 
If not, open a new issue by selecting the "Translation template" from the "New issue" button. +1. Navigate to the Issues page of this repository. +2. Check if anyone has already opened an issue for your language. +3. If not, create a new issue by selecting the "Translation template" from the "New issue" button. +4. Post a comment indicating which chapters you’d like to work on, and we’ll add your name to the list. -Once an issue exists, post a comment to indicate which chapters you'd like to work on, and we'll add your name to the list. +## Fork the Repository +1. First, fork the Transformers repo by clicking the Fork button in the top-right corner. +2. Clone your fork to your local machine for editing with the following command: -**🍴 Fork the repository** + ```bash + git clone https://github.com/YOUR-USERNAME/transformers.git + ``` + + Replace `YOUR-USERNAME` with your GitHub username. -First, you'll need to [fork the Transformers repo](https://docs.github.com/en/get-started/quickstart/fork-a-repo). You can do this by clicking on the **Fork** button on the top-right corner of this repo's page. +## Copy-paste the English version with a new language code -Once you've forked the repo, you'll want to get the files on your local machine for editing. You can do that by cloning the fork with Git as follows: +The documentation files are organized in the following directory: -```bash -git clone https://github.com/YOUR-USERNAME/transformers.git -``` +- **docs/source**: This contains all documentation materials organized by language. -**📋 Copy-paste the English version with a new language code** +To copy the English version to your new language directory: -The documentation files are in one leading directory: +1. Navigate to your fork of the repository: -- [`docs/source`](https://github.com/huggingface/transformers/tree/main/docs/source): All the documentation materials are organized here by language. + ```bash + cd ~/path/to/transformers/docs + ``` -You'll only need to copy the files in the [`docs/source/en`](https://github.com/huggingface/transformers/tree/main/docs/source/en) directory, so first navigate to your fork of the repo and run the following: + Replace `~/path/to` with your actual path. -```bash -cd ~/path/to/transformers/docs -cp -r source/en source/LANG-ID -``` +2. Run the following command: -Here, `LANG-ID` should be one of the ISO 639-1 or ISO 639-2 language codes -- see [here](https://www.loc.gov/standards/iso639-2/php/code_list.php) for a handy table. + ```bash + cp -r source/en source/LANG-ID + ``` -**✍️ Start translating** + Replace `LANG-ID` with the appropriate ISO 639-1 or ISO 639-2 language code (see [this table](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) for reference). -The fun part comes - translating the text! +## Start translating -The first thing we recommend is translating the part of the `_toctree.yml` file that corresponds to your doc chapter. This file is used to render the table of contents on the website. +Begin translating the text! -> 🙋 If the `_toctree.yml` file doesn't yet exist for your language, you can create one by copy-pasting from the English version and deleting the sections unrelated to your chapter. Just make sure it exists in the `docs/source/LANG-ID/` directory! +1. Start with the `_toctree.yml` file that corresponds to your documentation chapter. This file is essential for rendering the table of contents on the website. -The fields you should add are `local` (with the name of the file containing the translation; e.g. 
`autoclass_tutorial`), and `title` (with the title of the doc in your language; e.g. `Load pretrained instances with an AutoClass`) -- as a reference, here is the `_toctree.yml` for [English](https://github.com/huggingface/transformers/blob/main/docs/source/en/_toctree.yml): + - If the `_toctree.yml` file doesn’t exist for your language, create one by copying the English version and removing unrelated sections. + - Ensure it is placed in the `docs/source/LANG-ID/` directory. -```yaml -- sections: - - local: pipeline_tutorial # Do not change this! Use the same name for your .md file - title: Pipelines for inference # Translate this! - ... - title: Tutorials # Translate this! -``` + Here’s an example structure for the `_toctree.yml` file: -Once you have translated the `_toctree.yml` file, you can start translating the [MDX](https://mdxjs.com/) files associated with your docs chapter. + ```yaml + - sections: + - local: pipeline_tutorial # Keep this name for your .md file + title: Pipelines for Inference # Translate this + ... + title: Tutorials # Translate this + ``` -> 🙋 If you'd like others to help you with the translation, you should [open an issue](https://github.com/huggingface/transformers/issues) and tag @stevhliu. +2. Once you’ve translated the `_toctree.yml`, move on to translating the associated MDX files. + +## Collaborate and share + +If you'd like assistance with your translation, open an issue and tag `@stevhliu`. Feel free to share resources or glossaries to ensure consistent terminology. From f38531619ddff23a510d5f7ccbc257a1bb1a3cb7 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Thu, 31 Oct 2024 20:55:53 +0800 Subject: [PATCH 160/385] enable QA bf16 pipeline (#34483) * enable QA bf16 pipeline * add tests --- .../pipelines/question_answering.py | 10 ++++-- .../test_pipelines_question_answering.py | 33 +++++++++++++++++++ 2 files changed, 41 insertions(+), 2 deletions(-) diff --git a/src/transformers/pipelines/question_answering.py b/src/transformers/pipelines/question_answering.py index 6039e5ad1ee989..7b876eefc49279 100644 --- a/src/transformers/pipelines/question_answering.py +++ b/src/transformers/pipelines/question_answering.py @@ -540,8 +540,14 @@ def postprocess( min_null_score = 1000000 # large and positive answers = [] for output in model_outputs: - start_ = output["start"] - end_ = output["end"] + if self.framework == "pt" and output["start"].dtype == torch.bfloat16: + start_ = output["start"].to(torch.float32) + else: + start_ = output["start"] + if self.framework == "pt" and output["start"].dtype == torch.bfloat16: + end_ = output["end"].to(torch.float32) + else: + end_ = output["end"] example = output["example"] p_mask = output["p_mask"] attention_mask = ( diff --git a/tests/pipelines/test_pipelines_question_answering.py b/tests/pipelines/test_pipelines_question_answering.py index d06f88d1f08844..bf4fc7db1db6b5 100644 --- a/tests/pipelines/test_pipelines_question_answering.py +++ b/tests/pipelines/test_pipelines_question_answering.py @@ -27,6 +27,7 @@ from transformers.testing_utils import ( compare_pipeline_output_to_hub_spec, is_pipeline_test, + is_torch_available, nested_simplify, require_tf, require_torch, @@ -34,6 +35,10 @@ slow, ) + +if is_torch_available(): + import torch + from .test_pipelines_common import ANY @@ -165,6 +170,34 @@ def test_small_model_pt(self): self.assertEqual(nested_simplify(outputs), {"score": 0.01, "start": 0, "end": 11, "answer": "HuggingFace"}) + @require_torch + def test_small_model_pt_fp16(self): + question_answerer = pipeline( + 
"question-answering", + model="sshleifer/tiny-distilbert-base-cased-distilled-squad", + torch_dtype=torch.float16, + ) + + outputs = question_answerer( + question="Where was HuggingFace founded ?", context="HuggingFace was founded in Paris." + ) + + self.assertEqual(nested_simplify(outputs), {"score": 0.01, "start": 0, "end": 11, "answer": "HuggingFace"}) + + @require_torch + def test_small_model_pt_bf16(self): + question_answerer = pipeline( + "question-answering", + model="sshleifer/tiny-distilbert-base-cased-distilled-squad", + torch_dtype=torch.bfloat16, + ) + + outputs = question_answerer( + question="Where was HuggingFace founded ?", context="HuggingFace was founded in Paris." + ) + + self.assertEqual(nested_simplify(outputs), {"score": 0.01, "start": 0, "end": 11, "answer": "HuggingFace"}) + @require_torch def test_small_model_pt_iterator(self): # https://github.com/huggingface/transformers/issues/18510 From 1b86772de5d8429ff5ffe5caf7855548d2f4f072 Mon Sep 17 00:00:00 2001 From: jp Date: Thu, 31 Oct 2024 22:32:45 +0900 Subject: [PATCH 161/385] Fix: img size mismatch caused by incorrect unpadding in LLaVA-Next (#34522) Fix: unpadding img mismatch --- src/transformers/models/llava_next/modeling_llava_next.py | 4 ++-- .../models/llava_next_video/modeling_llava_next_video.py | 4 ++-- .../models/llava_onevision/modeling_llava_onevision.py | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/transformers/models/llava_next/modeling_llava_next.py b/src/transformers/models/llava_next/modeling_llava_next.py index 5a49337b2b5d96..2d23c48225cd00 100644 --- a/src/transformers/models/llava_next/modeling_llava_next.py +++ b/src/transformers/models/llava_next/modeling_llava_next.py @@ -138,12 +138,12 @@ def unpad_image(tensor, original_size): if original_aspect_ratio > current_aspect_ratio: scale_factor = current_width / original_width - new_height = int(original_height * scale_factor) + new_height = int(round(original_height * scale_factor, 7)) padding = (current_height - new_height) // 2 unpadded_tensor = tensor[:, padding : current_height - padding, :] else: scale_factor = current_height / original_height - new_width = int(original_width * scale_factor) + new_width = int(round(original_width * scale_factor, 7)) padding = (current_width - new_width) // 2 unpadded_tensor = tensor[:, :, padding : current_width - padding] diff --git a/src/transformers/models/llava_next_video/modeling_llava_next_video.py b/src/transformers/models/llava_next_video/modeling_llava_next_video.py index 85c109919da736..a2328c1d2d92cb 100644 --- a/src/transformers/models/llava_next_video/modeling_llava_next_video.py +++ b/src/transformers/models/llava_next_video/modeling_llava_next_video.py @@ -143,12 +143,12 @@ def unpad_image(tensor, original_size): if original_aspect_ratio > current_aspect_ratio: scale_factor = current_width / original_width - new_height = int(original_height * scale_factor) + new_height = int(round(original_height * scale_factor, 7)) padding = (current_height - new_height) // 2 unpadded_tensor = tensor[:, padding : current_height - padding, :] else: scale_factor = current_height / original_height - new_width = int(original_width * scale_factor) + new_width = int(round(original_width * scale_factor, 7)) padding = (current_width - new_width) // 2 unpadded_tensor = tensor[:, :, padding : current_width - padding] diff --git a/src/transformers/models/llava_onevision/modeling_llava_onevision.py b/src/transformers/models/llava_onevision/modeling_llava_onevision.py index 
2aa6b2fa1d6fa5..626db4d96aae2e 100644 --- a/src/transformers/models/llava_onevision/modeling_llava_onevision.py +++ b/src/transformers/models/llava_onevision/modeling_llava_onevision.py @@ -139,12 +139,12 @@ def unpad_image(tensor, original_size): if original_aspect_ratio > current_aspect_ratio: scale_factor = current_width / original_width - new_height = int(original_height * scale_factor) + new_height = int(round(original_height * scale_factor, 7)) padding = (current_height - new_height) // 2 unpadded_tensor = tensor[:, padding : current_height - padding, :] else: scale_factor = current_height / original_height - new_width = int(original_width * scale_factor) + new_width = int(round(original_width * scale_factor, 7)) padding = (current_width - new_width) // 2 unpadded_tensor = tensor[:, :, padding : current_width - padding] From dca93ca076c68372dcf3ad1239a2119afdda629c Mon Sep 17 00:00:00 2001 From: kibitzing Date: Thu, 31 Oct 2024 22:53:23 +0900 Subject: [PATCH 162/385] Fix step shifting when accumulate gradient (#33673) * replace total_batched_samples with step while counting grad accum step * remove unused variable * simplify condition for update step * fix format by ruff * simplify update step condition using accelerator.sync_gradients * simplify update condition using do_sync_step * remove print for test --------- Co-authored-by: Zach Mueller --- src/transformers/trainer.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index e2ae622e2b6bf3..30caa2de260cb7 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -2404,7 +2404,6 @@ def _inner_training_loop( if args.eval_on_start: self._evaluate(trial, ignore_keys_for_eval, skip_scheduler=True) - total_batched_samples = 0 for epoch in range(epochs_trained, num_train_epochs): epoch_dataloader = train_dataloader if hasattr(epoch_dataloader, "set_epoch"): @@ -2447,13 +2446,7 @@ def _inner_training_loop( batch_samples, num_items_in_batch = self.get_batch_samples(epoch_iterator, num_batches) for inputs in batch_samples: step += 1 - total_batched_samples += 1 - is_last_step_and_steps_less_than_grad_acc = ( - steps_in_epoch <= args.gradient_accumulation_steps and (step + 1) == steps_in_epoch - ) - do_sync_step = is_last_step_and_steps_less_than_grad_acc or ( - total_batched_samples % args.gradient_accumulation_steps == 0 - ) + do_sync_step = (step + 1) % args.gradient_accumulation_steps == 0 or (step + 1) == steps_in_epoch # Since we perform prefetching, we need to manually set sync_gradients if not do_sync_step: self.accelerator.gradient_state._set_sync_gradients(False) From ab98f0b0a1cd90b1c72948daf83c098037212fc4 Mon Sep 17 00:00:00 2001 From: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Date: Thu, 31 Oct 2024 16:36:13 +0100 Subject: [PATCH 163/385] avoid calling `gc.collect` and `cuda.empty_cache` (#34514) * update * update * update * update * update --------- Co-authored-by: ydshieh --- src/transformers/testing_utils.py | 8 ++++++++ tests/models/clvp/test_feature_extraction_clvp.py | 12 ++++++++---- tests/models/clvp/test_modeling_clvp.py | 14 +++++--------- tests/models/ctrl/test_modeling_ctrl.py | 9 +++------ tests/models/gpt2/test_modeling_gpt2.py | 9 +++------ .../gpt_bigcode/test_modeling_gpt_bigcode.py | 8 ++++---- tests/models/idefics2/test_modeling_idefics2.py | 5 ++--- tests/models/idefics3/test_modeling_idefics3.py | 6 ++---- tests/models/llama/test_modeling_llama.py | 6 ++---- 
tests/models/llava/test_modeling_llava.py | 5 ++--- .../models/llava_next/test_modeling_llava_next.py | 5 ++--- .../test_modeling_llava_next_video.py | 5 ++--- .../test_modeling_llava_onevision.py | 5 ++--- tests/models/mistral/test_modeling_mistral.py | 7 +++---- tests/models/mllama/test_modeling_mllama.py | 5 ++--- tests/models/paligemma/test_modeling_paligemma.py | 5 ++--- .../qwen2_audio/test_modeling_qwen2_audio.py | 5 ++--- tests/models/rag/test_modeling_rag.py | 11 ++++------- tests/models/sam/test_modeling_sam.py | 6 ++---- tests/models/univnet/test_modeling_univnet.py | 6 ++---- .../video_llava/test_modeling_video_llava.py | 5 ++--- tests/models/vipllava/test_modeling_vipllava.py | 13 +++++++++---- tests/models/wav2vec2/test_modeling_wav2vec2.py | 6 ++---- tests/models/xglm/test_modeling_xglm.py | 5 ++--- 24 files changed, 77 insertions(+), 94 deletions(-) diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py index 0eef286732d81c..8d6c1b19377eca 100644 --- a/src/transformers/testing_utils.py +++ b/src/transformers/testing_utils.py @@ -16,6 +16,7 @@ import contextlib import doctest import functools +import gc import importlib import inspect import logging @@ -2679,3 +2680,10 @@ def compare_pipeline_output_to_hub_spec(output, hub_spec): if unexpected_keys: error.append(f"Keys in pipeline output that are not in Hub spec: {unexpected_keys}") raise KeyError("\n".join(error)) + + +@require_torch +def cleanup(device: str, gc_collect=False): + if gc_collect: + gc.collect() + backend_empty_cache(device) diff --git a/tests/models/clvp/test_feature_extraction_clvp.py b/tests/models/clvp/test_feature_extraction_clvp.py index db641eaf6145cb..1f059ca46944e1 100644 --- a/tests/models/clvp/test_feature_extraction_clvp.py +++ b/tests/models/clvp/test_feature_extraction_clvp.py @@ -13,7 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import gc import itertools import os import random @@ -24,7 +23,13 @@ from datasets import Audio, load_dataset from transformers import ClvpFeatureExtractor -from transformers.testing_utils import check_json_file_has_correct_format, require_torch, slow +from transformers.testing_utils import ( + check_json_file_has_correct_format, + cleanup, + require_torch, + slow, + torch_device, +) from transformers.utils.import_utils import is_torch_available from ...test_sequence_feature_extraction_common import SequenceFeatureExtractionTestMixin @@ -116,8 +121,7 @@ def setUp(self): def tearDown(self): super().tearDown() # clean-up as much as possible GPU memory occupied by PyTorch - gc.collect() - torch.cuda.empty_cache() + cleanup(torch_device) # Copied from transformers.tests.models.whisper.test_feature_extraction_whisper.WhisperFeatureExtractionTest.test_feat_extract_from_and_save_pretrained def test_feat_extract_from_and_save_pretrained(self): diff --git a/tests/models/clvp/test_modeling_clvp.py b/tests/models/clvp/test_modeling_clvp.py index 0cf89a74523364..12e58500063a17 100644 --- a/tests/models/clvp/test_modeling_clvp.py +++ b/tests/models/clvp/test_modeling_clvp.py @@ -14,7 +14,6 @@ # limitations under the License. 
"""Testing suite for the PyTorch Clvp model.""" -import gc import tempfile import unittest @@ -23,6 +22,7 @@ from transformers import ClvpConfig, ClvpDecoderConfig, ClvpEncoderConfig from transformers.testing_utils import ( + cleanup, require_torch, slow, torch_device, @@ -174,8 +174,7 @@ def setUp(self): def tearDown(self): super().tearDown() # clean-up as much as possible GPU memory occupied by PyTorch - gc.collect() - torch.cuda.empty_cache() + cleanup(torch_device) def test_config(self): self.encoder_config_tester.run_common_tests() @@ -294,8 +293,7 @@ def setUp(self): def tearDown(self): super().tearDown() # clean-up as much as possible GPU memory occupied by PyTorch - gc.collect() - torch.cuda.empty_cache() + cleanup(torch_device) def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() @@ -421,8 +419,7 @@ def setUp(self): def tearDown(self): super().tearDown() # clean-up as much as possible GPU memory occupied by PyTorch - gc.collect() - torch.cuda.empty_cache() + cleanup(torch_device) def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() @@ -571,8 +568,7 @@ def setUp(self): def tearDown(self): super().tearDown() # clean-up as much as possible GPU memory occupied by PyTorch - gc.collect() - torch.cuda.empty_cache() + cleanup(torch_device, gc_collect=True) def test_conditional_encoder(self): with torch.no_grad(): diff --git a/tests/models/ctrl/test_modeling_ctrl.py b/tests/models/ctrl/test_modeling_ctrl.py index a9bdddd7bfe25e..88efa9bb189161 100644 --- a/tests/models/ctrl/test_modeling_ctrl.py +++ b/tests/models/ctrl/test_modeling_ctrl.py @@ -13,11 +13,10 @@ # limitations under the License. -import gc import unittest from transformers import CTRLConfig, is_torch_available -from transformers.testing_utils import backend_empty_cache, require_torch, slow, torch_device +from transformers.testing_utils import cleanup, require_torch, slow, torch_device from ...generation.test_utils import GenerationTesterMixin from ...test_configuration_common import ConfigTester @@ -235,8 +234,7 @@ def setUp(self): def tearDown(self): super().tearDown() # clean-up as much as possible GPU memory occupied by PyTorch - gc.collect() - backend_empty_cache(torch_device) + cleanup(torch_device) def test_config(self): self.config_tester.run_common_tests() @@ -261,8 +259,7 @@ class CTRLModelLanguageGenerationTest(unittest.TestCase): def tearDown(self): super().tearDown() # clean-up as much as possible GPU memory occupied by PyTorch - gc.collect() - backend_empty_cache(torch_device) + cleanup(torch_device, gc_collect=True) @slow def test_lm_generate_ctrl(self): diff --git a/tests/models/gpt2/test_modeling_gpt2.py b/tests/models/gpt2/test_modeling_gpt2.py index 3f96c20ab2dbd9..012444b472c0fc 100644 --- a/tests/models/gpt2/test_modeling_gpt2.py +++ b/tests/models/gpt2/test_modeling_gpt2.py @@ -15,7 +15,6 @@ import datetime -import gc import math import unittest @@ -23,7 +22,7 @@ from transformers import GPT2Config, is_torch_available from transformers.testing_utils import ( - backend_empty_cache, + cleanup, require_flash_attn, require_torch, require_torch_gpu, @@ -542,8 +541,7 @@ def setUp(self): def tearDown(self): super().tearDown() # clean-up as much as possible GPU memory occupied by PyTorch - gc.collect() - backend_empty_cache(torch_device) + cleanup(torch_device) def test_config(self): self.config_tester.run_common_tests() @@ -753,8 +751,7 @@ class GPT2ModelLanguageGenerationTest(unittest.TestCase): def tearDown(self): super().tearDown() # 
clean-up as much as possible GPU memory occupied by PyTorch - gc.collect() - backend_empty_cache(torch_device) + cleanup(torch_device, gc_collect=True) def _test_lm_generate_gpt2_helper( self, diff --git a/tests/models/gpt_bigcode/test_modeling_gpt_bigcode.py b/tests/models/gpt_bigcode/test_modeling_gpt_bigcode.py index 9d7750f5cf20cc..1db484c4062c35 100644 --- a/tests/models/gpt_bigcode/test_modeling_gpt_bigcode.py +++ b/tests/models/gpt_bigcode/test_modeling_gpt_bigcode.py @@ -18,7 +18,7 @@ from parameterized import parameterized from transformers import GPTBigCodeConfig, is_torch_available -from transformers.testing_utils import require_torch, slow, torch_device +from transformers.testing_utils import cleanup, require_torch, slow, torch_device from ...generation.test_utils import GenerationTesterMixin from ...test_configuration_common import ConfigTester @@ -422,9 +422,9 @@ def setUp(self): self.config_tester = ConfigTester(self, config_class=GPTBigCodeConfig, n_embd=37) def tearDown(self): - import gc - - gc.collect() + super().tearDown() + # clean-up as much as possible GPU memory occupied by PyTorch + cleanup(torch_device) def test_config(self): self.config_tester.run_common_tests() diff --git a/tests/models/idefics2/test_modeling_idefics2.py b/tests/models/idefics2/test_modeling_idefics2.py index 042fecf4bd25f7..0b0f3c1f3d8483 100644 --- a/tests/models/idefics2/test_modeling_idefics2.py +++ b/tests/models/idefics2/test_modeling_idefics2.py @@ -15,7 +15,6 @@ """Testing suite for the PyTorch Idefics2 model.""" import copy -import gc import tempfile import unittest from io import BytesIO @@ -31,6 +30,7 @@ is_vision_available, ) from transformers.testing_utils import ( + cleanup, require_bitsandbytes, require_flash_attn, require_torch, @@ -583,8 +583,7 @@ def setUp(self): ) def tearDown(self): - gc.collect() - torch.cuda.empty_cache() + cleanup(torch_device, gc_collect=True) @slow @require_torch_multi_gpu diff --git a/tests/models/idefics3/test_modeling_idefics3.py b/tests/models/idefics3/test_modeling_idefics3.py index 5dc352d22fe0c0..dc5aad2fd04395 100644 --- a/tests/models/idefics3/test_modeling_idefics3.py +++ b/tests/models/idefics3/test_modeling_idefics3.py @@ -15,7 +15,6 @@ """Testing suite for the PyTorch Idefics3 model.""" import copy -import gc import unittest from io import BytesIO @@ -26,7 +25,7 @@ is_torch_available, is_vision_available, ) -from transformers.testing_utils import require_bitsandbytes, require_torch, slow, torch_device +from transformers.testing_utils import cleanup, require_bitsandbytes, require_torch, slow, torch_device from ...generation.test_utils import GenerationTesterMixin from ...test_configuration_common import ConfigTester @@ -497,8 +496,7 @@ def setUp(self): ) def tearDown(self): - gc.collect() - torch.cuda.empty_cache() + cleanup(torch_device, gc_collect=True) @slow @unittest.skip("multi-gpu tests are disabled for now") diff --git a/tests/models/llama/test_modeling_llama.py b/tests/models/llama/test_modeling_llama.py index 375ec1dd3e6f3a..9e67f4f7381e24 100644 --- a/tests/models/llama/test_modeling_llama.py +++ b/tests/models/llama/test_modeling_llama.py @@ -14,7 +14,6 @@ # limitations under the License. 
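One behavioural detail worth noting in the hunks above: the slow generation/integration tearDowns pass `gc_collect=True`, while the per-test tearDowns rely on the default, so several of them (for example the Clvp and GPTBigCode ones) no longer run an unconditional `gc.collect()`; only the accelerator cache is emptied. The two call styles, roughly (hypothetical class names, for illustration only):

```python
# The two call styles used across the hunks above (illustrative only).
import unittest

from transformers.testing_utils import cleanup, torch_device


class SomeModelTest(unittest.TestCase):
    def tearDown(self):
        super().tearDown()
        cleanup(torch_device)                   # cheap per-test teardown: empty the cache only


class SomeModelIntegrationTest(unittest.TestCase):
    def tearDown(self):
        super().tearDown()
        cleanup(torch_device, gc_collect=True)  # slow integration teardown: also force a gc pass
```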
"""Testing suite for the PyTorch LLaMA model.""" -import gc import tempfile import unittest @@ -25,7 +24,7 @@ from transformers import AutoTokenizer, LlamaConfig, StaticCache, is_torch_available, set_seed from transformers.generation.configuration_utils import GenerationConfig from transformers.testing_utils import ( - backend_empty_cache, + cleanup, require_flash_attn, require_read_token, require_torch, @@ -891,8 +890,7 @@ def test_export_static_cache(self): @require_torch_accelerator class Mask4DTestHard(unittest.TestCase): def tearDown(self): - gc.collect() - backend_empty_cache(torch_device) + cleanup(torch_device, gc_collect=True) def setUp(self): model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" diff --git a/tests/models/llava/test_modeling_llava.py b/tests/models/llava/test_modeling_llava.py index 1a17f18de34234..af0eddcd35b897 100644 --- a/tests/models/llava/test_modeling_llava.py +++ b/tests/models/llava/test_modeling_llava.py @@ -14,7 +14,6 @@ # limitations under the License. """Testing suite for the PyTorch Llava model.""" -import gc import unittest import requests @@ -28,6 +27,7 @@ is_vision_available, ) from transformers.testing_utils import ( + cleanup, require_bitsandbytes, require_torch, require_torch_gpu, @@ -307,8 +307,7 @@ def setUp(self): self.processor = AutoProcessor.from_pretrained("llava-hf/bakLlava-v1-hf") def tearDown(self): - gc.collect() - torch.cuda.empty_cache() + cleanup(torch_device, gc_collect=True) @slow @require_bitsandbytes diff --git a/tests/models/llava_next/test_modeling_llava_next.py b/tests/models/llava_next/test_modeling_llava_next.py index e088b2505366f6..e960f9f6759981 100644 --- a/tests/models/llava_next/test_modeling_llava_next.py +++ b/tests/models/llava_next/test_modeling_llava_next.py @@ -14,7 +14,6 @@ # limitations under the License. """Testing suite for the PyTorch Llava-NeXT model.""" -import gc import unittest import requests @@ -28,6 +27,7 @@ is_vision_available, ) from transformers.testing_utils import ( + cleanup, require_bitsandbytes, require_torch, slow, @@ -370,8 +370,7 @@ def setUp(self): self.prompt = "[INST] \nWhat is shown in this image? [/INST]" def tearDown(self): - gc.collect() - torch.cuda.empty_cache() + cleanup(torch_device, gc_collect=True) @slow @require_bitsandbytes diff --git a/tests/models/llava_next_video/test_modeling_llava_next_video.py b/tests/models/llava_next_video/test_modeling_llava_next_video.py index edf1dd2d4c07a4..89cdce65ece95d 100644 --- a/tests/models/llava_next_video/test_modeling_llava_next_video.py +++ b/tests/models/llava_next_video/test_modeling_llava_next_video.py @@ -14,7 +14,6 @@ # limitations under the License. """Testing suite for the PyTorch Llava-NeXT-Video model.""" -import gc import unittest import numpy as np @@ -29,6 +28,7 @@ is_vision_available, ) from transformers.testing_utils import ( + cleanup, require_bitsandbytes, require_torch, slow, @@ -400,8 +400,7 @@ def setUp(self): self.prompt_video = "USER: