From 8a08b6bdd0033fb22f53b5a9c79d13103ecb5813 Mon Sep 17 00:00:00 2001
From: raushan <raushan@huggingface.co>
Date: Wed, 30 Oct 2024 09:57:33 +0100
Subject: [PATCH 01/15] fix tests

---
 .../llava_next_video/modular_llava_next_video.py    | 13 ++++++++++++-
 .../models/video_llava/modeling_video_llava.py      | 13 ++++++++++++-
 2 files changed, 24 insertions(+), 2 deletions(-)

diff --git a/src/transformers/models/llava_next_video/modular_llava_next_video.py b/src/transformers/models/llava_next_video/modular_llava_next_video.py
index e9974e920493ff..c1ed7571941b9e 100644
--- a/src/transformers/models/llava_next_video/modular_llava_next_video.py
+++ b/src/transformers/models/llava_next_video/modular_llava_next_video.py
@@ -623,6 +623,17 @@ def prepare_inputs_for_generation(
     ):
         # Overwritten -- extra custom processing
 
+        if input_ids is not None:
+            img_token_not_enough = (input_ids == self.config.image_token_index).sum(
+                1
+            ).max() < self.config.image_seq_length
+            video_token_not_enough = (input_ids == self.config.video_token_index).sum(
+                1
+            ).max() < self.config.video_seq_length
+            legacy_processing = (img_token_not_enough and pixel_values is not None) or (
+                video_token_not_enough and pixel_values_videos is not None
+            )
+
         model_inputs = self.language_model.prepare_inputs_for_generation(
             input_ids,
             past_key_values=past_key_values,
@@ -635,7 +646,7 @@ def prepare_inputs_for_generation(
 
         # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore
         # Otherwise we need pixel values to be passed to model
-        if cache_position[0] == 0:
+        if legacy_processing or cache_position[0] == 0:
             model_inputs["pixel_values"] = pixel_values
             model_inputs["pixel_values_videos"] = pixel_values_videos
             model_inputs["image_sizes"] = image_sizes
diff --git a/src/transformers/models/video_llava/modeling_video_llava.py b/src/transformers/models/video_llava/modeling_video_llava.py
index 30f82e45056c77..a9bd8b745a6f68 100644
--- a/src/transformers/models/video_llava/modeling_video_llava.py
+++ b/src/transformers/models/video_llava/modeling_video_llava.py
@@ -720,6 +720,17 @@ def prepare_inputs_for_generation(
     ):
         # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
 
+        if input_ids is not None:
+            img_token_not_enough = (input_ids == self.config.image_token_index).sum(
+                1
+            ).max() < self.config.image_seq_length
+            video_token_not_enough = (input_ids == self.config.video_token_index).sum(
+                1
+            ).max() < self.config.video_seq_length
+            legacy_processing = (img_token_not_enough and pixel_values_images is not None) or (
+                video_token_not_enough and pixel_values_videos is not None
+            )
+
         model_inputs = self.language_model.prepare_inputs_for_generation(
             input_ids,
             past_key_values=past_key_values,
@@ -730,7 +741,7 @@ def prepare_inputs_for_generation(
             **kwargs,
         )
 
-        if cache_position[0] == 0:
+        if legacy_processing or cache_position[0] == 0:
             # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore
             # Otherwise we need pixel values to be passed to model
             model_inputs["pixel_values_images"] = pixel_values_images

From a022e60df2655f37b9d7b5ead4a66d9df0ed360e Mon Sep 17 00:00:00 2001
From: raushan <raushan@huggingface.co>
Date: Wed, 30 Oct 2024 09:59:25 +0100
Subject: [PATCH 02/15] [run-slow] llava_next_video


From b0e1c7c3764509b2d4d3cd53e5f4d4ef0950a7fb Mon Sep 17 00:00:00 2001
From: raushan <raushan@huggingface.co>
Date: Wed, 30 Oct 2024 10:26:24 +0100
Subject: [PATCH 03/15] fix copies

---
 .../llava_next_video/modeling_llava_next_video.py   | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/src/transformers/models/llava_next_video/modeling_llava_next_video.py b/src/transformers/models/llava_next_video/modeling_llava_next_video.py
index 44b372535d70bd..96f4373afd9ec6 100644
--- a/src/transformers/models/llava_next_video/modeling_llava_next_video.py
+++ b/src/transformers/models/llava_next_video/modeling_llava_next_video.py
@@ -1110,6 +1110,17 @@ def prepare_inputs_for_generation(
     ):
         # Overwritten -- extra custom processing
 
+        if input_ids is not None:
+            img_token_not_enough = (input_ids == self.config.image_token_index).sum(
+                1
+            ).max() < self.config.image_seq_length
+            video_token_not_enough = (input_ids == self.config.video_token_index).sum(
+                1
+            ).max() < self.config.video_seq_length
+            legacy_processing = (img_token_not_enough and pixel_values is not None) or (
+                video_token_not_enough and pixel_values_videos is not None
+            )
+
         model_inputs = self.language_model.prepare_inputs_for_generation(
             input_ids,
             past_key_values=past_key_values,
@@ -1122,7 +1133,7 @@ def prepare_inputs_for_generation(
 
         # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore
         # Otherwise we need pixel values to be passed to model
-        if cache_position[0] == 0:
+        if legacy_processing or cache_position[0] == 0:
             model_inputs["pixel_values"] = pixel_values
             model_inputs["pixel_values_videos"] = pixel_values_videos
             model_inputs["image_sizes"] = image_sizes

From 2bfd72217dcc03ab7ba99974ab38822cd5cee795 Mon Sep 17 00:00:00 2001
From: raushan <raushan@huggingface.co>
Date: Wed, 30 Oct 2024 10:26:26 +0100
Subject: [PATCH 04/15] [run-slow] llava_next_video


From 931b03a393955f709b18e10b0153d253882d2b50 Mon Sep 17 00:00:00 2001
From: raushan <raushan@huggingface.co>
Date: Fri, 22 Nov 2024 14:40:42 +0100
Subject: [PATCH 05/15] remove legacy in all models

---
 .../models/llava/modeling_llava.py            |  56 -------
 .../models/llava/processing_llava.py          |  30 ++--
 .../models/llava_next/modeling_llava_next.py  |  61 --------
 .../llava_next/processing_llava_next.py       |  34 ++--
 .../modular_llava_next_video.py               | 145 ++++--------------
 .../processing_llava_next_video.py            |  61 ++++----
 .../video_llava/modeling_video_llava.py       | 145 ++++--------------
 .../video_llava/processing_video_llava.py     |  15 +-
 .../models/vipllava/modeling_vipllava.py      |  54 -------
 tests/models/llava/test_modeling_llava.py     |  33 +---
 .../models/vipllava/test_modeling_vipllava.py |  32 ++--
 11 files changed, 128 insertions(+), 538 deletions(-)

diff --git a/src/transformers/models/llava/modeling_llava.py b/src/transformers/models/llava/modeling_llava.py
index a0079f1787a2e9..626a7146e537fb 100644
--- a/src/transformers/models/llava/modeling_llava.py
+++ b/src/transformers/models/llava/modeling_llava.py
@@ -461,18 +461,9 @@ def forward(
                 "You cannot specify both pixel_values and inputs_embeds at the same time, and must specify either one"
             )
 
-        legacy_processing = False
         if inputs_embeds is None:
             inputs_embeds = self.get_input_embeddings()(input_ids)
 
-            # if the number of image tokens is more than image embeddings seq length, then prob we expanded it in processing
-            # not very reliable, but we don't expect one to actually pass 500+ images for one prompt
-            # In case we're in decoding stage, legacy behavior is checked by presence of pixel values even if use_cache=True
-            legacy_processing = (
-                (input_ids == self.config.image_token_index).sum(1).max() < self.config.image_seq_length
-            ) or (input_ids.shape[-1] == 1 and pixel_values is not None)
-
-        image_features = None
         if pixel_values is not None:
             image_features = self.get_image_features(
                 pixel_values=pixel_values,
@@ -480,53 +471,6 @@ def forward(
                 vision_feature_select_strategy=vision_feature_select_strategy,
             )
 
-        if legacy_processing:
-            logger.warning_once(
-                "Expanding inputs for image tokens in LLaVa should be done in processing. "
-                "Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly "
-                "with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. "
-                "Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
-            )
-            # prefill stage vs decoding stage (legacy behavior copied)
-            if input_ids.shape[1] != 1:
-                inputs_embeds, attention_mask, labels, position_ids = self._merge_input_ids_with_image_features(
-                    image_features, inputs_embeds, input_ids, attention_mask, labels
-                )
-                cache_position = torch.arange(attention_mask.shape[1], device=attention_mask.device)
-            else:
-                # Retrieve the first layer to inspect the logits and mask out the hidden states
-                # that are set to 0
-                first_layer_past_key_value = past_key_values[0][0][:, :, :, 0]
-
-                # Sum all dimensions of head_dim (-2) to avoid random errors such as: https://github.com/huggingface/transformers/pull/28032#issuecomment-1863691941
-                batch_index, non_attended_tokens = torch.where(first_layer_past_key_value.float().sum(-2) == 0)
-
-                # Get the target length
-                target_length = input_ids.shape[1]
-                past_length = first_layer_past_key_value.shape[-1]
-
-                extended_attention_mask = torch.ones(
-                    (attention_mask.shape[0], past_length),
-                    dtype=attention_mask.dtype,
-                    device=attention_mask.device,
-                )
-
-                # Filter out only the tokens that can be un-attended, this can happen
-                # if one uses Llava + Fused modules where the cache on the
-                # first iteration is already big enough, or if one passes custom cache
-                valid_indices = non_attended_tokens < extended_attention_mask.size(-1)
-                new_batch_index = batch_index[valid_indices]
-                new_non_attended_tokens = non_attended_tokens[valid_indices]
-
-                # Zero-out the places where we don't need to attend
-                extended_attention_mask[new_batch_index, new_non_attended_tokens] = 0
-
-                attention_mask = torch.cat((extended_attention_mask, attention_mask[:, -target_length:]), dim=1)
-                position_ids = torch.sum(attention_mask, dim=1).unsqueeze(-1) - 1
-                cache_position = torch.arange(attention_mask.shape[1], device=attention_mask.device)[-target_length:]
-
-        # TODO: @raushan retain only the new behavior after v4.47
-        elif image_features is not None:
             n_image_tokens = (input_ids == self.config.image_token_index).sum(dim=-1)[0].item()
             n_image_features = image_features.shape[1]
             if n_image_tokens != n_image_features:
diff --git a/src/transformers/models/llava/processing_llava.py b/src/transformers/models/llava/processing_llava.py
index 8a9597892c6021..d1b0beb2d0f698 100644
--- a/src/transformers/models/llava/processing_llava.py
+++ b/src/transformers/models/llava/processing_llava.py
@@ -143,25 +143,17 @@ def __call__(
         # try to expand inputs in processing if we have the necessary parts
         prompt_strings = text
         if image_inputs.get("pixel_values") is not None:
-            if self.patch_size is not None and self.vision_feature_select_strategy is not None:
-                # Replace the image token with the expanded image token sequence
-                pixel_values = image_inputs["pixel_values"]
-                height, width = get_image_size(to_numpy_array(pixel_values[0]))
-                num_image_tokens = (height // self.patch_size) * (width // self.patch_size) + 1
-                if self.vision_feature_select_strategy == "default":
-                    num_image_tokens -= 1
-
-                prompt_strings = []
-                for sample in text:
-                    sample = sample.replace(self.image_token, self.image_token * num_image_tokens)
-                    prompt_strings.append(sample)
-            else:
-                logger.warning_once(
-                    "Expanding inputs for image tokens in LLaVa should be done in processing. "
-                    "Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly "
-                    "with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. "
-                    "Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
-                )
+            # Replace the image token with the expanded image token sequence
+            pixel_values = image_inputs["pixel_values"]
+            height, width = get_image_size(to_numpy_array(pixel_values[0]))
+            num_image_tokens = (height // self.patch_size) * (width // self.patch_size) + 1
+            if self.vision_feature_select_strategy == "default":
+                num_image_tokens -= 1
+
+            prompt_strings = []
+            for sample in text:
+                sample = sample.replace(self.image_token, self.image_token * num_image_tokens)
+                prompt_strings.append(sample)
 
         text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"])
         return BatchFeature(data={**text_inputs, **image_inputs})
diff --git a/src/transformers/models/llava_next/modeling_llava_next.py b/src/transformers/models/llava_next/modeling_llava_next.py
index 5a49337b2b5d96..cd9763f02b5135 100644
--- a/src/transformers/models/llava_next/modeling_llava_next.py
+++ b/src/transformers/models/llava_next/modeling_llava_next.py
@@ -835,18 +835,9 @@ def forward(
                 "You cannot specify both pixel_values and inputs_embeds at the same time, and must specify either one"
             )
 
-        legacy_processing = False
         if inputs_embeds is None:
             inputs_embeds = self.get_input_embeddings()(input_ids)
 
-            # if the number of image tokens is more than image embeddings seq length, then prob we expanded it in processing
-            # not very reliable, but we don't expect one to actually pass 500+ images for one prompt
-            # In case we're in decoding stage, legacy behavior is checked by presence of pixel values even if use_cache=True
-            legacy_processing = (
-                (input_ids == self.config.image_token_index).sum(1).max() < self.config.image_seq_length
-            ) or (input_ids.shape[-1] == 1 and pixel_values is not None)
-
-        image_features = None
         if pixel_values is not None and pixel_values.size(0) > 0:
             image_features = self.get_image_features(
                 pixel_values,
@@ -863,58 +854,6 @@ def forward(
                 image_newline=self.image_newline,
             )
 
-        if legacy_processing:
-            logger.warning_once(
-                "Expanding inputs for image tokens in LLaVa-NeXT should be done in processing. "
-                "Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly "
-                "with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. "
-                "Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
-            )
-            if input_ids.shape[1] != 1:
-                inputs_embeds = inputs_embeds.to(image_features.dtype)
-                inputs_embeds, attention_mask, position_ids, labels, _ = self._merge_input_ids_with_image_features(
-                    image_features,
-                    feature_lens,
-                    inputs_embeds,
-                    input_ids,
-                    attention_mask,
-                    position_ids,
-                    labels=labels,
-                )
-                cache_position = torch.arange(attention_mask.shape[1], device=attention_mask.device)
-            else:
-                # Retrieve the first layer to inspect the logits and mask out the hidden states
-                # that are set to 0
-                first_layer_past_key_value = past_key_values[0][0][:, :, :, 0]
-
-                # Sum all dimensions of head_dim (-2) to avoid random errors such as: https://github.com/huggingface/transformers/pull/28032#issuecomment-1863691941
-                batch_index, non_attended_tokens = torch.where(first_layer_past_key_value.float().sum(-2) == 0)
-
-                # Get the target length
-                target_length = input_ids.shape[1]
-                past_length = first_layer_past_key_value.shape[-1]
-
-                extended_attention_mask = torch.ones(
-                    (attention_mask.shape[0], past_length),
-                    dtype=attention_mask.dtype,
-                    device=attention_mask.device,
-                )
-
-                # Filter out only the tokens that can be un-attended, this can happen
-                # if one uses Llava + Fused modules where the cache on the
-                # first iteration is already big enough, or if one passes custom cache
-                valid_indices = non_attended_tokens < extended_attention_mask.size(-1)
-                new_batch_index = batch_index[valid_indices]
-                new_non_attended_tokens = non_attended_tokens[valid_indices]
-
-                # Zero-out the places where we don't need to attend
-                extended_attention_mask[new_batch_index, new_non_attended_tokens] = 0
-                attention_mask = torch.cat((extended_attention_mask, attention_mask[:, -target_length:]), dim=1)
-                position_ids = torch.sum(attention_mask, dim=1).unsqueeze(-1) - 1
-                cache_position = torch.arange(attention_mask.shape[1], device=attention_mask.device)[-target_length:]
-
-        # TODO: @raushan retain only the new behavior after v4.47
-        elif image_features is not None:
             n_image_tokens = (input_ids == self.config.image_token_index).sum().item()
             n_image_features = image_features.shape[0]
             if n_image_tokens != n_image_features:
diff --git a/src/transformers/models/llava_next/processing_llava_next.py b/src/transformers/models/llava_next/processing_llava_next.py
index ce11be6d6309a8..ac4396ff10ab9b 100644
--- a/src/transformers/models/llava_next/processing_llava_next.py
+++ b/src/transformers/models/llava_next/processing_llava_next.py
@@ -138,27 +138,19 @@ def __call__(
 
         prompt_strings = text
         if image_inputs:
-            if self.patch_size is None or self.vision_feature_select_strategy is None:
-                logger.warning_once(
-                    "Expanding inputs for image tokens in LLaVa-NeXT should be done in processing. "
-                    "Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly "
-                    "with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. "
-                    "Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
-                )
-            else:
-                image_sizes = iter(image_inputs["image_sizes"])
-                height, width = get_image_size(to_numpy_array(image_inputs["pixel_values"][0][0]))
-                prompt_strings = []
-                for sample in text:
-                    while self.image_token in sample:
-                        image_size = next(image_sizes)
-                        orig_height, orig_width = image_size
-                        num_image_tokens = self._get_number_of_features(orig_height, orig_width, height, width)
-                        if self.vision_feature_select_strategy == "default":
-                            num_image_tokens -= 1
-                        sample = sample.replace(self.image_token, "<placeholder>" * num_image_tokens, 1)
-                    prompt_strings.append(sample)
-                prompt_strings = [sample.replace("<placeholder>", self.image_token) for sample in prompt_strings]
+            image_sizes = iter(image_inputs["image_sizes"])
+            height, width = get_image_size(to_numpy_array(image_inputs["pixel_values"][0][0]))
+            prompt_strings = []
+            for sample in text:
+                while self.image_token in sample:
+                    image_size = next(image_sizes)
+                    orig_height, orig_width = image_size
+                    num_image_tokens = self._get_number_of_features(orig_height, orig_width, height, width)
+                    if self.vision_feature_select_strategy == "default":
+                        num_image_tokens -= 1
+                    sample = sample.replace(self.image_token, "<placeholder>" * num_image_tokens, 1)
+                prompt_strings.append(sample)
+            prompt_strings = [sample.replace("<placeholder>", self.image_token) for sample in prompt_strings]
 
         text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"])
 
diff --git a/src/transformers/models/llava_next_video/modular_llava_next_video.py b/src/transformers/models/llava_next_video/modular_llava_next_video.py
index c1ed7571941b9e..691fa92766fc92 100644
--- a/src/transformers/models/llava_next_video/modular_llava_next_video.py
+++ b/src/transformers/models/llava_next_video/modular_llava_next_video.py
@@ -427,25 +427,9 @@ def forward(
                 "You cannot specify both pixel_values and inputs_embeds at the same time, and must specify either one"
             )
 
-        legacy_processing = False
         if inputs_embeds is None:
             inputs_embeds = self.get_input_embeddings()(input_ids)
 
-            # if the number of image/video tokens is more than image embeddings seq length, then prob we expanded it in processing
-            # not very reliable, but we don't expect one to actually pass 500+ images for one prompt
-            img_token_not_enough = (input_ids == self.config.image_token_index).sum(
-                1
-            ).max() < self.config.image_seq_length
-            video_token_not_enough = (input_ids == self.config.video_token_index).sum(
-                1
-            ).max() < self.config.video_seq_length
-            inputs_not_expanded = (img_token_not_enough and pixel_values is not None) or (
-                video_token_not_enough and pixel_values_videos is not None
-            )
-            pixels_present = input_ids.shape[-1] == 1 and (pixel_values is not None or pixel_values_videos is not None)
-            legacy_processing = inputs_not_expanded or pixels_present
-
-        image_features = feature_lens = None
         if pixel_values is not None and pixel_values.size(0) > 0:
             image_features = self.get_image_features(
                 pixel_values,
@@ -460,7 +444,21 @@ def forward(
                 image_newline=self.image_newline,
             )
 
-        video_features = video_feature_lens = None
+            n_image_tokens = (input_ids == self.config.image_token_index).sum().item()
+            n_image_features = image_features.shape[0]
+            if n_image_tokens != n_image_features:
+                raise ValueError(
+                    f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
+                )
+            special_image_mask = (
+                (input_ids == self.config.image_token_index)
+                .unsqueeze(-1)
+                .expand_as(inputs_embeds)
+                .to(inputs_embeds.device)
+            )
+            image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
+            inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
+
         if pixel_values_videos is not None and pixel_values_videos.size(0) > 0:
             video_features = self.get_video_features(
                 pixel_values_videos,
@@ -472,94 +470,20 @@ def forward(
             video_features = torch.cat(video_features, dim=0)
             video_feature_lens = torch.tensor(video_feature_lens, dtype=torch.long, device=video_features.device)
 
-        if legacy_processing:
-            logger.warning_once(
-                "Expanding inputs for image.video tokens in LLaVa-NeXT-Video should be done in processing. "
-                "Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly "
-                "with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. "
-                "Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
-            )
-            if input_ids.shape[1] != 1:
-                iterator = (
-                    (image_features, feature_lens, self.config.image_token_index),
-                    (video_features, video_feature_lens, self.config.video_token_index),
-                )
-                for features, lens, special_token in iterator:
-                    if features is not None:
-                        (
-                            inputs_embeds,
-                            attention_mask,
-                            position_ids,
-                            labels,
-                            input_ids,
-                        ) = self._merge_input_ids_with_image_features(
-                            features,
-                            lens,
-                            inputs_embeds,
-                            input_ids,
-                            attention_mask,
-                            position_ids,
-                            labels=labels,
-                            image_token_index=special_token,
-                        )
-                cache_position = torch.arange(attention_mask.shape[1], device=attention_mask.device)
-            else:
-                # Retrieve the first layer to inspect the logits and mask out the hidden states that are set to 0
-                first_layer_past_key_value = past_key_values[0][0][:, :, :, 0]
-                # Sum all dimensions of head_dim (-2) to avoid random errors such as: https://github.com/huggingface/transformers/pull/28032#issuecomment-1863691941
-                batch_index, non_attended_tokens = torch.where(first_layer_past_key_value.float().sum(-2) == 0)
-                # Get the target length
-                target_length = input_ids.shape[1]
-                past_length = first_layer_past_key_value.shape[-1]
-                extended_attention_mask = torch.ones(
-                    (attention_mask.shape[0], past_length),
-                    dtype=attention_mask.dtype,
-                    device=attention_mask.device,
-                )
-                # Filter out only the tokens that can be un-attended, this can happen
-                # if one uses Llava + Fused modules where the cache on the
-                # first iteration is already big enough, or if one passes custom cache
-                valid_indices = non_attended_tokens < extended_attention_mask.size(-1)
-                new_batch_index = batch_index[valid_indices]
-                new_non_attended_tokens = non_attended_tokens[valid_indices]
-                # Zero-out the places where we don't need to attend
-                extended_attention_mask[new_batch_index, new_non_attended_tokens] = 0
-                attention_mask = torch.cat((extended_attention_mask, attention_mask[:, -target_length:]), dim=1)
-                position_ids = torch.sum(attention_mask, dim=1).unsqueeze(-1) - 1
-                cache_position = torch.arange(attention_mask.shape[1], device=attention_mask.device)[-target_length:]
-
-        # TODO: @raushan retain only the new behavior after v4.47
-        else:
-            if image_features is not None:
-                n_image_tokens = (input_ids == self.config.image_token_index).sum().item()
-                n_image_features = image_features.shape[0]
-                if n_image_tokens != n_image_features:
-                    raise ValueError(
-                        f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
-                    )
-                special_image_mask = (
-                    (input_ids == self.config.image_token_index)
-                    .unsqueeze(-1)
-                    .expand_as(inputs_embeds)
-                    .to(inputs_embeds.device)
-                )
-                image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
-                inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
-            if video_features is not None:
-                n_video_tokens = (input_ids == self.config.video_token_index).sum().item()
-                n_video_features = video_features.shape[0]
-                if n_video_tokens != n_video_features:
-                    raise ValueError(
-                        f"Video features and video tokens do not match: tokens: {n_video_tokens}, features {n_video_features}"
-                    )
-                special_image_mask = (
-                    (input_ids == self.config.video_token_index)
-                    .unsqueeze(-1)
-                    .expand_as(inputs_embeds)
-                    .to(inputs_embeds.device)
+            n_video_tokens = (input_ids == self.config.video_token_index).sum().item()
+            n_video_features = video_features.shape[0]
+            if n_video_tokens != n_video_features:
+                raise ValueError(
+                    f"Video features and video tokens do not match: tokens: {n_video_tokens}, features {n_video_features}"
                 )
-                video_features = video_features.to(inputs_embeds.device, inputs_embeds.dtype)
-                inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, video_features)
+            special_image_mask = (
+                (input_ids == self.config.video_token_index)
+                .unsqueeze(-1)
+                .expand_as(inputs_embeds)
+                .to(inputs_embeds.device)
+            )
+            video_features = video_features.to(inputs_embeds.device, inputs_embeds.dtype)
+            inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, video_features)
 
         outputs = self.language_model(
             attention_mask=attention_mask,
@@ -623,17 +547,6 @@ def prepare_inputs_for_generation(
     ):
         # Overwritten -- extra custom processing
 
-        if input_ids is not None:
-            img_token_not_enough = (input_ids == self.config.image_token_index).sum(
-                1
-            ).max() < self.config.image_seq_length
-            video_token_not_enough = (input_ids == self.config.video_token_index).sum(
-                1
-            ).max() < self.config.video_seq_length
-            legacy_processing = (img_token_not_enough and pixel_values is not None) or (
-                video_token_not_enough and pixel_values_videos is not None
-            )
-
         model_inputs = self.language_model.prepare_inputs_for_generation(
             input_ids,
             past_key_values=past_key_values,
@@ -646,7 +559,7 @@ def prepare_inputs_for_generation(
 
         # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore
         # Otherwise we need pixel values to be passed to model
-        if legacy_processing or cache_position[0] == 0:
+        if cache_position[0] == 0:
             model_inputs["pixel_values"] = pixel_values
             model_inputs["pixel_values_videos"] = pixel_values_videos
             model_inputs["image_sizes"] = image_sizes
diff --git a/src/transformers/models/llava_next_video/processing_llava_next_video.py b/src/transformers/models/llava_next_video/processing_llava_next_video.py
index e0e4534e42b565..3bedf6e8f51724 100644
--- a/src/transformers/models/llava_next_video/processing_llava_next_video.py
+++ b/src/transformers/models/llava_next_video/processing_llava_next_video.py
@@ -161,42 +161,33 @@ def __call__(
         elif not isinstance(text, list) and not isinstance(text[0], str):
             raise ValueError("Invalid input text. Please provide a string, or a list of strings")
 
-        if self.patch_size is None or self.vision_feature_select_strategy is None:
-            logger.warning_once(
-                "Expanding inputs for image/video tokens in LLaVa-NeXT-Video should be done in processing. "
-                "Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly "
-                "with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. "
-                "Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
-            )
-        else:
-            # images expand taking into account num_of_patches in each image
-            if image_inputs:
-                image_sizes = iter(image_inputs["image_sizes"])
-                height, width = get_image_size(to_numpy_array(image_inputs["pixel_values"][0][0]))
-                prompt_strings = []
-                for sample in text:
-                    while self.image_token in sample:
-                        image_size = next(image_sizes)
-                        orig_height, orig_width = image_size
-                        num_image_tokens = self._get_number_of_features(orig_height, orig_width, height, width)
-                        if self.vision_feature_select_strategy == "default":
-                            num_image_tokens -= 1
-                        sample = sample.replace(self.image_token, "<placeholder>" * num_image_tokens, 1)
-                    prompt_strings.append(sample)
-                text = [sample.replace("<placeholder>", self.image_token) for sample in prompt_strings]
+        if image_inputs:
+            image_sizes = iter(image_inputs["image_sizes"])
+            height, width = get_image_size(to_numpy_array(image_inputs["pixel_values"][0][0]))
+            prompt_strings = []
+            for sample in text:
+                while self.image_token in sample:
+                    image_size = next(image_sizes)
+                    orig_height, orig_width = image_size
+                    num_image_tokens = self._get_number_of_features(orig_height, orig_width, height, width)
+                    if self.vision_feature_select_strategy == "default":
+                        num_image_tokens -= 1
+                    sample = sample.replace(self.image_token, "<placeholder>" * num_image_tokens, 1)
+                prompt_strings.append(sample)
+            text = [sample.replace("<placeholder>", self.image_token) for sample in prompt_strings]
 
-            # videos are easier, simply get frames and multiply
-            if videos_inputs:
-                one_video = to_numpy_array(videos_inputs.get("pixel_values_videos")[0])
-                height, width = get_image_size(one_video[0])
-                num_frames = one_video.shape[0]  # frame dim is always after batch dim
-                num_image_tokens = (height // self.patch_size) * (width // self.patch_size)
-                num_video_tokens = num_image_tokens // 4 * num_frames  # divide by 4 needed for avg pooling layer
-                prompt_strings = []
-                for sample in text:
-                    sample = sample.replace(self.video_token, self.video_token * num_video_tokens)
-                    prompt_strings.append(sample)
-                text = prompt_strings
+        # videos are easier, simply get frames and multiply
+        if videos_inputs:
+            one_video = to_numpy_array(videos_inputs.get("pixel_values_videos")[0])
+            height, width = get_image_size(one_video[0])
+            num_frames = one_video.shape[0]  # frame dim is always after batch dim
+            num_image_tokens = (height // self.patch_size) * (width // self.patch_size)
+            num_video_tokens = num_image_tokens // 4 * num_frames  # divide by 4 needed for avg pooling layer
+            prompt_strings = []
+            for sample in text:
+                sample = sample.replace(self.video_token, self.video_token * num_video_tokens)
+                prompt_strings.append(sample)
+            text = prompt_strings
 
         text_inputs = self.tokenizer(
             text,
diff --git a/src/transformers/models/video_llava/modeling_video_llava.py b/src/transformers/models/video_llava/modeling_video_llava.py
index a9bd8b745a6f68..0537736087b169 100644
--- a/src/transformers/models/video_llava/modeling_video_llava.py
+++ b/src/transformers/models/video_llava/modeling_video_llava.py
@@ -537,127 +537,49 @@ def forward(
                 "You cannot specify both pixel_values and inputs_embeds at the same time, and must specify either one"
             )
 
-        legacy_processing = False
         if inputs_embeds is None:
             inputs_embeds = self.get_input_embeddings()(input_ids)
 
-            # if the number of image/video tokens is more than image embeddings seq length, then prob we expanded it in processing
-            # not very reliable, but we don't expect one to actually pass 500+ images for one prompt
-            img_token_not_enough = (input_ids == self.config.image_token_index).sum(
-                1
-            ).max() < self.config.image_seq_length
-            video_token_not_enough = (input_ids == self.config.video_token_index).sum(
-                1
-            ).max() < self.config.video_seq_length
-            inputs_not_expanded = (img_token_not_enough and pixel_values_images is not None) or (
-                video_token_not_enough and pixel_values_videos is not None
-            )
-            pixels_present = input_ids.shape[-1] == 1 and (
-                pixel_values_images is not None or pixel_values_videos is not None
-            )
-            legacy_processing = inputs_not_expanded or pixels_present
-
-        image_features = None
         if pixel_values_images is not None:
             image_features = self.get_image_features(
                 pixel_values_images,
                 vision_feature_layer=vision_feature_layer,
                 vision_feature_select_strategy=vision_feature_select_strategy,
             )
+            n_image_tokens = (input_ids == self.config.image_token_index).sum(dim=-1)[0].item()
+            n_image_features = image_features.shape[1]
+            if n_image_tokens != n_image_features:
+                raise ValueError(
+                    f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
+                )
+            special_image_mask = (
+                (input_ids == self.config.image_token_index)
+                .unsqueeze(-1)
+                .expand_as(inputs_embeds)
+                .to(inputs_embeds.device)
+            )
+            image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
+            inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
 
-        video_features = None
-        num_frames = 0
         if pixel_values_videos is not None:
             video_features, num_frames = self.get_video_features(
                 pixel_values_videos=pixel_values_videos, vision_feature_layer=vision_feature_layer
             )
 
-        if legacy_processing:
-            logger.warning_once(
-                "Expanding inputs for image tokens in Video-LLaVa should be done in processing. "
-                "Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly "
-                "with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. "
-                "Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
-            )
-            if input_ids.shape[1] != 1:
-                for features, frames in ((image_features, 1), (video_features, num_frames)):
-                    if features is not None:
-                        (
-                            inputs_embeds,
-                            attention_mask,
-                            labels,
-                            position_ids,
-                            input_ids,
-                        ) = self._merge_input_ids_with_visual_features(
-                            features,
-                            inputs_embeds,
-                            input_ids,
-                            attention_mask,
-                            labels,
-                            num_frames=frames,
-                        )
-                cache_position = torch.arange(attention_mask.shape[1], device=attention_mask.device)
-            else:
-                # Retrieve the first layer to inspect the logits and mask out the hidden states
-                # that are set to 0
-                first_layer_past_key_value = past_key_values[0][0][:, :, :, 0]
-
-                # Sum all dimensions of head_dim (-2) to avoid random errors such as: https://github.com/huggingface/transformers/pull/28032#issuecomment-1863691941
-                batch_index, non_attended_tokens = torch.where(first_layer_past_key_value.float().sum(-2) == 0)
-                target_length = input_ids.shape[1]
-                past_length = first_layer_past_key_value.shape[-1]
-                extended_attention_mask = torch.ones(
-                    (attention_mask.shape[0], past_length),
-                    dtype=attention_mask.dtype,
-                    device=attention_mask.device,
-                )
-
-                # Filter out only the tokens that can be un-attended, this can happen
-                # if one uses Llava + Fused modules where the cache on the
-                # first iteration is already big enough, or if one passes custom cache
-                valid_indices = non_attended_tokens < extended_attention_mask.size(-1)
-                new_batch_index = batch_index[valid_indices]
-                new_non_attended_tokens = non_attended_tokens[valid_indices]
-
-                # Zero-out the places where we don't need to attend
-                extended_attention_mask[new_batch_index, new_non_attended_tokens] = 0
-                attention_mask = torch.cat((extended_attention_mask, attention_mask[:, -target_length:]), dim=1)
-                position_ids = torch.sum(attention_mask, dim=1).unsqueeze(-1) - 1
-                cache_position = torch.arange(attention_mask.shape[1], device=attention_mask.device)[-target_length:]
-
-        # TODO: @raushan retain only the new behavior after v4.47
-        else:
-            if pixel_values_images is not None:
-                n_image_tokens = (input_ids == self.config.image_token_index).sum(dim=-1)[0].item()
-                n_image_features = image_features.shape[1]
-                if n_image_tokens != n_image_features:
-                    raise ValueError(
-                        f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
-                    )
-                special_image_mask = (
-                    (input_ids == self.config.image_token_index)
-                    .unsqueeze(-1)
-                    .expand_as(inputs_embeds)
-                    .to(inputs_embeds.device)
+            n_video_tokens = (input_ids == self.config.video_token_index).sum(dim=-1)[0].item()
+            n_video_features = video_features.shape[1]
+            if n_video_tokens != n_video_features:
+                raise ValueError(
+                    f"Video features and video tokens do not match: tokens: {n_video_tokens}, features {n_video_features}"
                 )
-                image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
-                inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
-
-            if pixel_values_videos is not None:
-                n_video_tokens = (input_ids == self.config.video_token_index).sum(dim=-1)[0].item()
-                n_video_features = video_features.shape[1]
-                if n_video_tokens != n_video_features:
-                    raise ValueError(
-                        f"Video features and video tokens do not match: tokens: {n_video_tokens}, features {n_video_features}"
-                    )
-                special_image_mask = (
-                    (input_ids == self.config.video_token_index)
-                    .unsqueeze(-1)
-                    .expand_as(inputs_embeds)
-                    .to(inputs_embeds.device)
-                )
-                video_features = video_features.to(inputs_embeds.device, inputs_embeds.dtype)
-                inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, video_features)
+            special_image_mask = (
+                (input_ids == self.config.video_token_index)
+                .unsqueeze(-1)
+                .expand_as(inputs_embeds)
+                .to(inputs_embeds.device)
+            )
+            video_features = video_features.to(inputs_embeds.device, inputs_embeds.dtype)
+            inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, video_features)
 
         outputs = self.language_model(
             attention_mask=attention_mask,
@@ -720,17 +642,6 @@ def prepare_inputs_for_generation(
     ):
         # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
 
-        if input_ids is not None:
-            img_token_not_enough = (input_ids == self.config.image_token_index).sum(
-                1
-            ).max() < self.config.image_seq_length
-            video_token_not_enough = (input_ids == self.config.video_token_index).sum(
-                1
-            ).max() < self.config.video_seq_length
-            legacy_processing = (img_token_not_enough and pixel_values_images is not None) or (
-                video_token_not_enough and pixel_values_videos is not None
-            )
-
         model_inputs = self.language_model.prepare_inputs_for_generation(
             input_ids,
             past_key_values=past_key_values,
@@ -741,7 +652,7 @@ def prepare_inputs_for_generation(
             **kwargs,
         )
 
-        if legacy_processing or cache_position[0] == 0:
+        if cache_position[0] == 0:
             # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore
             # Otherwise we need pixel values to be passed to model
             model_inputs["pixel_values_images"] = pixel_values_images
diff --git a/src/transformers/models/video_llava/processing_video_llava.py b/src/transformers/models/video_llava/processing_video_llava.py
index bd6f91270965bb..057324b28bd5b9 100644
--- a/src/transformers/models/video_llava/processing_video_llava.py
+++ b/src/transformers/models/video_llava/processing_video_llava.py
@@ -62,8 +62,8 @@ def __init__(
         self,
         image_processor=None,
         tokenizer=None,
-        patch_size=None,
-        vision_feature_select_strategy=None,
+        patch_size=14,
+        vision_feature_select_strategy="default",
         image_token="<image>",  # set the default and let users change if they have peculiar special tokens in rare cases
         video_token="<video>",
         chat_template=None,
@@ -146,15 +146,8 @@ def __call__(
             raise ValueError("Invalid input text. Please provide a string, or a list of strings")
 
         prompt_strings = text
-        if encoded_images is not None and (self.patch_size is None or self.vision_feature_select_strategy is None):
-            logger.warning_once(
-                "Expanding inputs for image tokens in Video-LLaVa should be done in processing. "
-                "Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly "
-                "with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. "
-                "Using processors without these attributes in the config is deprecated and will throw an error in v4.44."
-            )
-        # Replace the image/video tokens with the expanded token sequence
-        elif encoded_images is not None:
+
+        if encoded_images is not None:
             if "pixel_values_images" in encoded_images.keys():
                 height, width = get_image_size(to_numpy_array(encoded_images.get("pixel_values_images")[0]))
                 num_frames = 1
diff --git a/src/transformers/models/vipllava/modeling_vipllava.py b/src/transformers/models/vipllava/modeling_vipllava.py
index c9db6e261c6a72..7c7205621eb02d 100644
--- a/src/transformers/models/vipllava/modeling_vipllava.py
+++ b/src/transformers/models/vipllava/modeling_vipllava.py
@@ -455,68 +455,14 @@ def forward(
                 "You cannot specify both pixel_values and inputs_embeds at the same time, and must specify either one"
             )
 
-        legacy_processing = False
         if inputs_embeds is None:
             inputs_embeds = self.get_input_embeddings()(input_ids)
 
-            # if the number of image tokens is more than image embeddings seq length, then prob we expanded it in processing
-            # not very reliable, but we don't expect one to actually pass 500+ images for one prompt
-            # In case we're in decoding stage, legacy behavior is checked by presence of pixel values even if use_cache=True
-            legacy_processing = (
-                (input_ids == self.config.image_token_index).sum(1).max() < self.config.image_seq_length
-            ) or (input_ids.shape[-1] == 1 and pixel_values is not None)
-
-        image_features = None
         if pixel_values is not None:
             image_features = self.get_image_features(
                 pixel_values=pixel_values, vision_feature_layers=vision_feature_layers
             )
 
-        if legacy_processing:
-            logger.warning_once(
-                "Expanding inputs for image tokens in VipLLaVa should be done in processing. "
-                "Please add `patch_size` and `vision_feature_select_strategy` to the model's image processing config. "
-                "Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
-            )
-            # prefill stage vs decoding stage (legacy behavior copied)
-            if input_ids.shape[1] != 1:
-                inputs_embeds, attention_mask, labels, position_ids = self._merge_input_ids_with_image_features(
-                    image_features, inputs_embeds, input_ids, attention_mask, labels
-                )
-                cache_position = torch.arange(attention_mask.shape[1], device=attention_mask.device)
-            else:
-                # Retrieve the first layer to inspect the logits and mask out the hidden states
-                # that are set to 0
-                first_layer_past_key_value = past_key_values[0][0][:, :, :, 0]
-
-                # Sum all dimensions of head_dim (-1) to avoid random errors such as: https://github.com/huggingface/transformers/pull/28032#issuecomment-1863691941
-                batch_index, non_attended_tokens = torch.where(first_layer_past_key_value.float().sum(-2) == 0)
-
-                target_length = input_ids.shape[1]
-                past_length = first_layer_past_key_value.shape[-1]
-
-                extended_attention_mask = torch.ones(
-                    (attention_mask.shape[0], past_length),
-                    dtype=attention_mask.dtype,
-                    device=attention_mask.device,
-                )
-
-                # Filter out only the tokens that can be un-attended, this can happen
-                # in the case one uses Llava + Fused modules where the cache on the
-                # first iteration is already big enough, or if one passes custom cache
-                valid_indices = non_attended_tokens < extended_attention_mask.size(-1)
-                new_batch_index = batch_index[valid_indices]
-                new_non_attended_tokens = non_attended_tokens[valid_indices]
-
-                # Zero-out the places where we don't need to attend
-                extended_attention_mask[new_batch_index, new_non_attended_tokens] = 0
-
-                attention_mask = torch.cat((extended_attention_mask, attention_mask[:, -target_length:]), dim=1)
-                position_ids = torch.sum(attention_mask, dim=1).unsqueeze(-1) - 1
-                cache_position = torch.arange(attention_mask.shape[1], device=attention_mask.device)[-target_length:]
-
-        # TODO: @raushan retain only the new behavior after v4.47
-        elif image_features is not None:
             n_image_tokens = (input_ids == self.config.image_token_index).sum(dim=-1)[0].item()
             n_image_features = image_features.shape[1]
             if n_image_tokens != n_image_features:
diff --git a/tests/models/llava/test_modeling_llava.py b/tests/models/llava/test_modeling_llava.py
index 405fad1bd31c8d..51cd8b4dc27335 100644
--- a/tests/models/llava/test_modeling_llava.py
+++ b/tests/models/llava/test_modeling_llava.py
@@ -292,9 +292,6 @@ def test_small_model_integration_test(self):
         raw_image = Image.open(requests.get(image_file, stream=True).raw)
         inputs = self.processor(images=raw_image, text=prompt, return_tensors="pt")
 
-        EXPECTED_INPUT_IDS = torch.tensor([[1, 32000, 28705, 13, 11123, 28747, 1824, 460, 272, 1722,315, 1023, 347, 13831, 925, 684, 739, 315, 3251, 456,1633, 28804, 13, 4816, 8048, 12738, 28747]])  # fmt: skip
-        self.assertTrue(torch.equal(inputs["input_ids"], EXPECTED_INPUT_IDS))
-
         output = model.generate(**inputs, max_new_tokens=20)
         EXPECTED_DECODED_TEXT = "\nUSER: What are the things I should be cautious about when I visit this place?\nASSISTANT: When visiting this place, there are a few things one should be cautious about. Firstly,"  # fmt: skip
 
@@ -471,32 +468,18 @@ def test_llava_merge_inputs_error_bug(self):
         # This is a reproducer of https://github.com/huggingface/transformers/pull/28333 and makes sure it does not happen anymore
         model_id = "llava-hf/llava-1.5-7b-hf"
         model = LlavaForConditionalGeneration.from_pretrained(model_id, load_in_4bit=True)
+        processor = AutoProcessor.from_pretrained(model_id)
 
-        # Simulate some user inputs
-        pixel_values = torch.randn(
-            (1, 3, 336, 336),
-            dtype=torch.float,
-            device=torch_device,
-        )
-        input_ids = torch.tensor(
-            [
-                [32001, 32001, 1, 15043, 7084, 32000, 29871, 13, 7900],
-            ],
-            dtype=torch.long,
-            device=torch_device,
-        )
-        attention_mask = torch.tensor(
-            [[0, 0, 1, 1, 1, 1, 1, 1, 1]],
-            dtype=torch.long,
-            device=torch_device,
-        )
+        prompt = "USER: <image>\nDescribe the imageASSISTANT:"
+        image_file = "http://images.cocodataset.org/val2017/000000039769.jpg"
+
+        raw_image = Image.open(requests.get(image_file, stream=True).raw)
+        inputs = processor(images=raw_image, text=prompt, return_tensors="pt").to(torch_device, torch.float16)
 
         # Make sure that the loss is properly computed
         loss = model(
-            pixel_values=pixel_values,
-            input_ids=input_ids,
-            attention_mask=attention_mask,
-            labels=input_ids,
+            **inputs,
+            labels=inputs.input_ids.clone(),
         ).loss
         loss.backward()
 
diff --git a/tests/models/vipllava/test_modeling_vipllava.py b/tests/models/vipllava/test_modeling_vipllava.py
index 2c241c23f26158..ffd0868c5789ad 100644
--- a/tests/models/vipllava/test_modeling_vipllava.py
+++ b/tests/models/vipllava/test_modeling_vipllava.py
@@ -280,7 +280,7 @@ def test_small_model_integration_test(self):
 
         outputs = model.generate(**inputs, max_new_tokens=10)
 
-        EXPECTED_OUTPUT = "USER: <image> \nCan you please describe this image?\nASSISTANT: The image features a brown and white cat sitting on"
+        EXPECTED_OUTPUT = "USER:  \nCan you please describe this image?\nASSISTANT: The image features a brown and white cat sitting on"
         self.assertEqual(processor.decode(outputs[0], skip_special_tokens=True), EXPECTED_OUTPUT)
 
     @slow
@@ -289,32 +289,18 @@ def test_vipllava_merge_inputs_error_bug(self):
         # This is a reproducer of https://github.com/huggingface/transformers/pull/28333 and makes sure it does not happen anymore
         model_id = "llava-hf/vip-llava-7b-hf"
         model = VipLlavaForConditionalGeneration.from_pretrained(model_id, load_in_4bit=True)
+        processor = AutoProcessor.from_pretrained(model_id)
 
-        # Simulate some user inputs
-        pixel_values = torch.randn(
-            (1, 3, 336, 336),
-            dtype=torch.float,
-            device=torch_device,
-        )
-        input_ids = torch.tensor(
-            [
-                [32001, 32001, 1, 15043, 7084, 32000, 29871, 13, 7900],
-            ],
-            dtype=torch.long,
-            device=torch_device,
-        )
-        attention_mask = torch.tensor(
-            [[0, 0, 1, 1, 1, 1, 1, 1, 1]],
-            dtype=torch.long,
-            device=torch_device,
-        )
+        url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/compel-neg.png"
+        image = Image.open(requests.get(url, stream=True).raw)
+        prompt = "USER: <image>\nCan you please describe this image?\nASSISTANT:"
+
+        inputs = processor(prompt, image, return_tensors="pt").to(torch_device, torch.float16)
 
         # Make sure that the loss is properly computed
         loss = model(
-            pixel_values=pixel_values,
-            input_ids=input_ids,
-            attention_mask=attention_mask,
-            labels=input_ids,
+            **inputs,
+            labels=inputs.input_ids.clone(),
         ).loss
         loss.backward()
 

From 5fe6fa99f3c76f8f53a3f9db23ba42dde6f407a2 Mon Sep 17 00:00:00 2001
From: raushan <raushan@huggingface.co>
Date: Fri, 22 Nov 2024 14:52:16 +0100
Subject: [PATCH 06/15] also blip models

---
 .../models/blip_2/modeling_blip_2.py          |  57 +------
 .../models/blip_2/processing_blip_2.py        |  31 ++--
 .../instructblip/modeling_instructblip.py     |  59 +------
 .../instructblip/processing_instructblip.py   |  33 ++--
 .../modeling_instructblipvideo.py             |  59 +------
 .../modular_instructblipvideo.py              |  59 +------
 .../processing_instructblipvideo.py           |  36 ++---
 .../modeling_llava_next_video.py              | 145 ++++--------------
 .../video_llava/processing_video_llava.py     |   4 +-
 9 files changed, 84 insertions(+), 399 deletions(-)

diff --git a/src/transformers/models/blip_2/modeling_blip_2.py b/src/transformers/models/blip_2/modeling_blip_2.py
index eba82cd1b3c8e4..19aa5e748b0d94 100644
--- a/src/transformers/models/blip_2/modeling_blip_2.py
+++ b/src/transformers/models/blip_2/modeling_blip_2.py
@@ -2186,22 +2186,9 @@ def forward(
         if attention_mask is None:
             attention_mask = torch.ones_like(input_ids)
 
-        # if the model already has "image_token_index" then the input is expanded to account for image embeds
-        # otherwise we expand manually by concating
-        if getattr(self.config, "image_token_index", None) is not None:
-            special_image_mask = (input_ids == self.config.image_token_index).unsqueeze(-1).expand_as(inputs_embeds)
-            language_model_inputs = language_model_inputs.to(inputs_embeds.device, inputs_embeds.dtype)
-            inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, language_model_inputs)
-        else:
-            logger.warning_once(
-                "Expanding inputs for image tokens in BLIP-2 should be done in processing. "
-                "Please follow instruction here (https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042) to update your BLIP-2 model. "
-                "Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
-            )
-            inputs_embeds = torch.cat([language_model_inputs, inputs_embeds.to(language_model_inputs.device)], dim=1)
-            attention_mask = torch.cat(
-                [language_model_attention_mask, attention_mask.to(language_model_attention_mask.device)], dim=1
-            )
+        special_image_mask = (input_ids == self.config.image_token_index).unsqueeze(-1).expand_as(inputs_embeds)
+        language_model_inputs = language_model_inputs.to(inputs_embeds.device, inputs_embeds.dtype)
+        inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, language_model_inputs)
 
         if self.config.use_decoder_only_language_model:
             outputs = self.language_model(
@@ -2309,30 +2296,8 @@ def generate(
         if attention_mask is None:
             attention_mask = torch.ones_like(input_ids)
 
-        # if the model already has "image_token_index" then the input is expanded to account for image embeds
-        # otherwise we expand manually by concatenating
-        if getattr(self.config, "image_token_index", None) is not None:
-            special_image_mask = (input_ids == self.config.image_token_index).unsqueeze(-1).expand_as(inputs_embeds)
-            inputs_embeds[special_image_mask] = language_model_inputs.flatten()
-        else:
-            logger.warning_once(
-                "Expanding inputs for image tokens in BLIP-2 should be done in processing. "
-                "Please follow instruction here (https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042) to update your BLIP-2 model. "
-                "Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
-            )
-            inputs_embeds = torch.cat([language_model_inputs, inputs_embeds.to(language_model_inputs.device)], dim=1)
-            attention_mask = torch.cat(
-                [language_attention_mask, attention_mask.to(language_attention_mask.device)], dim=1
-            )
-
-            # add image_embeds length to max_length, so that the final max_length in counted only on token embeds
-            # -1 is to account for the prepended BOS after `generate.`
-            # TODO (joao, raushan): refactor `generate` to avoid these operations with VLMs
-            if not self.language_model.config.is_encoder_decoder:
-                generate_kwargs["max_length"] = (
-                    generate_kwargs.get("max_length", 20) + language_model_inputs.shape[1] - 1
-                )
-                generate_kwargs["min_length"] = generate_kwargs.get("min_length", 0) + language_model_inputs.shape[1]
+        special_image_mask = (input_ids == self.config.image_token_index).unsqueeze(-1).expand_as(inputs_embeds)
+        inputs_embeds[special_image_mask] = language_model_inputs.flatten()
 
         outputs = self.language_model.generate(
             inputs_embeds=inputs_embeds,
@@ -2340,18 +2305,6 @@ def generate(
             **generate_kwargs,
         )
 
-        # this is a temporary workaround to be consistent with other generation models and
-        # have BOS as the first token, even though under the hood we are calling LM with embeds
-        if not self.language_model.config.is_encoder_decoder:
-            bos_tokens = (
-                torch.LongTensor([[self.config.text_config.bos_token_id]])
-                .repeat(batch_size, 1)
-                .to(image_embeds.device)
-            )
-            if not isinstance(outputs, torch.Tensor):
-                outputs.sequences = torch.cat([bos_tokens, outputs.sequences], dim=-1)
-            else:
-                outputs = torch.cat([bos_tokens, outputs], dim=-1)
         return outputs
 
 
diff --git a/src/transformers/models/blip_2/processing_blip_2.py b/src/transformers/models/blip_2/processing_blip_2.py
index fa6a99f71a4616..a0104fe1514394 100644
--- a/src/transformers/models/blip_2/processing_blip_2.py
+++ b/src/transformers/models/blip_2/processing_blip_2.py
@@ -133,31 +133,22 @@ def __call__(
             _text_encoding = self.tokenizer(text, **output_kwargs["text_kwargs"], return_tensors=None)
             output_kwargs["text_kwargs"]["return_tensors"] = return_tensors
 
-            # if we know how many query tokens, expand text inside processor. We need this hacky manipulation
-            # because BLIP expects image tokens to be at the beginning even before BOS token
-            if self.num_query_tokens is not None:
-                image_tokens = self.image_token.content * self.num_query_tokens
-                image_token_encoding = self.tokenizer(
-                    [image_tokens] * len(text), add_special_tokens=False, return_tensors=None
-                )
-                for k in _text_encoding:
-                    text_encoding[k] = [
-                        img_encoding + txt_encoding
-                        for img_encoding, txt_encoding in zip(image_token_encoding[k], _text_encoding[k])
-                    ]
-            else:
-                text_encoding = _text_encoding
-                logger.warning_once(
-                    "Expanding inputs for image tokens in BLIP-2 should be done in processing. "
-                    "Please follow instruction here (https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042) to update your BLIP-2 model. "
-                    "Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
-                )
+            # We need this hacky manipulation because BLIP expects image tokens to be at the beginning even before BOS token
+            image_tokens = self.image_token.content * self.num_query_tokens
+            image_token_encoding = self.tokenizer(
+                [image_tokens] * len(text), add_special_tokens=False, return_tensors=None
+            )
+            for k in _text_encoding:
+                text_encoding[k] = [
+                    img_encoding + txt_encoding
+                    for img_encoding, txt_encoding in zip(image_token_encoding[k], _text_encoding[k])
+                ]
 
             # cast to desired return tensors type
             encoding.update(BatchEncoding(text_encoding, tensor_type=return_tensors))
+        
         # add pixel_values encoding. If we also have text_encoding, update image encoding and return it.
         # else, return the text encoding.
-
         if images is not None:
             image_encoding = self.image_processor(images, **output_kwargs["images_kwargs"])
             encoding.update(image_encoding)
diff --git a/src/transformers/models/instructblip/modeling_instructblip.py b/src/transformers/models/instructblip/modeling_instructblip.py
index 5cce774ce0716a..5490d085b1cb1a 100644
--- a/src/transformers/models/instructblip/modeling_instructblip.py
+++ b/src/transformers/models/instructblip/modeling_instructblip.py
@@ -1462,21 +1462,8 @@ def forward(
         if attention_mask is None:
             attention_mask = torch.ones_like(input_ids)
 
-        # if the model already has "image_token_index" then the input is expanded to account for image embeds
-        # otherwise we expand manually by concatenating
-        if getattr(self.config, "image_token_index", None) is not None:
-            special_image_mask = (input_ids == self.config.image_token_index).unsqueeze(-1).expand_as(inputs_embeds)
-            inputs_embeds[special_image_mask] = language_model_inputs.flatten()
-        else:
-            logger.warning_once(
-                "Expanding inputs for image tokens in InstructBLIP should be done in processing. "
-                "Please follow instruction here (https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042) to update your InstructBLIP model. "
-                "Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
-            )
-            inputs_embeds = torch.cat([language_model_inputs, inputs_embeds.to(language_model_inputs.device)], dim=1)
-            attention_mask = torch.cat(
-                [language_model_attention_mask, attention_mask.to(language_model_attention_mask.device)], dim=1
-            )
+        special_image_mask = (input_ids == self.config.image_token_index).unsqueeze(-1).expand_as(inputs_embeds)
+        inputs_embeds[special_image_mask] = language_model_inputs.flatten()
 
         if self.config.use_decoder_only_language_model:
             outputs = self.language_model(
@@ -1601,29 +1588,8 @@ def generate(
 
         inputs_embeds = self.get_input_embeddings()(input_ids)
 
-        # if the model already has "image_token_index" then the input is expanded to account for image embeds
-        # otherwise we expand manually by concatenating
-        if getattr(self.config, "image_token_index", None) is not None:
-            special_image_mask = (input_ids == self.config.image_token_index).unsqueeze(-1).expand_as(inputs_embeds)
-            inputs_embeds[special_image_mask] = language_model_inputs.flatten()
-        else:
-            logger.warning_once(
-                "Expanding inputs for image tokens in InstructBLIP should be done in processing. "
-                "Please follow instruction here (https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042) to update your InstructBLIP model. "
-                "Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
-            )
-            inputs_embeds = torch.cat([language_model_inputs, inputs_embeds.to(language_model_inputs.device)], dim=1)
-            attention_mask = torch.cat(
-                [language_attention_mask, attention_mask.to(language_attention_mask.device)], dim=1
-            )
-
-            # add image_embeds length to max_length, so that the final max_length in counted only on token embeds
-            # -1 is to account for the prepended BOS after `generate.`
-            if not self.language_model.config.is_encoder_decoder:
-                generate_kwargs["max_length"] = (
-                    generate_kwargs.get("max_length", 20) + language_model_inputs.shape[1] - 1
-                )
-                generate_kwargs["min_length"] = generate_kwargs.get("min_length", 0) + language_model_inputs.shape[1]
+        special_image_mask = (input_ids == self.config.image_token_index).unsqueeze(-1).expand_as(inputs_embeds)
+        inputs_embeds[special_image_mask] = language_model_inputs.flatten()
 
         outputs = self.language_model.generate(
             inputs_embeds=inputs_embeds,
@@ -1631,21 +1597,4 @@ def generate(
             **generate_kwargs,
         )
 
-        # this is a temporary workaround to be consistent with other generation models and
-        # have BOS as the first token, even though under the hood we are calling LM with embeds
-        if not self.language_model.config.is_encoder_decoder:
-            # the InstructBLIP authors used inconsistent tokenizer/model files during training,
-            # with the tokenizer's bos token being set to </s> which has ID=2,
-            # whereas the model's text config has bos token id = 0
-            bos_token_id = (
-                2
-                if self.config.text_config.architectures[0] == "LLaMAForCausalLM"
-                else self.config.text_config.bos_token_id
-            )
-            bos_tokens = torch.LongTensor([[bos_token_id]]).repeat(batch_size, 1).to(image_embeds.device)
-            if not isinstance(outputs, torch.Tensor):
-                outputs.sequences = torch.cat([bos_tokens, outputs.sequences], dim=-1)
-            else:
-                outputs = torch.cat([bos_tokens, outputs], dim=-1)
-
         return outputs
diff --git a/src/transformers/models/instructblip/processing_instructblip.py b/src/transformers/models/instructblip/processing_instructblip.py
index 05ff9871f4d731..875974406f1c8d 100644
--- a/src/transformers/models/instructblip/processing_instructblip.py
+++ b/src/transformers/models/instructblip/processing_instructblip.py
@@ -126,27 +126,18 @@ def __call__(
             return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
             _text_encoding = self.tokenizer(text, **output_kwargs["text_kwargs"], return_tensors=None)
             output_kwargs["text_kwargs"]["return_tensors"] = return_tensors
-            # if we know how many query tokens, expand text inside processor. We need this hacky manipulation
-            # because BLIP expects image tokens to be at the beginning even before BOS token
-            if self.num_query_tokens is not None and images is not None:
-                text_encoding = {}
-                image_tokens = self.image_token.content * self.num_query_tokens
-                image_token_encoding = self.tokenizer(
-                    [image_tokens] * len(text), add_special_tokens=False, return_tensors=None
-                )
-                for k in _text_encoding:
-                    text_encoding[k] = [
-                        img_encoding + txt_encoding
-                        for img_encoding, txt_encoding in zip(image_token_encoding[k], _text_encoding[k])
-                    ]
-            else:
-                text_encoding = _text_encoding
-                if images is not None:
-                    logger.warning_once(
-                        "Expanding inputs for image tokens in InstructBLIP should be done in processing. "
-                        "Please follow instruction here (https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042) to update your InstructBLIP model. "
-                        "Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
-                    )
+
+            # We need this hacky manipulation because BLIP expects image tokens to be at the beginning even before BOS token
+            text_encoding = {}
+            image_tokens = self.image_token.content * self.num_query_tokens
+            image_token_encoding = self.tokenizer(
+                [image_tokens] * len(text), add_special_tokens=False, return_tensors=None
+            )
+            for k in _text_encoding:
+                text_encoding[k] = [
+                    img_encoding + txt_encoding
+                    for img_encoding, txt_encoding in zip(image_token_encoding[k], _text_encoding[k])
+                ]
 
             # cast to desired return tensors type after concatenating
             text_encoding = BatchEncoding(text_encoding, tensor_type=return_tensors)
diff --git a/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py b/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py
index c9f12391666c22..8fc4e5f570ec52 100644
--- a/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py
+++ b/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py
@@ -1489,21 +1489,8 @@ def forward(
         if attention_mask is None:
             attention_mask = torch.ones_like(input_ids)
 
-        # if the model already has "video_token_index" then the input is expanded to account for image embeds
-        # otherwise we expand manually by concatenating
-        if getattr(self.config, "video_token_index", None) is not None:
-            special_image_mask = (input_ids == self.config.video_token_index).unsqueeze(-1).expand_as(inputs_embeds)
-            inputs_embeds[special_image_mask] = language_model_inputs.flatten()
-        else:
-            logger.warning_once(
-                "Expanding inputs for video tokens in InstructBLIPVideo should be done in processing. "
-                "Please follow instruction here (https://gist.github.com/zucchini-nlp/65f22892b054dc0d68228af56fbeaac2) to update your InstructBLIPVideo model. "
-                "Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
-            )
-            inputs_embeds = torch.cat([language_model_inputs, inputs_embeds.to(language_model_inputs.device)], dim=1)
-            attention_mask = torch.cat(
-                [language_model_attention_mask, attention_mask.to(language_model_attention_mask.device)], dim=1
-            )
+        special_image_mask = (input_ids == self.config.video_token_index).unsqueeze(-1).expand_as(inputs_embeds)
+        inputs_embeds[special_image_mask] = language_model_inputs.flatten()
 
         if self.config.use_decoder_only_language_model:
             outputs = self.language_model(
@@ -1636,29 +1623,8 @@ def generate(
 
         inputs_embeds = self.get_input_embeddings()(input_ids)
 
-        # if the model already has "video_token_index" then the input is expanded to account for image embeds
-        # otherwise we expand manually by concatenating
-        if getattr(self.config, "video_token_index", None) is not None:
-            special_image_mask = (input_ids == self.config.video_token_index).unsqueeze(-1).expand_as(inputs_embeds)
-            inputs_embeds[special_image_mask] = language_model_inputs.flatten()
-        else:
-            logger.warning_once(
-                "Expanding inputs for video tokens in InstructBLIPVideo should be done in processing. "
-                "Please follow instruction here (https://gist.github.com/zucchini-nlp/65f22892b054dc0d68228af56fbeaac2) to update your InstructBLIPVideo model. "
-                "Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
-            )
-            inputs_embeds = torch.cat([language_model_inputs, inputs_embeds.to(language_model_inputs.device)], dim=1)
-            attention_mask = torch.cat(
-                [language_attention_mask, attention_mask.to(language_attention_mask.device)], dim=1
-            )
-
-            # add image_embeds length to max_length, so that the final max_length in counted only on token embeds
-            # -1 is to account for the prepended BOS after `generate.`
-            if not self.language_model.config.is_encoder_decoder:
-                generate_kwargs["max_length"] = (
-                    generate_kwargs.get("max_length", 20) + language_model_inputs.shape[1] - 1
-                )
-                generate_kwargs["min_length"] = generate_kwargs.get("min_length", 0) + language_model_inputs.shape[1]
+        special_image_mask = (input_ids == self.config.video_token_index).unsqueeze(-1).expand_as(inputs_embeds)
+        inputs_embeds[special_image_mask] = language_model_inputs.flatten()
 
         outputs = self.language_model.generate(
             inputs_embeds=inputs_embeds,
@@ -1666,21 +1632,4 @@ def generate(
             **generate_kwargs,
         )
 
-        # this is a temporary workaround to be consistent with other generation models and
-        # have BOS as the first token, even though under the hood we are calling LM with embeds
-        if not self.language_model.config.is_encoder_decoder:
-            # the InstructBLIP authors used inconsistent tokenizer/model files during training,
-            # with the tokenizer's bos token being set to </s> which has ID=2,
-            # whereas the model's text config has bos token id = 0
-            bos_token_id = (
-                2
-                if self.config.text_config.architectures[0] == "LLaMAForCausalLM"
-                else self.config.text_config.bos_token_id
-            )
-            bos_tokens = torch.LongTensor([[bos_token_id]]).repeat(batch_size, 1).to(image_embeds.device)
-            if not isinstance(outputs, torch.Tensor):
-                outputs.sequences = torch.cat([bos_tokens, outputs.sequences], dim=-1)
-            else:
-                outputs = torch.cat([bos_tokens, outputs], dim=-1)
-
         return outputs
diff --git a/src/transformers/models/instructblipvideo/modular_instructblipvideo.py b/src/transformers/models/instructblipvideo/modular_instructblipvideo.py
index 2128f25df6625d..5455a79027fb50 100644
--- a/src/transformers/models/instructblipvideo/modular_instructblipvideo.py
+++ b/src/transformers/models/instructblipvideo/modular_instructblipvideo.py
@@ -297,21 +297,8 @@ def forward(
         if attention_mask is None:
             attention_mask = torch.ones_like(input_ids)
 
-        # if the model already has "video_token_index" then the input is expanded to account for image embeds
-        # otherwise we expand manually by concatenating
-        if getattr(self.config, "video_token_index", None) is not None:
-            special_image_mask = (input_ids == self.config.video_token_index).unsqueeze(-1).expand_as(inputs_embeds)
-            inputs_embeds[special_image_mask] = language_model_inputs.flatten()
-        else:
-            logger.warning_once(
-                "Expanding inputs for video tokens in InstructBLIPVideo should be done in processing. "
-                "Please follow instruction here (https://gist.github.com/zucchini-nlp/65f22892b054dc0d68228af56fbeaac2) to update your InstructBLIPVideo model. "
-                "Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
-            )
-            inputs_embeds = torch.cat([language_model_inputs, inputs_embeds.to(language_model_inputs.device)], dim=1)
-            attention_mask = torch.cat(
-                [language_model_attention_mask, attention_mask.to(language_model_attention_mask.device)], dim=1
-            )
+        special_image_mask = (input_ids == self.config.video_token_index).unsqueeze(-1).expand_as(inputs_embeds)
+        inputs_embeds[special_image_mask] = language_model_inputs.flatten()
 
         if self.config.use_decoder_only_language_model:
             outputs = self.language_model(
@@ -444,29 +431,8 @@ def generate(
 
         inputs_embeds = self.get_input_embeddings()(input_ids)
 
-        # if the model already has "video_token_index" then the input is expanded to account for image embeds
-        # otherwise we expand manually by concatenating
-        if getattr(self.config, "video_token_index", None) is not None:
-            special_image_mask = (input_ids == self.config.video_token_index).unsqueeze(-1).expand_as(inputs_embeds)
-            inputs_embeds[special_image_mask] = language_model_inputs.flatten()
-        else:
-            logger.warning_once(
-                "Expanding inputs for video tokens in InstructBLIPVideo should be done in processing. "
-                "Please follow instruction here (https://gist.github.com/zucchini-nlp/65f22892b054dc0d68228af56fbeaac2) to update your InstructBLIPVideo model. "
-                "Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
-            )
-            inputs_embeds = torch.cat([language_model_inputs, inputs_embeds.to(language_model_inputs.device)], dim=1)
-            attention_mask = torch.cat(
-                [language_attention_mask, attention_mask.to(language_attention_mask.device)], dim=1
-            )
-
-            # add image_embeds length to max_length, so that the final max_length in counted only on token embeds
-            # -1 is to account for the prepended BOS after `generate.`
-            if not self.language_model.config.is_encoder_decoder:
-                generate_kwargs["max_length"] = (
-                    generate_kwargs.get("max_length", 20) + language_model_inputs.shape[1] - 1
-                )
-                generate_kwargs["min_length"] = generate_kwargs.get("min_length", 0) + language_model_inputs.shape[1]
+        special_image_mask = (input_ids == self.config.video_token_index).unsqueeze(-1).expand_as(inputs_embeds)
+        inputs_embeds[special_image_mask] = language_model_inputs.flatten()
 
         outputs = self.language_model.generate(
             inputs_embeds=inputs_embeds,
@@ -474,21 +440,4 @@ def generate(
             **generate_kwargs,
         )
 
-        # this is a temporary workaround to be consistent with other generation models and
-        # have BOS as the first token, even though under the hood we are calling LM with embeds
-        if not self.language_model.config.is_encoder_decoder:
-            # the InstructBLIP authors used inconsistent tokenizer/model files during training,
-            # with the tokenizer's bos token being set to </s> which has ID=2,
-            # whereas the model's text config has bos token id = 0
-            bos_token_id = (
-                2
-                if self.config.text_config.architectures[0] == "LLaMAForCausalLM"
-                else self.config.text_config.bos_token_id
-            )
-            bos_tokens = torch.LongTensor([[bos_token_id]]).repeat(batch_size, 1).to(image_embeds.device)
-            if not isinstance(outputs, torch.Tensor):
-                outputs.sequences = torch.cat([bos_tokens, outputs.sequences], dim=-1)
-            else:
-                outputs = torch.cat([bos_tokens, outputs], dim=-1)
-
         return outputs
diff --git a/src/transformers/models/instructblipvideo/processing_instructblipvideo.py b/src/transformers/models/instructblipvideo/processing_instructblipvideo.py
index 3e96d279a42f8d..281b9ceb3e20cc 100644
--- a/src/transformers/models/instructblipvideo/processing_instructblipvideo.py
+++ b/src/transformers/models/instructblipvideo/processing_instructblipvideo.py
@@ -124,29 +124,19 @@ def __call__(
                 **kwargs,
             )
 
-            # if we know how many query tokens, expand text inside processor. We need this hacky manipulation
-            # because BLIP expects image tokens to be at the beginning even before BOS token
-            if self.num_query_tokens is not None and images is not None:
-                text_encoding = {}
-                video_tokens = (
-                    self.video_token.content * self.num_query_tokens * 4
-                )  # InstrucBLIP works with 4 frames only
-                video_token_encoding = self.tokenizer(
-                    [video_tokens] * len(text), add_special_tokens=False, return_tensors=None
-                )
-                for k in _text_encoding:
-                    text_encoding[k] = [
-                        img_encoding + txt_encoding
-                        for img_encoding, txt_encoding in zip(video_token_encoding[k], _text_encoding[k])
-                    ]
-            else:
-                text_encoding = _text_encoding
-                if images is not None:
-                    logger.warning_once(
-                        "Expanding inputs for video tokens in InstructBLIPVideo should be done in processing. "
-                        "Please follow instruction here (https://gist.github.com/zucchini-nlp/65f22892b054dc0d68228af56fbeaac2) to update your InstructBLIPVideo model. "
-                        "Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
-                    )
+            # We need this hacky manipulation because BLIP expects video tokens to be at the beginning even before BOS token
+            text_encoding = {}
+
+            # InstructBLIP works with 4 frames only
+            video_tokens = self.video_token.content * self.num_query_tokens * 4
+            video_token_encoding = self.tokenizer(
+                [video_tokens] * len(text), add_special_tokens=False, return_tensors=None
+            )
+            for k in _text_encoding:
+                text_encoding[k] = [
+                    img_encoding + txt_encoding
+                    for img_encoding, txt_encoding in zip(video_token_encoding[k], _text_encoding[k])
+                ]
 
             # cast to desired return tensors type after concatenating
             text_encoding = BatchEncoding(text_encoding, tensor_type=return_tensors)
diff --git a/src/transformers/models/llava_next_video/modeling_llava_next_video.py b/src/transformers/models/llava_next_video/modeling_llava_next_video.py
index 96f4373afd9ec6..396aac413c1dce 100644
--- a/src/transformers/models/llava_next_video/modeling_llava_next_video.py
+++ b/src/transformers/models/llava_next_video/modeling_llava_next_video.py
@@ -914,25 +914,9 @@ def forward(
                 "You cannot specify both pixel_values and inputs_embeds at the same time, and must specify either one"
             )
 
-        legacy_processing = False
         if inputs_embeds is None:
             inputs_embeds = self.get_input_embeddings()(input_ids)
 
-            # if the number of image/video tokens is more than image embeddings seq length, then prob we expanded it in processing
-            # not very reliable, but we don't expect one to actually pass 500+ images for one prompt
-            img_token_not_enough = (input_ids == self.config.image_token_index).sum(
-                1
-            ).max() < self.config.image_seq_length
-            video_token_not_enough = (input_ids == self.config.video_token_index).sum(
-                1
-            ).max() < self.config.video_seq_length
-            inputs_not_expanded = (img_token_not_enough and pixel_values is not None) or (
-                video_token_not_enough and pixel_values_videos is not None
-            )
-            pixels_present = input_ids.shape[-1] == 1 and (pixel_values is not None or pixel_values_videos is not None)
-            legacy_processing = inputs_not_expanded or pixels_present
-
-        image_features = feature_lens = None
         if pixel_values is not None and pixel_values.size(0) > 0:
             image_features = self.get_image_features(
                 pixel_values,
@@ -947,7 +931,21 @@ def forward(
                 image_newline=self.image_newline,
             )
 
-        video_features = video_feature_lens = None
+            n_image_tokens = (input_ids == self.config.image_token_index).sum().item()
+            n_image_features = image_features.shape[0]
+            if n_image_tokens != n_image_features:
+                raise ValueError(
+                    f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
+                )
+            special_image_mask = (
+                (input_ids == self.config.image_token_index)
+                .unsqueeze(-1)
+                .expand_as(inputs_embeds)
+                .to(inputs_embeds.device)
+            )
+            image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
+            inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
+
         if pixel_values_videos is not None and pixel_values_videos.size(0) > 0:
             video_features = self.get_video_features(
                 pixel_values_videos,
@@ -959,94 +957,20 @@ def forward(
             video_features = torch.cat(video_features, dim=0)
             video_feature_lens = torch.tensor(video_feature_lens, dtype=torch.long, device=video_features.device)
 
-        if legacy_processing:
-            logger.warning_once(
-                "Expanding inputs for image.video tokens in LLaVa-NeXT-Video should be done in processing. "
-                "Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly "
-                "with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. "
-                "Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
-            )
-            if input_ids.shape[1] != 1:
-                iterator = (
-                    (image_features, feature_lens, self.config.image_token_index),
-                    (video_features, video_feature_lens, self.config.video_token_index),
-                )
-                for features, lens, special_token in iterator:
-                    if features is not None:
-                        (
-                            inputs_embeds,
-                            attention_mask,
-                            position_ids,
-                            labels,
-                            input_ids,
-                        ) = self._merge_input_ids_with_image_features(
-                            features,
-                            lens,
-                            inputs_embeds,
-                            input_ids,
-                            attention_mask,
-                            position_ids,
-                            labels=labels,
-                            image_token_index=special_token,
-                        )
-                cache_position = torch.arange(attention_mask.shape[1], device=attention_mask.device)
-            else:
-                # Retrieve the first layer to inspect the logits and mask out the hidden states that are set to 0
-                first_layer_past_key_value = past_key_values[0][0][:, :, :, 0]
-                # Sum all dimensions of head_dim (-2) to avoid random errors such as: https://github.com/huggingface/transformers/pull/28032#issuecomment-1863691941
-                batch_index, non_attended_tokens = torch.where(first_layer_past_key_value.float().sum(-2) == 0)
-                # Get the target length
-                target_length = input_ids.shape[1]
-                past_length = first_layer_past_key_value.shape[-1]
-                extended_attention_mask = torch.ones(
-                    (attention_mask.shape[0], past_length),
-                    dtype=attention_mask.dtype,
-                    device=attention_mask.device,
-                )
-                # Filter out only the tokens that can be un-attended, this can happen
-                # if one uses Llava + Fused modules where the cache on the
-                # first iteration is already big enough, or if one passes custom cache
-                valid_indices = non_attended_tokens < extended_attention_mask.size(-1)
-                new_batch_index = batch_index[valid_indices]
-                new_non_attended_tokens = non_attended_tokens[valid_indices]
-                # Zero-out the places where we don't need to attend
-                extended_attention_mask[new_batch_index, new_non_attended_tokens] = 0
-                attention_mask = torch.cat((extended_attention_mask, attention_mask[:, -target_length:]), dim=1)
-                position_ids = torch.sum(attention_mask, dim=1).unsqueeze(-1) - 1
-                cache_position = torch.arange(attention_mask.shape[1], device=attention_mask.device)[-target_length:]
-
-        # TODO: @raushan retain only the new behavior after v4.47
-        else:
-            if image_features is not None:
-                n_image_tokens = (input_ids == self.config.image_token_index).sum().item()
-                n_image_features = image_features.shape[0]
-                if n_image_tokens != n_image_features:
-                    raise ValueError(
-                        f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
-                    )
-                special_image_mask = (
-                    (input_ids == self.config.image_token_index)
-                    .unsqueeze(-1)
-                    .expand_as(inputs_embeds)
-                    .to(inputs_embeds.device)
-                )
-                image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
-                inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
-            if video_features is not None:
-                n_video_tokens = (input_ids == self.config.video_token_index).sum().item()
-                n_video_features = video_features.shape[0]
-                if n_video_tokens != n_video_features:
-                    raise ValueError(
-                        f"Video features and video tokens do not match: tokens: {n_video_tokens}, features {n_video_features}"
-                    )
-                special_image_mask = (
-                    (input_ids == self.config.video_token_index)
-                    .unsqueeze(-1)
-                    .expand_as(inputs_embeds)
-                    .to(inputs_embeds.device)
+            n_video_tokens = (input_ids == self.config.video_token_index).sum().item()
+            n_video_features = video_features.shape[0]
+            if n_video_tokens != n_video_features:
+                raise ValueError(
+                    f"Video features and video tokens do not match: tokens: {n_video_tokens}, features {n_video_features}"
                 )
-                video_features = video_features.to(inputs_embeds.device, inputs_embeds.dtype)
-                inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, video_features)
+            special_image_mask = (
+                (input_ids == self.config.video_token_index)
+                .unsqueeze(-1)
+                .expand_as(inputs_embeds)
+                .to(inputs_embeds.device)
+            )
+            video_features = video_features.to(inputs_embeds.device, inputs_embeds.dtype)
+            inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, video_features)
 
         outputs = self.language_model(
             attention_mask=attention_mask,
@@ -1110,17 +1034,6 @@ def prepare_inputs_for_generation(
     ):
         # Overwritten -- extra custom processing
 
-        if input_ids is not None:
-            img_token_not_enough = (input_ids == self.config.image_token_index).sum(
-                1
-            ).max() < self.config.image_seq_length
-            video_token_not_enough = (input_ids == self.config.video_token_index).sum(
-                1
-            ).max() < self.config.video_seq_length
-            legacy_processing = (img_token_not_enough and pixel_values is not None) or (
-                video_token_not_enough and pixel_values_videos is not None
-            )
-
         model_inputs = self.language_model.prepare_inputs_for_generation(
             input_ids,
             past_key_values=past_key_values,
@@ -1133,7 +1046,7 @@ def prepare_inputs_for_generation(
 
         # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore
         # Otherwise we need pixel values to be passed to model
-        if legacy_processing or cache_position[0] == 0:
+        if cache_position[0] == 0:
             model_inputs["pixel_values"] = pixel_values
             model_inputs["pixel_values_videos"] = pixel_values_videos
             model_inputs["image_sizes"] = image_sizes
diff --git a/src/transformers/models/video_llava/processing_video_llava.py b/src/transformers/models/video_llava/processing_video_llava.py
index 057324b28bd5b9..e1c041f310bd77 100644
--- a/src/transformers/models/video_llava/processing_video_llava.py
+++ b/src/transformers/models/video_llava/processing_video_llava.py
@@ -40,9 +40,9 @@ class VideoLlavaProcessor(ProcessorMixin):
             The image processor is a required input.
         tokenizer ([`LlamaTokenizerFast`], *optional*):
             The tokenizer is a required input.
-        patch_size (`int`, *optional*):
+        patch_size (`int`, *optional*, defaults to 14):
             Patch size from the vision tower.
-        vision_feature_select_strategy (`str`, *optional*):
+        vision_feature_select_strategy (`str`, *optional*, defaults to `"default"`):
             The feature selection strategy used to select the vision feature from the vision backbone.
             Shoudl be same as in model's config
         image_token (`str`, *optional*, defaults to `"<image>"`):

From c7a33f57001ed5d7ace4031bbfc82159cf20ec2a Mon Sep 17 00:00:00 2001
From: raushan <raushan@huggingface.co>
Date: Fri, 22 Nov 2024 14:53:14 +0100
Subject: [PATCH 07/15] [run-slow] blip_2,instructblip,llava_next,video_llava


From cc88f180666b9ef5310391927c00d55dc2bfc794 Mon Sep 17 00:00:00 2001
From: raushan <raushan@huggingface.co>
Date: Fri, 22 Nov 2024 15:56:26 +0100
Subject: [PATCH 08/15] green CI hopefully

---
 .../models/blip_2/modeling_blip_2.py          |  8 +--
 .../models/blip_2/processing_blip_2.py        | 28 ++++-------
 .../instructblip/modeling_instructblip.py     |  6 ---
 .../instructblip/processing_instructblip.py   | 24 +++------
 .../modeling_instructblipvideo.py             |  6 ---
 .../modular_instructblipvideo.py              |  6 ---
 .../processing_instructblipvideo.py           | 30 +++++-------
 tests/models/blip_2/test_processor_blip_2.py  | 27 ++++------
 .../test_processor_instructblip.py            | 49 +++++++++----------
 .../test_processor_instructblipvideo.py       | 49 +++++++++----------
 tests/models/llava/test_processor_llava.py    |  2 +-
 .../llava_next/test_processor_llava_next.py   |  6 +--
 .../video_llava/test_modeling_video_llava.py  | 18 +------
 tests/test_processing_common.py               | 27 ++++++----
 14 files changed, 106 insertions(+), 180 deletions(-)

diff --git a/src/transformers/models/blip_2/modeling_blip_2.py b/src/transformers/models/blip_2/modeling_blip_2.py
index 19aa5e748b0d94..d74bed88dafd89 100644
--- a/src/transformers/models/blip_2/modeling_blip_2.py
+++ b/src/transformers/models/blip_2/modeling_blip_2.py
@@ -2179,9 +2179,7 @@ def forward(
 
         # step 3: use the language model, conditioned on the query outputs and the prompt
         language_model_inputs = self.language_projection(query_output)
-        language_model_attention_mask = torch.ones(
-            language_model_inputs.size()[:-1], dtype=torch.long, device=language_model_inputs.device
-        )
+
         inputs_embeds = self.language_model.get_input_embeddings()(input_ids)
         if attention_mask is None:
             attention_mask = torch.ones_like(input_ids)
@@ -2283,9 +2281,7 @@ def generate(
         query_output = query_outputs.last_hidden_state
 
         language_model_inputs = self.language_projection(query_output)
-        language_attention_mask = torch.ones(
-            language_model_inputs.size()[:-1], dtype=torch.long, device=language_model_inputs.device
-        )
+
         if input_ids is None:
             input_ids = (
                 torch.LongTensor([[self.config.text_config.bos_token_id]])
diff --git a/src/transformers/models/blip_2/processing_blip_2.py b/src/transformers/models/blip_2/processing_blip_2.py
index a0104fe1514394..872f798e4257a7 100644
--- a/src/transformers/models/blip_2/processing_blip_2.py
+++ b/src/transformers/models/blip_2/processing_blip_2.py
@@ -78,6 +78,10 @@ def __init__(self, image_processor, tokenizer, num_query_tokens=None, **kwargs):
         tokenizer.add_tokens([self.image_token], special_tokens=True)
         self.num_query_tokens = num_query_tokens
 
+        # We'll add the BOS manually as it has to be after image tokens
+        tokenizer.add_bos_token = False
+        self.bos_token = tokenizer.bos_token
+
         super().__init__(image_processor, tokenizer)
 
     def __call__(
@@ -115,11 +119,9 @@ def __call__(
             tokenizer_init_kwargs=self.tokenizer.init_kwargs,
             **kwargs,
         )
+
         # BC for explicit return_tensors
-        if "return_tensors" in output_kwargs["common_kwargs"]:
-            return_tensors = output_kwargs["common_kwargs"].pop("return_tensors", None)
-        else:
-            return_tensors = None
+        return_tensors = output_kwargs["common_kwargs"].pop("return_tensors", None)
         encoding = BatchFeature(tensor_type=return_tensors)
         if text is not None:
             if isinstance(text, str):
@@ -127,26 +129,14 @@ def __call__(
             elif not isinstance(text, list) and not isinstance(text[0], str):
                 raise ValueError("Invalid input text. Please provide a string, or a list of strings")
 
-            text_encoding = {}
-
-            return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
-            _text_encoding = self.tokenizer(text, **output_kwargs["text_kwargs"], return_tensors=None)
-            output_kwargs["text_kwargs"]["return_tensors"] = return_tensors
-
             # We need this hacky manipulation because BLIP expects image tokens to be at the beginning even before BOS token
             image_tokens = self.image_token.content * self.num_query_tokens
-            image_token_encoding = self.tokenizer(
-                [image_tokens] * len(text), add_special_tokens=False, return_tensors=None
-            )
-            for k in _text_encoding:
-                text_encoding[k] = [
-                    img_encoding + txt_encoding
-                    for img_encoding, txt_encoding in zip(image_token_encoding[k], _text_encoding[k])
-                ]
+            text = [f"{image_tokens}{self.bos_token}{sample}" for sample in text]
+            text_encoding = self.tokenizer(text, **output_kwargs["text_kwargs"])
 
             # cast to desired return tensors type
             encoding.update(BatchEncoding(text_encoding, tensor_type=return_tensors))
-        
+
         # add pixel_values encoding. If we also have text_encoding, update image encoding and return it.
         # else, return the text encoding.
         if images is not None:
diff --git a/src/transformers/models/instructblip/modeling_instructblip.py b/src/transformers/models/instructblip/modeling_instructblip.py
index 5490d085b1cb1a..13f0a1e29c2efa 100644
--- a/src/transformers/models/instructblip/modeling_instructblip.py
+++ b/src/transformers/models/instructblip/modeling_instructblip.py
@@ -1454,9 +1454,6 @@ def forward(
 
         # step 3: use the language model, conditioned on the query outputs and the prompt
         language_model_inputs = self.language_projection(query_output)
-        language_model_attention_mask = torch.ones(
-            language_model_inputs.size()[:-1], dtype=torch.long, device=language_model_inputs.device
-        )
 
         inputs_embeds = self.language_model.get_input_embeddings()(input_ids)
         if attention_mask is None:
@@ -1573,9 +1570,6 @@ def generate(
         query_output = query_outputs.last_hidden_state[:, : query_tokens.size(1), :]
 
         language_model_inputs = self.language_projection(query_output)
-        language_attention_mask = torch.ones(
-            language_model_inputs.size()[:-1], dtype=torch.long, device=language_model_inputs.device
-        )
 
         if input_ids is None:
             input_ids = (
diff --git a/src/transformers/models/instructblip/processing_instructblip.py b/src/transformers/models/instructblip/processing_instructblip.py
index 875974406f1c8d..4aadcd0c555359 100644
--- a/src/transformers/models/instructblip/processing_instructblip.py
+++ b/src/transformers/models/instructblip/processing_instructblip.py
@@ -24,7 +24,6 @@
 from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
 from ...tokenization_utils_base import (
     AddedToken,
-    BatchEncoding,
     PreTokenizedInput,
     TextInput,
 )
@@ -81,6 +80,10 @@ def __init__(self, image_processor, tokenizer, qformer_tokenizer, num_query_toke
         self.image_token = AddedToken("<image>", normalized=False, special=True)
         tokenizer.add_tokens([self.image_token], special_tokens=True)
         self.num_query_tokens = num_query_tokens
+
+        # We'll add the BOS manually as it has to be after image tokens
+        tokenizer.add_bos_token = False
+        self.bos_token = tokenizer.bos_token
         super().__init__(image_processor, tokenizer, qformer_tokenizer)
 
     def __call__(
@@ -122,25 +125,10 @@ def __call__(
             elif not isinstance(text, list) and not isinstance(text[0], str):
                 raise ValueError("Invalid input text. Please provide a string, or a list of strings")
 
-            # we have to concatenate lists - so we keep track of return_tensors here
-            return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
-            _text_encoding = self.tokenizer(text, **output_kwargs["text_kwargs"], return_tensors=None)
-            output_kwargs["text_kwargs"]["return_tensors"] = return_tensors
-
             # We need this hacky manipulation because BLIP expects image tokens to be at the beginning even before BOS token
-            text_encoding = {}
             image_tokens = self.image_token.content * self.num_query_tokens
-            image_token_encoding = self.tokenizer(
-                [image_tokens] * len(text), add_special_tokens=False, return_tensors=None
-            )
-            for k in _text_encoding:
-                text_encoding[k] = [
-                    img_encoding + txt_encoding
-                    for img_encoding, txt_encoding in zip(image_token_encoding[k], _text_encoding[k])
-                ]
-
-            # cast to desired return tensors type after concatenating
-            text_encoding = BatchEncoding(text_encoding, tensor_type=return_tensors)
+            text_with_images = [f"{image_tokens}{self.bos_token}{sample}" for sample in text]
+            text_encoding = self.tokenizer(text_with_images, **output_kwargs["text_kwargs"])
 
             encoding.update(text_encoding)
             qformer_text_encoding = self.qformer_tokenizer(text, **output_kwargs["text_kwargs"])
diff --git a/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py b/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py
index 8fc4e5f570ec52..2e185e6fd64f8c 100644
--- a/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py
+++ b/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py
@@ -1481,9 +1481,6 @@ def forward(
 
         # unbatch inputs back, each video-frame gets `num_query_tokens` seq length
         language_model_inputs = language_model_inputs.reshape(batch_size, self.config.num_query_tokens * frames, -1)
-        language_model_attention_mask = torch.ones(
-            language_model_inputs.size()[:-1], dtype=torch.long, device=language_model_inputs.device
-        )
 
         inputs_embeds = self.language_model.get_input_embeddings()(input_ids)
         if attention_mask is None:
@@ -1608,9 +1605,6 @@ def generate(
 
         # unbatch the embeddings back by moving frames to seq-len
         language_model_inputs = language_model_inputs.reshape(batch_size, self.config.num_query_tokens * frames, -1)
-        language_attention_mask = torch.ones(
-            language_model_inputs.size()[:-1], dtype=torch.long, device=language_model_inputs.device
-        )
 
         if input_ids is None:
             input_ids = (
diff --git a/src/transformers/models/instructblipvideo/modular_instructblipvideo.py b/src/transformers/models/instructblipvideo/modular_instructblipvideo.py
index 5455a79027fb50..640979d441370e 100644
--- a/src/transformers/models/instructblipvideo/modular_instructblipvideo.py
+++ b/src/transformers/models/instructblipvideo/modular_instructblipvideo.py
@@ -289,9 +289,6 @@ def forward(
 
         # unbatch inputs back, each video-frame gets `num_query_tokens` seq length
         language_model_inputs = language_model_inputs.reshape(batch_size, self.config.num_query_tokens * frames, -1)
-        language_model_attention_mask = torch.ones(
-            language_model_inputs.size()[:-1], dtype=torch.long, device=language_model_inputs.device
-        )
 
         inputs_embeds = self.language_model.get_input_embeddings()(input_ids)
         if attention_mask is None:
@@ -416,9 +413,6 @@ def generate(
 
         # unbatch the embeddings back by moving frames to seq-len
         language_model_inputs = language_model_inputs.reshape(batch_size, self.config.num_query_tokens * frames, -1)
-        language_attention_mask = torch.ones(
-            language_model_inputs.size()[:-1], dtype=torch.long, device=language_model_inputs.device
-        )
 
         if input_ids is None:
             input_ids = (
diff --git a/src/transformers/models/instructblipvideo/processing_instructblipvideo.py b/src/transformers/models/instructblipvideo/processing_instructblipvideo.py
index 281b9ceb3e20cc..6cc8bb1bfb774c 100644
--- a/src/transformers/models/instructblipvideo/processing_instructblipvideo.py
+++ b/src/transformers/models/instructblipvideo/processing_instructblipvideo.py
@@ -24,7 +24,6 @@
 from ...processing_utils import ProcessorMixin
 from ...tokenization_utils_base import (
     AddedToken,
-    BatchEncoding,
     PaddingStrategy,
     PreTokenizedInput,
     TextInput,
@@ -66,6 +65,10 @@ def __init__(self, image_processor, tokenizer, qformer_tokenizer, num_query_toke
         self.video_token = AddedToken("<video>", normalized=False, special=True)
         tokenizer.add_tokens([self.video_token], special_tokens=True)
         self.num_query_tokens = num_query_tokens
+
+        # We'll add the BOS manually as it has to be after video tokens
+        tokenizer.add_bos_token = False
+        self.bos_token = tokenizer.bos_token
         super().__init__(image_processor, tokenizer, qformer_tokenizer)
 
     def __call__(
@@ -105,8 +108,13 @@ def __call__(
             elif not isinstance(text, list) and not isinstance(text[0], str):
                 raise ValueError("Invalid input text. Please provide a string, or a list of strings")
 
-            _text_encoding = self.tokenizer(
-                text=text,
+            # We need this hacky manipulation because BLIP expects video tokens to be at the beginning, even before the BOS token
+            # InstructBLIP works with 4 frames only
+            video_tokens = self.video_token.content * self.num_query_tokens * 4
+            text_with_videos = [f"{video_tokens}{self.bos_token}{sample}" for sample in text]
+
+            text_encoding = self.tokenizer(
+                text=text_with_videos,
                 add_special_tokens=add_special_tokens,
                 padding=padding,
                 truncation=truncation,
@@ -124,22 +132,6 @@ def __call__(
                 **kwargs,
             )
 
-            # We need this hacky manipulation because BLIP expects image tokens to be at the beginning even before BOS token
-            text_encoding = {}
-
-            # InstrucBLIP works with 4 frames only
-            video_tokens = self.video_token.content * self.num_query_tokens * 4
-            video_token_encoding = self.tokenizer(
-                [video_tokens] * len(text), add_special_tokens=False, return_tensors=None
-            )
-            for k in _text_encoding:
-                text_encoding[k] = [
-                    img_encoding + txt_encoding
-                    for img_encoding, txt_encoding in zip(video_token_encoding[k], _text_encoding[k])
-                ]
-
-            # cast to desired return tensors type after concatenating
-            text_encoding = BatchEncoding(text_encoding, tensor_type=return_tensors)
             encoding.update(text_encoding)
             qformer_text_encoding = self.qformer_tokenizer(
                 text=text,
diff --git a/tests/models/blip_2/test_processor_blip_2.py b/tests/models/blip_2/test_processor_blip_2.py
index 7eb5bedc2be7a7..54f9de09861b78 100644
--- a/tests/models/blip_2/test_processor_blip_2.py
+++ b/tests/models/blip_2/test_processor_blip_2.py
@@ -47,6 +47,9 @@ def get_tokenizer(self, **kwargs):
     def get_image_processor(self, **kwargs):
         return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor
 
+    def prepare_processor_dict(self):
+        return {"num_query_tokens": 1}
+
     def tearDown(self):
         shutil.rmtree(self.tmpdirname)
 
@@ -81,26 +84,12 @@ def test_image_processor(self):
         for key in input_feat_extract.keys():
             self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2)
 
-    def test_tokenizer(self):
-        image_processor = self.get_image_processor()
-        tokenizer = self.get_tokenizer()
-
-        processor = Blip2Processor(tokenizer=tokenizer, image_processor=image_processor)
-
-        input_str = "lower newer"
-
-        encoded_processor = processor(text=input_str)
-
-        encoded_tok = tokenizer(input_str, return_token_type_ids=False)
-
-        for key in encoded_tok.keys():
-            self.assertListEqual(encoded_tok[key], encoded_processor[key][0])
-
     def test_processor(self):
         image_processor = self.get_image_processor()
         tokenizer = self.get_tokenizer()
+        processor_kwargs = self.prepare_processor_dict()
 
-        processor = Blip2Processor(tokenizer=tokenizer, image_processor=image_processor)
+        processor = Blip2Processor(tokenizer=tokenizer, image_processor=image_processor, **processor_kwargs)
 
         input_str = "lower newer"
         image_input = self.prepare_image_inputs()
@@ -116,8 +105,9 @@ def test_processor(self):
     def test_tokenizer_decode(self):
         image_processor = self.get_image_processor()
         tokenizer = self.get_tokenizer()
+        processor_kwargs = self.prepare_processor_dict()
 
-        processor = Blip2Processor(tokenizer=tokenizer, image_processor=image_processor)
+        processor = Blip2Processor(tokenizer=tokenizer, image_processor=image_processor, **processor_kwargs)
 
         predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]]
 
@@ -129,8 +119,9 @@ def test_tokenizer_decode(self):
     def test_model_input_names(self):
         image_processor = self.get_image_processor()
         tokenizer = self.get_tokenizer()
+        processor_kwargs = self.prepare_processor_dict()
 
-        processor = Blip2Processor(tokenizer=tokenizer, image_processor=image_processor)
+        processor = Blip2Processor(tokenizer=tokenizer, image_processor=image_processor, **processor_kwargs)
 
         input_str = "lower newer"
         image_input = self.prepare_image_inputs()
diff --git a/tests/models/instructblip/test_processor_instructblip.py b/tests/models/instructblip/test_processor_instructblip.py
index ffec4b01112c2f..72586924992cd3 100644
--- a/tests/models/instructblip/test_processor_instructblip.py
+++ b/tests/models/instructblip/test_processor_instructblip.py
@@ -58,6 +58,9 @@ def get_image_processor(self, **kwargs):
     def get_qformer_tokenizer(self, **kwargs):
         return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).qformer_tokenizer
 
+    def prepare_processor_dict(self):
+        return {"num_query_tokens": 1}
+
     def tearDown(self):
         shutil.rmtree(self.tmpdirname)
 
@@ -87,9 +90,13 @@ def test_image_processor(self):
         image_processor = self.get_image_processor()
         tokenizer = self.get_tokenizer()
         qformer_tokenizer = self.get_qformer_tokenizer()
+        processor_kwargs = self.prepare_processor_dict()
 
         processor = InstructBlipProcessor(
-            tokenizer=tokenizer, image_processor=image_processor, qformer_tokenizer=qformer_tokenizer
+            tokenizer=tokenizer,
+            image_processor=image_processor,
+            qformer_tokenizer=qformer_tokenizer,
+            **processor_kwargs,
         )
 
         image_input = self.prepare_image_inputs()
@@ -100,35 +107,17 @@ def test_image_processor(self):
         for key in input_feat_extract.keys():
             self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2)
 
-    def test_tokenizer(self):
-        image_processor = self.get_image_processor()
-        tokenizer = self.get_tokenizer()
-        qformer_tokenizer = self.get_qformer_tokenizer()
-
-        processor = InstructBlipProcessor(
-            tokenizer=tokenizer, image_processor=image_processor, qformer_tokenizer=qformer_tokenizer
-        )
-
-        input_str = ["lower newer"]
-
-        encoded_processor = processor(text=input_str)
-
-        encoded_tokens = tokenizer(input_str, return_token_type_ids=False)
-        encoded_tokens_qformer = qformer_tokenizer(input_str, return_token_type_ids=False)
-
-        for key in encoded_tokens.keys():
-            self.assertListEqual(encoded_tokens[key], encoded_processor[key])
-
-        for key in encoded_tokens_qformer.keys():
-            self.assertListEqual(encoded_tokens_qformer[key], encoded_processor["qformer_" + key])
-
     def test_processor(self):
         image_processor = self.get_image_processor()
         tokenizer = self.get_tokenizer()
         qformer_tokenizer = self.get_qformer_tokenizer()
+        processor_kwargs = self.prepare_processor_dict()
 
         processor = InstructBlipProcessor(
-            tokenizer=tokenizer, image_processor=image_processor, qformer_tokenizer=qformer_tokenizer
+            tokenizer=tokenizer,
+            image_processor=image_processor,
+            qformer_tokenizer=qformer_tokenizer,
+            **processor_kwargs,
         )
 
         input_str = "lower newer"
@@ -149,9 +138,13 @@ def test_tokenizer_decode(self):
         image_processor = self.get_image_processor()
         tokenizer = self.get_tokenizer()
         qformer_tokenizer = self.get_qformer_tokenizer()
+        processor_kwargs = self.prepare_processor_dict()
 
         processor = InstructBlipProcessor(
-            tokenizer=tokenizer, image_processor=image_processor, qformer_tokenizer=qformer_tokenizer
+            tokenizer=tokenizer,
+            image_processor=image_processor,
+            qformer_tokenizer=qformer_tokenizer,
+            **processor_kwargs,
         )
 
         predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]]
@@ -165,9 +158,13 @@ def test_model_input_names(self):
         image_processor = self.get_image_processor()
         tokenizer = self.get_tokenizer()
         qformer_tokenizer = self.get_qformer_tokenizer()
+        processor_kwargs = self.prepare_processor_dict()
 
         processor = InstructBlipProcessor(
-            tokenizer=tokenizer, image_processor=image_processor, qformer_tokenizer=qformer_tokenizer
+            tokenizer=tokenizer,
+            image_processor=image_processor,
+            qformer_tokenizer=qformer_tokenizer,
+            **processor_kwargs,
         )
 
         input_str = "lower newer"
diff --git a/tests/models/instructblipvideo/test_processor_instructblipvideo.py b/tests/models/instructblipvideo/test_processor_instructblipvideo.py
index d613d878223213..443a6e12b82338 100644
--- a/tests/models/instructblipvideo/test_processor_instructblipvideo.py
+++ b/tests/models/instructblipvideo/test_processor_instructblipvideo.py
@@ -59,6 +59,9 @@ def get_image_processor(self, **kwargs):
     def get_qformer_tokenizer(self, **kwargs):
         return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).qformer_tokenizer
 
+    def prepare_processor_dict(self):
+        return {"num_query_tokens": 1}
+
     def tearDown(self):
         shutil.rmtree(self.tmpdirname)
 
@@ -88,9 +91,13 @@ def test_image_processor(self):
         image_processor = self.get_image_processor()
         tokenizer = self.get_tokenizer()
         qformer_tokenizer = self.get_qformer_tokenizer()
+        processor_kwargs = self.prepare_processor_dict()
 
         processor = InstructBlipVideoProcessor(
-            tokenizer=tokenizer, image_processor=image_processor, qformer_tokenizer=qformer_tokenizer
+            tokenizer=tokenizer,
+            image_processor=image_processor,
+            qformer_tokenizer=qformer_tokenizer,
+            **processor_kwargs,
         )
 
         image_input = self.prepare_image_inputs()
@@ -101,35 +108,17 @@ def test_image_processor(self):
         for key in input_feat_extract.keys():
             self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2)
 
-    def test_tokenizer(self):
-        image_processor = self.get_image_processor()
-        tokenizer = self.get_tokenizer()
-        qformer_tokenizer = self.get_qformer_tokenizer()
-
-        processor = InstructBlipVideoProcessor(
-            tokenizer=tokenizer, image_processor=image_processor, qformer_tokenizer=qformer_tokenizer
-        )
-
-        input_str = ["lower newer"]
-
-        encoded_processor = processor(text=input_str)
-
-        encoded_tokens = tokenizer(input_str, return_token_type_ids=False)
-        encoded_tokens_qformer = qformer_tokenizer(input_str, return_token_type_ids=False)
-
-        for key in encoded_tokens.keys():
-            self.assertListEqual(encoded_tokens[key], encoded_processor[key])
-
-        for key in encoded_tokens_qformer.keys():
-            self.assertListEqual(encoded_tokens_qformer[key], encoded_processor["qformer_" + key])
-
     def test_processor(self):
         image_processor = self.get_image_processor()
         tokenizer = self.get_tokenizer()
         qformer_tokenizer = self.get_qformer_tokenizer()
+        processor_kwargs = self.prepare_processor_dict()
 
         processor = InstructBlipVideoProcessor(
-            tokenizer=tokenizer, image_processor=image_processor, qformer_tokenizer=qformer_tokenizer
+            tokenizer=tokenizer,
+            image_processor=image_processor,
+            qformer_tokenizer=qformer_tokenizer,
+            **processor_kwargs,
         )
 
         input_str = "lower newer"
@@ -150,9 +139,13 @@ def test_tokenizer_decode(self):
         image_processor = self.get_image_processor()
         tokenizer = self.get_tokenizer()
         qformer_tokenizer = self.get_qformer_tokenizer()
+        processor_kwargs = self.prepare_processor_dict()
 
         processor = InstructBlipVideoProcessor(
-            tokenizer=tokenizer, image_processor=image_processor, qformer_tokenizer=qformer_tokenizer
+            tokenizer=tokenizer,
+            image_processor=image_processor,
+            qformer_tokenizer=qformer_tokenizer,
+            **processor_kwargs,
         )
 
         predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]]
@@ -166,9 +159,13 @@ def test_model_input_names(self):
         image_processor = self.get_image_processor()
         tokenizer = self.get_tokenizer()
         qformer_tokenizer = self.get_qformer_tokenizer()
+        processor_kwargs = self.prepare_processor_dict()
 
         processor = InstructBlipVideoProcessor(
-            tokenizer=tokenizer, image_processor=image_processor, qformer_tokenizer=qformer_tokenizer
+            tokenizer=tokenizer,
+            image_processor=image_processor,
+            qformer_tokenizer=qformer_tokenizer,
+            **processor_kwargs,
         )
 
         input_str = "lower newer"
diff --git a/tests/models/llava/test_processor_llava.py b/tests/models/llava/test_processor_llava.py
index d3a66a16df9a64..3e6c1a9a969f0b 100644
--- a/tests/models/llava/test_processor_llava.py
+++ b/tests/models/llava/test_processor_llava.py
@@ -50,7 +50,7 @@ def tearDown(self):
         shutil.rmtree(self.tmpdirname)
 
     def prepare_processor_dict(self):
-        return {"chat_template": "dummy_template"}
+        return {"chat_template": "dummy_template", "patch_size": 3, "vision_feature_select_strategy": "default"}
 
     @unittest.skip(
         "Skip because the model has no processor kwargs except for chat template and"
diff --git a/tests/models/llava_next/test_processor_llava_next.py b/tests/models/llava_next/test_processor_llava_next.py
index 45faa24526305c..234e4791100054 100644
--- a/tests/models/llava_next/test_processor_llava_next.py
+++ b/tests/models/llava_next/test_processor_llava_next.py
@@ -27,7 +27,7 @@
 
 
 if is_vision_available():
-    from transformers import CLIPImageProcessor
+    from transformers import LlavaNextImageProcessor
 
 
 @require_vision
@@ -37,7 +37,7 @@ class LlavaNextProcessorTest(ProcessorTesterMixin, unittest.TestCase):
     def setUp(self):
         self.tmpdirname = tempfile.mkdtemp()
 
-        image_processor = CLIPImageProcessor()
+        image_processor = LlavaNextImageProcessor()
         tokenizer = LlamaTokenizerFast.from_pretrained("huggyllama/llama-7b")
         processor_kwargs = self.prepare_processor_dict()
         processor = LlavaNextProcessor(image_processor, tokenizer, **processor_kwargs)
@@ -50,7 +50,7 @@ def get_image_processor(self, **kwargs):
         return LlavaNextProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor
 
     def prepare_processor_dict(self):
-        return {"chat_template": "dummy_template"}
+        return {"chat_template": "dummy_template", "patch_size": 3, "vision_feature_select_strategy": "default"}
 
     @unittest.skip(
         "Skip because the model has no processor kwargs except for chat template and"
diff --git a/tests/models/video_llava/test_modeling_video_llava.py b/tests/models/video_llava/test_modeling_video_llava.py
index fd4c49f4a6966d..c872693fbf704b 100644
--- a/tests/models/video_llava/test_modeling_video_llava.py
+++ b/tests/models/video_llava/test_modeling_video_llava.py
@@ -184,22 +184,6 @@ def prepare_config_and_inputs_for_common(self):
         }
         return config, inputs_dict
 
-    def prepare_config_and_inputs_for_batched_test(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        config, _, pixel_values_videos = config_and_inputs
-        input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 1) + 1
-        attention_mask = input_ids.ne(1).to(torch_device)
-
-        # make sure no other special tokens are set
-        input_ids[(input_ids == 0) | (input_ids == 1)] = 3
-        input_ids[:, 0] = config.video_token_index
-        inputs_dict = {
-            "pixel_values_videos": pixel_values_videos,
-            "input_ids": input_ids,
-            "attention_mask": attention_mask,
-        }
-        return config, inputs_dict
-
 
 @require_torch
 class VideoLlavaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
@@ -332,7 +316,7 @@ def recursive_check(batched_object, single_row_object, model_name, key):
                     ),
                 )
 
-        config, batched_input = self.model_tester.prepare_config_and_inputs_for_batched_test()
+        config, batched_input = self.model_tester.prepare_config_and_inputs_for_common()
 
         for model_class in self.all_model_classes:
             config.output_hidden_states = True
diff --git a/tests/test_processing_common.py b/tests/test_processing_common.py
index 9f0d88089129b8..de3378252b5d56 100644
--- a/tests/test_processing_common.py
+++ b/tests/test_processing_common.py
@@ -173,8 +173,9 @@ def test_tokenizer_defaults_preserved_by_kwargs(self):
             self.skipTest(f"image_processor attribute not present in {self.processor_class}")
         processor_components = self.prepare_components()
         processor_components["tokenizer"] = self.get_component("tokenizer", max_length=117, padding="max_length")
+        processor_kwargs = self.prepare_processor_dict()
 
-        processor = self.processor_class(**processor_components)
+        processor = self.processor_class(**processor_components, **processor_kwargs)
         self.skip_processor_without_typed_kwargs(processor)
         input_str = self.prepare_text_inputs()
         image_input = self.prepare_image_inputs()
@@ -194,8 +195,9 @@ def test_image_processor_defaults_preserved_by_image_kwargs(self):
             "image_processor", do_rescale=True, rescale_factor=-1
         )
         processor_components["tokenizer"] = self.get_component("tokenizer", max_length=117, padding="max_length")
+        processor_kwargs = self.prepare_processor_dict()
 
-        processor = self.processor_class(**processor_components)
+        processor = self.processor_class(**processor_components, **processor_kwargs)
         self.skip_processor_without_typed_kwargs(processor)
 
         input_str = self.prepare_text_inputs()
@@ -209,8 +211,9 @@ def test_kwargs_overrides_default_tokenizer_kwargs(self):
             self.skipTest(f"image_processor attribute not present in {self.processor_class}")
         processor_components = self.prepare_components()
         processor_components["tokenizer"] = self.get_component("tokenizer", padding="longest")
+        processor_kwargs = self.prepare_processor_dict()
 
-        processor = self.processor_class(**processor_components)
+        processor = self.processor_class(**processor_components, **processor_kwargs)
         self.skip_processor_without_typed_kwargs(processor)
         input_str = self.prepare_text_inputs()
         image_input = self.prepare_image_inputs()
@@ -227,8 +230,9 @@ def test_kwargs_overrides_default_image_processor_kwargs(self):
             "image_processor", do_rescale=True, rescale_factor=1
         )
         processor_components["tokenizer"] = self.get_component("tokenizer", max_length=117, padding="max_length")
+        processor_kwargs = self.prepare_processor_dict()
 
-        processor = self.processor_class(**processor_components)
+        processor = self.processor_class(**processor_components, **processor_kwargs)
         self.skip_processor_without_typed_kwargs(processor)
 
         input_str = self.prepare_text_inputs()
@@ -241,7 +245,8 @@ def test_unstructured_kwargs(self):
         if "image_processor" not in self.processor_class.attributes:
             self.skipTest(f"image_processor attribute not present in {self.processor_class}")
         processor_components = self.prepare_components()
-        processor = self.processor_class(**processor_components)
+        processor_kwargs = self.prepare_processor_dict()
+        processor = self.processor_class(**processor_components, **processor_kwargs)
         self.skip_processor_without_typed_kwargs(processor)
 
         input_str = self.prepare_text_inputs()
@@ -263,7 +268,8 @@ def test_unstructured_kwargs_batched(self):
         if "image_processor" not in self.processor_class.attributes:
             self.skipTest(f"image_processor attribute not present in {self.processor_class}")
         processor_components = self.prepare_components()
-        processor = self.processor_class(**processor_components)
+        processor_kwargs = self.prepare_processor_dict()
+        processor = self.processor_class(**processor_components, **processor_kwargs)
         self.skip_processor_without_typed_kwargs(processor)
 
         input_str = self.prepare_text_inputs(batch_size=2)
@@ -288,7 +294,8 @@ def test_doubly_passed_kwargs(self):
         if "image_processor" not in self.processor_class.attributes:
             self.skipTest(f"image_processor attribute not present in {self.processor_class}")
         processor_components = self.prepare_components()
-        processor = self.processor_class(**processor_components)
+        processor_kwargs = self.prepare_processor_dict()
+        processor = self.processor_class(**processor_components, **processor_kwargs)
         self.skip_processor_without_typed_kwargs(processor)
 
         input_str = [self.prepare_text_inputs()]
@@ -306,7 +313,8 @@ def test_structured_kwargs_nested(self):
         if "image_processor" not in self.processor_class.attributes:
             self.skipTest(f"image_processor attribute not present in {self.processor_class}")
         processor_components = self.prepare_components()
-        processor = self.processor_class(**processor_components)
+        processor_kwargs = self.prepare_processor_dict()
+        processor = self.processor_class(**processor_components, **processor_kwargs)
         self.skip_processor_without_typed_kwargs(processor)
 
         input_str = self.prepare_text_inputs()
@@ -329,7 +337,8 @@ def test_structured_kwargs_nested_from_dict(self):
         if "image_processor" not in self.processor_class.attributes:
             self.skipTest(f"image_processor attribute not present in {self.processor_class}")
         processor_components = self.prepare_components()
-        processor = self.processor_class(**processor_components)
+        processor_kwargs = self.prepare_processor_dict()
+        processor = self.processor_class(**processor_components, **processor_kwargs)
         self.skip_processor_without_typed_kwargs(processor)
         input_str = self.prepare_text_inputs()
         image_input = self.prepare_image_inputs()

From 38d118f35df1c7a985f40d3f8b94d8177bf89fc3 Mon Sep 17 00:00:00 2001
From: raushan <raushan@huggingface.co>
Date: Sun, 24 Nov 2024 12:32:20 +0100
Subject: [PATCH 09/15] resolve leftover merge conflict markers in BLIP generate

---
 .../models/blip_2/modeling_blip_2.py          | 35 +------------------
 .../instructblip/modeling_instructblip.py     | 32 -----------------
 .../modeling_instructblipvideo.py             | 10 +++---
 .../modular_instructblipvideo.py              | 10 +++---
 tests/generation/test_utils.py                |  1 +
 5 files changed, 12 insertions(+), 76 deletions(-)

diff --git a/src/transformers/models/blip_2/modeling_blip_2.py b/src/transformers/models/blip_2/modeling_blip_2.py
index 3a8ff7bff4870a..b7c2512785f75f 100644
--- a/src/transformers/models/blip_2/modeling_blip_2.py
+++ b/src/transformers/models/blip_2/modeling_blip_2.py
@@ -2300,48 +2300,15 @@ def generate(
         if attention_mask is None:
             attention_mask = torch.ones_like(input_ids)
 
-<<<<<<< HEAD
         special_image_mask = (input_ids == self.config.image_token_index).unsqueeze(-1).expand_as(inputs_embeds)
         inputs_embeds[special_image_mask] = language_model_inputs.flatten()
 
-        outputs = self.language_model.generate(
-            inputs_embeds=inputs_embeds,
-            attention_mask=attention_mask,
-            **generate_kwargs,
-        )
-
-=======
-        # if the model already has "image_token_index" then the input is expanded to account for image embeds
-        # otherwise we expand manually by concatenating
-        if getattr(self.config, "image_token_index", None) is not None:
-            special_image_mask = (input_ids == self.config.image_token_index).unsqueeze(-1).expand_as(inputs_embeds)
-            inputs_embeds[special_image_mask] = language_model_inputs.flatten()
-        else:
-            logger.warning_once(
-                "Expanding inputs for image tokens in BLIP-2 should be done in processing. "
-                "Please follow instruction here (https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042) to update your BLIP-2 model. "
-                "Using processors without these attributes in the config is deprecated and will throw an error in v4.50."
-            )
-            inputs_embeds = torch.cat([language_model_inputs, inputs_embeds.to(language_model_inputs.device)], dim=1)
-            attention_mask = torch.cat(
-                [language_attention_mask, attention_mask.to(language_attention_mask.device)], dim=1
-            )
-
-            # add image_embeds length to max_length, so that the final max_length in counted only on token embeds
-            # -1 is to account for the prepended BOS after `generate.`
-            # TODO (joao, raushan): refactor `generate` to avoid these operations with VLMs
-            if not self.language_model.config.is_encoder_decoder:
-                generate_kwargs["max_length"] = (
-                    generate_kwargs.get("max_length", 20) + language_model_inputs.shape[1] - 1
-                )
-                generate_kwargs["min_length"] = generate_kwargs.get("min_length", 0) + language_model_inputs.shape[1]
-
         inputs = {"inputs_embeds": inputs_embeds, "attention_mask": attention_mask}
         if not self.language_model.config.is_encoder_decoder:
             inputs["input_ids"] = input_ids
 
         outputs = self.language_model.generate(**inputs, **generate_kwargs)
->>>>>>> upstream/main
+
         return outputs
 
 
diff --git a/src/transformers/models/instructblip/modeling_instructblip.py b/src/transformers/models/instructblip/modeling_instructblip.py
index fd4522f4cbd752..9c0c604539e378 100644
--- a/src/transformers/models/instructblip/modeling_instructblip.py
+++ b/src/transformers/models/instructblip/modeling_instructblip.py
@@ -1582,45 +1582,13 @@ def generate(
 
         inputs_embeds = self.get_input_embeddings()(input_ids)
 
-<<<<<<< HEAD
         special_image_mask = (input_ids == self.config.image_token_index).unsqueeze(-1).expand_as(inputs_embeds)
         inputs_embeds[special_image_mask] = language_model_inputs.flatten()
 
-        outputs = self.language_model.generate(
-            inputs_embeds=inputs_embeds,
-            attention_mask=attention_mask,
-            **generate_kwargs,
-        )
-=======
-        # if the model already has "image_token_index" then the input is expanded to account for image embeds
-        # otherwise we expand manually by concatenating
-        if getattr(self.config, "image_token_index", None) is not None:
-            special_image_mask = (input_ids == self.config.image_token_index).unsqueeze(-1).expand_as(inputs_embeds)
-            inputs_embeds[special_image_mask] = language_model_inputs.flatten()
-        else:
-            logger.warning_once(
-                "Expanding inputs for image tokens in InstructBLIP should be done in processing. "
-                "Please follow instruction here (https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042) to update your InstructBLIP model. "
-                "Using processors without these attributes in the config is deprecated and will throw an error in v4.50."
-            )
-            inputs_embeds = torch.cat([language_model_inputs, inputs_embeds.to(language_model_inputs.device)], dim=1)
-            attention_mask = torch.cat(
-                [language_attention_mask, attention_mask.to(language_attention_mask.device)], dim=1
-            )
-
-            # add image_embeds length to max_length, so that the final max_length in counted only on token embeds
-            # -1 is to account for the prepended BOS after `generate.`
-            if not self.language_model.config.is_encoder_decoder:
-                generate_kwargs["max_length"] = (
-                    generate_kwargs.get("max_length", 20) + language_model_inputs.shape[1] - 1
-                )
-                generate_kwargs["min_length"] = generate_kwargs.get("min_length", 0) + language_model_inputs.shape[1]
-
         inputs = {"inputs_embeds": inputs_embeds, "attention_mask": attention_mask}
         if not self.language_model.config.is_encoder_decoder:
             inputs["input_ids"] = input_ids
 
         outputs = self.language_model.generate(**inputs, **generate_kwargs)
->>>>>>> upstream/main
 
         return outputs
diff --git a/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py b/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py
index e5275d63185415..f071ffab975e54 100644
--- a/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py
+++ b/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py
@@ -1620,10 +1620,10 @@ def generate(
         special_image_mask = (input_ids == self.config.video_token_index).unsqueeze(-1).expand_as(inputs_embeds)
         inputs_embeds[special_image_mask] = language_model_inputs.flatten()
 
-        outputs = self.language_model.generate(
-            inputs_embeds=inputs_embeds,
-            attention_mask=attention_mask,
-            **generate_kwargs,
-        )
+        inputs = {"inputs_embeds": inputs_embeds, "attention_mask": attention_mask}
+        if not self.language_model.config.is_encoder_decoder:
+            inputs["input_ids"] = input_ids
+
+        outputs = self.language_model.generate(**inputs, **generate_kwargs)
 
         return outputs
diff --git a/src/transformers/models/instructblipvideo/modular_instructblipvideo.py b/src/transformers/models/instructblipvideo/modular_instructblipvideo.py
index 57934e6ecc2f7d..5f74a582f847e5 100644
--- a/src/transformers/models/instructblipvideo/modular_instructblipvideo.py
+++ b/src/transformers/models/instructblipvideo/modular_instructblipvideo.py
@@ -433,10 +433,10 @@ def generate(
         special_image_mask = (input_ids == self.config.video_token_index).unsqueeze(-1).expand_as(inputs_embeds)
         inputs_embeds[special_image_mask] = language_model_inputs.flatten()
 
-        outputs = self.language_model.generate(
-            inputs_embeds=inputs_embeds,
-            attention_mask=attention_mask,
-            **generate_kwargs,
-        )
+        inputs = {"inputs_embeds": inputs_embeds, "attention_mask": attention_mask}
+        if not self.language_model.config.is_encoder_decoder:
+            inputs["input_ids"] = input_ids
+
+        outputs = self.language_model.generate(**inputs, **generate_kwargs)
 
         return outputs
diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py
index 34adc132f8829a..5b9dd15e7bbbff 100644
--- a/tests/generation/test_utils.py
+++ b/tests/generation/test_utils.py
@@ -450,6 +450,7 @@ def test_greedy_generate(self):
             if model.config.is_encoder_decoder:
                 self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + 1)
             else:
+                print(output_generate.shape[-1], self.max_new_tokens, inputs_dict["input_ids"].shape[-1])
                 self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + inputs_dict["input_ids"].shape[-1])
 
     @pytest.mark.generate

From a371498e25cab2b40f86826c27b265f7c2d23eb5 Mon Sep 17 00:00:00 2001
From: raushan <raushan@huggingface.co>
Date: Mon, 25 Nov 2024 10:39:02 +0100
Subject: [PATCH 10/15] fix video llava

---
 .../models/video_llava/modeling_video_llava.py            | 8 ++++----
 .../models/video_llava/processing_video_llava.py          | 2 +-
 tests/models/video_llava/test_modeling_video_llava.py     | 1 -
 3 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/src/transformers/models/video_llava/modeling_video_llava.py b/src/transformers/models/video_llava/modeling_video_llava.py
index ac7db5cf0b8b51..c7f2cea69b9adc 100644
--- a/src/transformers/models/video_llava/modeling_video_llava.py
+++ b/src/transformers/models/video_llava/modeling_video_llava.py
@@ -547,8 +547,8 @@ def forward(
                 vision_feature_layer=vision_feature_layer,
                 vision_feature_select_strategy=vision_feature_select_strategy,
             )
-            n_image_tokens = (input_ids == self.config.image_token_index).sum(dim=-1)[0].item()
-            n_image_features = image_features.shape[1]
+            n_image_tokens = (input_ids == self.config.image_token_index).sum().item()
+            n_image_features = image_features.shape[0] * image_features.shape[1]
             if n_image_tokens != n_image_features:
                 raise ValueError(
                     f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
@@ -567,8 +567,8 @@ def forward(
                 pixel_values_videos=pixel_values_videos, vision_feature_layer=vision_feature_layer
             )
 
-            n_video_tokens = (input_ids == self.config.video_token_index).sum(dim=-1)[0].item()
-            n_video_features = video_features.shape[1]
+            n_video_tokens = (input_ids == self.config.video_token_index).sum().item()
+            n_video_features = video_features.shape[0] * video_features.shape[1]
             if n_video_tokens != n_video_features:
                 raise ValueError(
                     f"Video features and video tokens do not match: tokens: {n_video_tokens}, features {n_video_features}"
diff --git a/src/transformers/models/video_llava/processing_video_llava.py b/src/transformers/models/video_llava/processing_video_llava.py
index a52ba90af20919..fb34b215748f87 100644
--- a/src/transformers/models/video_llava/processing_video_llava.py
+++ b/src/transformers/models/video_llava/processing_video_llava.py
@@ -77,7 +77,7 @@ def __init__(
         image_token="<image>",  # set the default and let users change if they have peculiar special tokens in rare cases
         video_token="<video>",
         chat_template=None,
-        num_additional_image_tokens=0,
+        num_additional_image_tokens=1,
         **kwargs,
     ):
         self.patch_size = patch_size
diff --git a/tests/models/video_llava/test_modeling_video_llava.py b/tests/models/video_llava/test_modeling_video_llava.py
index 2d6630c8ffca94..eb4b9e3ed083d7 100644
--- a/tests/models/video_llava/test_modeling_video_llava.py
+++ b/tests/models/video_llava/test_modeling_video_llava.py
@@ -127,7 +127,6 @@ def __init__(
         self.num_image_tokens = (vision_config["image_size"] // vision_config["patch_size"]) ** 2
         self.num_video_tokens = (self.num_image_tokens + 1) * self.num_frames
         self.seq_length = seq_length + self.num_image_tokens + self.num_video_tokens
-        self.encoder_seq_length = self.seq_length
 
     def get_config(self):
         return VideoLlavaConfig(

From 8b93a4a57edba35b3b407c814c66051e662119f9 Mon Sep 17 00:00:00 2001
From: raushan <raushan@huggingface.co>
Date: Mon, 25 Nov 2024 10:47:07 +0100
Subject: [PATCH 11/15] add image token index in blip tester

---
 tests/models/blip_2/test_modeling_blip_2.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/tests/models/blip_2/test_modeling_blip_2.py b/tests/models/blip_2/test_modeling_blip_2.py
index a141ef40be1959..37ef8645e51331 100644
--- a/tests/models/blip_2/test_modeling_blip_2.py
+++ b/tests/models/blip_2/test_modeling_blip_2.py
@@ -1004,7 +1004,14 @@ def get_config(self):
 # this model tester uses an encoder-decoder language model (T5)
 class Blip2ModelTester:
     def __init__(
-        self, parent, vision_kwargs=None, qformer_kwargs=None, text_kwargs=None, is_training=True, num_query_tokens=10
+        self,
+        parent,
+        vision_kwargs=None,
+        qformer_kwargs=None,
+        text_kwargs=None,
+        is_training=True,
+        num_query_tokens=10,
+        image_token_index=4,
     ):
         if vision_kwargs is None:
             vision_kwargs = {}
@@ -1021,6 +1028,7 @@ def __init__(
         self.seq_length = self.text_model_tester.seq_length  # need seq_length for common tests
         self.is_training = is_training
         self.num_query_tokens = num_query_tokens
+        self.image_token_index = image_token_index
 
     def prepare_config_and_inputs(self):
         _, pixel_values = self.vision_model_tester.prepare_config_and_inputs()
@@ -1043,6 +1051,7 @@ def get_config(self):
             qformer_config=self.qformer_model_tester.get_config(),
             text_config=self.text_model_tester.get_config(),
             num_query_tokens=self.num_query_tokens,
+            image_token_index=self.image_token_index,
         )
 
     def create_and_check_for_conditional_generation(

From c0107290e28c85c7184c4ebc1f3901f1f6da7565 Mon Sep 17 00:00:00 2001
From: raushan <raushan@huggingface.co>
Date: Mon, 25 Nov 2024 10:53:48 +0100
Subject: [PATCH 12/15] docstring style

---
 src/transformers/models/video_llava/processing_video_llava.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/models/video_llava/processing_video_llava.py b/src/transformers/models/video_llava/processing_video_llava.py
index fb34b215748f87..4ef19387036b3c 100644
--- a/src/transformers/models/video_llava/processing_video_llava.py
+++ b/src/transformers/models/video_llava/processing_video_llava.py
@@ -51,7 +51,7 @@ class VideoLlavaProcessor(ProcessorMixin):
             Special token used to denote video location.
         chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
             in a chat into a tokenizable string.
-        num_additional_image_tokens (`int`, *optional*, defaults to 0):
+        num_additional_image_tokens (`int`, *optional*, defaults to 1):
             Number of additional tokens added to the image embeddings, such as CLS (+1). If the backbone has no CLS or other
             extra tokens appended, no need to set this arg.
     """

From 4efcea3a760dd4c0efa95c9354dd194f1017fb0d Mon Sep 17 00:00:00 2001
From: raushan <raushan@huggingface.co>
Date: Mon, 25 Nov 2024 11:13:10 +0100
Subject: [PATCH 13/15] fix blip pipelines

---
 tests/utils/tiny_model_summary.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/utils/tiny_model_summary.json b/tests/utils/tiny_model_summary.json
index f27f720ec3d593..6c36448e5aba7e 100644
--- a/tests/utils/tiny_model_summary.json
+++ b/tests/utils/tiny_model_summary.json
@@ -626,7 +626,7 @@
         "model_classes": [
             "Blip2ForConditionalGeneration"
         ],
-        "sha": "35e1ef43da3554af62eb29a7b3dbbef3f3bef48e"
+        "sha": "d0de11fd1f8ca481231c07ee0934924be96cb281"
     },
     "Blip2Model": {
         "tokenizer_classes": [

From 50423d76566fdcf3fca4b856f6423374c3a2c074 Mon Sep 17 00:00:00 2001
From: raushan <raushan@huggingface.co>
Date: Mon, 25 Nov 2024 11:18:53 +0100
Subject: [PATCH 14/15] fix image token count check in llava and vipllava

---
 src/transformers/models/llava/modeling_llava.py       | 4 ++--
 src/transformers/models/vipllava/modeling_vipllava.py | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/transformers/models/llava/modeling_llava.py b/src/transformers/models/llava/modeling_llava.py
index 626a7146e537fb..07b61321d0de83 100644
--- a/src/transformers/models/llava/modeling_llava.py
+++ b/src/transformers/models/llava/modeling_llava.py
@@ -471,8 +471,8 @@ def forward(
                 vision_feature_select_strategy=vision_feature_select_strategy,
             )
 
-            n_image_tokens = (input_ids == self.config.image_token_index).sum(dim=-1)[0].item()
-            n_image_features = image_features.shape[1]
+            n_image_tokens = (input_ids == self.config.image_token_index).sum().item()
+            n_image_features = image_features.shape[0] * image_features.shape[1]
             if n_image_tokens != n_image_features:
                 raise ValueError(
                     f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
diff --git a/src/transformers/models/vipllava/modeling_vipllava.py b/src/transformers/models/vipllava/modeling_vipllava.py
index 7c7205621eb02d..9b9054b6bb36a4 100644
--- a/src/transformers/models/vipllava/modeling_vipllava.py
+++ b/src/transformers/models/vipllava/modeling_vipllava.py
@@ -463,8 +463,8 @@ def forward(
                 pixel_values=pixel_values, vision_feature_layers=vision_feature_layers
             )
 
-            n_image_tokens = (input_ids == self.config.image_token_index).sum(dim=-1)[0].item()
-            n_image_features = image_features.shape[1]
+            n_image_tokens = (input_ids == self.config.image_token_index).sum().item()
+            n_image_features = image_features.shape[0] * image_features.shape[1]
             if n_image_tokens != n_image_features:
                 raise ValueError(
                     f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"

From bc768e2c721861697a3c73cd504f268ab7be86c1 Mon Sep 17 00:00:00 2001
From: raushan <raushan@huggingface.co>
Date: Mon, 2 Dec 2024 10:53:45 +0100
Subject: [PATCH 15/15] allow these attrs in VLMs to be unused

---
 utils/check_config_attributes.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/utils/check_config_attributes.py b/utils/check_config_attributes.py
index 9b8244c243fc4a..1087dcd003cf15 100644
--- a/utils/check_config_attributes.py
+++ b/utils/check_config_attributes.py
@@ -295,6 +295,9 @@ def check_attribute_being_used(config_class, attributes, default_value, source_s
         "unk_index",
         "mask_index",
         "image_token_index",  # for VLMs
+        "video_token_index",
+        "image_seq_length",
+        "video_seq_length",
         "image_size",
         "use_cache",
         "out_features",