From acf0885e738c3a41bf30642064f383152a4ccb5e Mon Sep 17 00:00:00 2001 From: raushan Date: Tue, 11 Jun 2024 15:03:24 +0200 Subject: [PATCH 01/16] squash in single commit --- docs/source/en/_toctree.yml | 2 + docs/source/en/index.md | 1 + docs/source/en/model_doc/instructblip.md | 1 + docs/source/en/model_doc/instructblipvideo.md | 59 + src/transformers/__init__.py | 28 + src/transformers/models/__init__.py | 1 + .../models/auto/configuration_auto.py | 2 + .../models/auto/image_processing_auto.py | 1 + .../models/auto/processing_auto.py | 1 + .../models/auto/tokenization_auto.py | 1 + .../models/instructblipvideo/__init__.py | 83 + .../configuration_instructblipvideo.py | 362 ++++ ...t_instructblipvideo_original_to_pytorch.py | 305 +++ .../diff_instructblipvideo.py | 399 ++++ .../image_processing_instructblipvideo.py | 358 ++++ .../modeling_instructblipvideo.py | 1648 +++++++++++++++++ .../processing_instructblipvideo.py | 170 ++ src/transformers/utils/dummy_pt_objects.py | 16 +- .../utils/dummy_vision_objects.py | 7 + tests/models/instructblipvideo/__init__.py | 0 .../test_modeling_instructblipvideo.py | 645 +++++++ utils/diff_model_converter.py | 39 +- 22 files changed, 4111 insertions(+), 18 deletions(-) create mode 100644 docs/source/en/model_doc/instructblipvideo.md create mode 100644 src/transformers/models/instructblipvideo/__init__.py create mode 100644 src/transformers/models/instructblipvideo/configuration_instructblipvideo.py create mode 100644 src/transformers/models/instructblipvideo/convert_instructblipvideo_original_to_pytorch.py create mode 100644 src/transformers/models/instructblipvideo/diff_instructblipvideo.py create mode 100644 src/transformers/models/instructblipvideo/image_processing_instructblipvideo.py create mode 100644 src/transformers/models/instructblipvideo/modeling_instructblipvideo.py create mode 100644 src/transformers/models/instructblipvideo/processing_instructblipvideo.py create mode 100644 tests/models/instructblipvideo/__init__.py create mode 100644 tests/models/instructblipvideo/test_modeling_instructblipvideo.py diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index be3001dc761a90..0963ff466ce529 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -774,6 +774,8 @@ title: Idefics2 - local: model_doc/instructblip title: InstructBLIP + - local: model_doc/instructblipvideo + title: InstructBlipVideo - local: model_doc/kosmos-2 title: KOSMOS-2 - local: model_doc/layoutlm diff --git a/docs/source/en/index.md b/docs/source/en/index.md index 72237d13839569..a3f1fce954ed5d 100644 --- a/docs/source/en/index.md +++ b/docs/source/en/index.md @@ -165,6 +165,7 @@ Flax), PyTorch, and/or TensorFlow. 
| [ImageGPT](model_doc/imagegpt) | ✅ | ❌ | ❌ | | [Informer](model_doc/informer) | ✅ | ❌ | ❌ | | [InstructBLIP](model_doc/instructblip) | ✅ | ❌ | ❌ | +| [InstructBlipVideo](model_doc/instructblipvideo) | ✅ | ❌ | ❌ | | [Jamba](model_doc/jamba) | ✅ | ❌ | ❌ | | [JetMoe](model_doc/jetmoe) | ✅ | ❌ | ❌ | | [Jukebox](model_doc/jukebox) | ✅ | ❌ | ❌ | diff --git a/docs/source/en/model_doc/instructblip.md b/docs/source/en/model_doc/instructblip.md index 1a693493fff153..b5fc634b621626 100644 --- a/docs/source/en/model_doc/instructblip.md +++ b/docs/source/en/model_doc/instructblip.md @@ -50,6 +50,7 @@ InstructBLIP uses the same architecture as [BLIP-2](blip2) with a tiny but impor [[autodoc]] InstructBlipProcessor + ## InstructBlipVisionModel [[autodoc]] InstructBlipVisionModel diff --git a/docs/source/en/model_doc/instructblipvideo.md b/docs/source/en/model_doc/instructblipvideo.md new file mode 100644 index 00000000000000..97ba23fc29cf7d --- /dev/null +++ b/docs/source/en/model_doc/instructblipvideo.md @@ -0,0 +1,59 @@ + + +# InstructBlipVideo + +## Overview + +The InstructBlipVideo model was proposed in []() by . + + +The abstract from the paper is the following: + +** + +Tips: + + + +This model was contributed by [INSERT YOUR HF USERNAME HERE](https://huggingface.co/). +The original code can be found [here](). + + +## InstructBlipVideoConfig + +[[autodoc]] InstructBlipVideoConfig + - from_vision_qformer_text_configs + +## InstructBlipVideoVisionConfig + +[[autodoc]] InstructBlipVideoVisionConfig + +## InstructBlipVideoQFormerConfig + +[[autodoc]] InstructBlipVideoQFormerConfig + +## InstructBlipVideoVisionModel + +[[autodoc]] InstructBlipVideoVisionModel + - forward + +## InstructBlipVideoQFormerModel + +[[autodoc]] InstructBlipVideoQFormerModel + - forward + +## InstructBlipVideoForConditionalGeneration + +[[autodoc]] InstructBlipVideoForConditionalGeneration + - forward + - generate \ No newline at end of file diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 973764da0b7a55..4a54ae3a487894 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -473,6 +473,12 @@ "InstructBlipQFormerConfig", "InstructBlipVisionConfig", ], + "models.instructblipvideo": [ + "InstructBlipVideoConfig", + "InstructBlipVideoProcessor", + "InstructBlipVideoQFormerConfig", + "InstructBlipVideoVisionConfig", + ], "models.jamba": ["JambaConfig"], "models.jetmoe": ["JetMoeConfig"], "models.kosmos2": [ @@ -1135,6 +1141,7 @@ _import_structure["models.idefics"].extend(["IdeficsImageProcessor"]) _import_structure["models.idefics2"].extend(["Idefics2ImageProcessor"]) _import_structure["models.imagegpt"].extend(["ImageGPTFeatureExtractor", "ImageGPTImageProcessor"]) + _import_structure["models.instructblipvideo"].extend(["InstructBlipVideoImageProcessor"]) _import_structure["models.layoutlmv2"].extend(["LayoutLMv2FeatureExtractor", "LayoutLMv2ImageProcessor"]) _import_structure["models.layoutlmv3"].extend(["LayoutLMv3FeatureExtractor", "LayoutLMv3ImageProcessor"]) _import_structure["models.levit"].extend(["LevitFeatureExtractor", "LevitImageProcessor"]) @@ -2303,6 +2310,14 @@ "InstructBlipVisionModel", ] ) + _import_structure["models.instructblipvideo"].extend( + [ + "InstructBlipVideoForConditionalGeneration", + "InstructBlipVideoPreTrainedModel", + "InstructBlipVideoQFormerModel", + "InstructBlipVideoVisionModel", + ] + ) _import_structure["models.jamba"].extend( [ "JambaForCausalLM", @@ -5055,6 +5070,12 @@ InstructBlipQFormerConfig, InstructBlipVisionConfig, ) + from 
.models.instructblipvideo import ( + InstructBlipVideoConfig, + InstructBlipVideoProcessor, + InstructBlipVideoQFormerConfig, + InstructBlipVideoVisionConfig, + ) from .models.jamba import JambaConfig from .models.jetmoe import JetMoeConfig from .models.kosmos2 import ( @@ -5743,6 +5764,7 @@ from .models.idefics import IdeficsImageProcessor from .models.idefics2 import Idefics2ImageProcessor from .models.imagegpt import ImageGPTFeatureExtractor, ImageGPTImageProcessor + from .models.instructblipvideo import InstructBlipVideoImageProcessor from .models.layoutlmv2 import ( LayoutLMv2FeatureExtractor, LayoutLMv2ImageProcessor, @@ -6732,6 +6754,12 @@ InstructBlipQFormerModel, InstructBlipVisionModel, ) + from .models.instructblipvideo import ( + InstructBlipVideoForConditionalGeneration, + InstructBlipVideoPreTrainedModel, + InstructBlipVideoQFormerModel, + InstructBlipVideoVisionModel, + ) from .models.jamba import ( JambaForCausalLM, JambaForSequenceClassification, diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index 24b602f18c8f38..ade8ac703d9479 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -111,6 +111,7 @@ imagegpt, informer, instructblip, + instructblipvideo, jamba, jetmoe, kosmos2, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 40e282166ef99e..efca5cc8187a37 100755 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -128,6 +128,7 @@ ("imagegpt", "ImageGPTConfig"), ("informer", "InformerConfig"), ("instructblip", "InstructBlipConfig"), + ("instructblipvideo", "InstructBlipVideoConfig"), ("jamba", "JambaConfig"), ("jetmoe", "JetMoeConfig"), ("jukebox", "JukeboxConfig"), @@ -402,6 +403,7 @@ ("imagegpt", "ImageGPT"), ("informer", "Informer"), ("instructblip", "InstructBLIP"), + ("instructblipvideo", "InstructBlipVideoVideo"), ("jamba", "Jamba"), ("jetmoe", "JetMoe"), ("jukebox", "Jukebox"), diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py index eb21b58e20f14e..1ef9e84001f14a 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -75,6 +75,7 @@ ("idefics2", "Idefics2ImageProcessor"), ("imagegpt", "ImageGPTImageProcessor"), ("instructblip", "BlipImageProcessor"), + ("instructblipvideo", "InstructBlipVideoImageProcessor"), ("kosmos-2", "CLIPImageProcessor"), ("layoutlmv2", "LayoutLMv2ImageProcessor"), ("layoutlmv3", "LayoutLMv3ImageProcessor"), diff --git a/src/transformers/models/auto/processing_auto.py b/src/transformers/models/auto/processing_auto.py index 4a8295cc830419..dbe244ca17768b 100644 --- a/src/transformers/models/auto/processing_auto.py +++ b/src/transformers/models/auto/processing_auto.py @@ -64,6 +64,7 @@ ("idefics", "IdeficsProcessor"), ("idefics2", "Idefics2Processor"), ("instructblip", "InstructBlipProcessor"), + ("instructblipvideo", "InstructBlipVideoProcessor"), ("kosmos-2", "Kosmos2Processor"), ("layoutlmv2", "LayoutLMv2Processor"), ("layoutlmv3", "LayoutLMv3Processor"), diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index e99bc89205cbdf..267e08ea2588cf 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -205,6 +205,7 @@ ("idefics", (None, "LlamaTokenizerFast" if 
is_tokenizers_available() else None)), ("idefics2", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), ("instructblip", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)), + ("instructblipvideo", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)), ( "jamba", ( diff --git a/src/transformers/models/instructblipvideo/__init__.py b/src/transformers/models/instructblipvideo/__init__.py new file mode 100644 index 00000000000000..18d20d0401501a --- /dev/null +++ b/src/transformers/models/instructblipvideo/__init__.py @@ -0,0 +1,83 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available + + +_import_structure = { + "configuration_instructblipvideo": [ + "InstructBlipVideoConfig", + "InstructBlipVideoQFormerConfig", + "InstructBlipVideoVisionConfig", + ], + "processing_instructblipvideo": ["InstructBlipVideoProcessor"], +} + + +try: + if not is_vision_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["image_processing_instructblipvideo"] = ["InstructBlipVideoImageProcessor"] + + +try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["modeling_instructblipvideo"] = [ + "InstructBlipVideoQFormerModel", + "InstructBlipVideoPreTrainedModel", + "InstructBlipVideoForConditionalGeneration", + "InstructBlipVideoVisionModel", + ] + +if TYPE_CHECKING: + from .configuration_instructblipvideo import ( + InstructBlipVideoConfig, + InstructBlipVideoQFormerConfig, + InstructBlipVideoVisionConfig, + ) + from .processing_instructblipvideo import InstructBlipVideoProcessor + + try: + if not is_vision_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .image_processing_instructblipvideo import InstructBlipVideoImageProcessor + + try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .modeling_instructblipvideo import ( + InstructBlipVideoForConditionalGeneration, + InstructBlipVideoPreTrainedModel, + InstructBlipVideoQFormerModel, + InstructBlipVideoVisionModel, + ) + +else: + import sys + + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) diff --git a/src/transformers/models/instructblipvideo/configuration_instructblipvideo.py b/src/transformers/models/instructblipvideo/configuration_instructblipvideo.py new file mode 100644 index 00000000000000..a57f5ff1706b02 --- /dev/null +++ b/src/transformers/models/instructblipvideo/configuration_instructblipvideo.py @@ -0,0 +1,362 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically 
generated from . +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the diff. If any change should be done, please apply the change to the +# diff.py file directly. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# coding=utf-8 +# Copyright 2024 HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +from typing import Union + +from ...configuration_utils import PretrainedConfig +from ...models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES +from ...utils import logging +from ..auto import CONFIG_MAPPING + + +logger = logging.get_logger(__name__) + + +class InstructBlipVideoVisionConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`InstructBlipVideoVisionModel`]. It is used to + instantiate a Instructblipvideo vision encoder according to the specified arguments, defining the model architecture. + Instantiating a configuration defaults will yield a similar configuration to that of the Instructblipvideo + [Salesforce/instruct-blip-flan-t5](https://huggingface.co/Salesforce/instruct-blip-flan-t5) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + hidden_size (`int`, *optional*, defaults to 1408): + Dimensionality of the encoder layers and the pooler layer. + intermediate_size (`int`, *optional*, defaults to 6144): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + num_hidden_layers (`int`, *optional*, defaults to 39): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 16): + Number of attention heads for each attention layer in the Transformer encoder. + image_size (`int`, *optional*, defaults to 224): + The size (resolution) of each image. + patch_size (`int`, *optional*, defaults to 14): + The size (resolution) of each patch. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` ``"gelu"` are supported. to 1e-5): The epsilon used by the layer + normalization layers. + layer_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the layer normalization layers. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + initializer_range (`float`, *optional*, defaults to 1e-10): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + qkv_bias (`bool`, *optional*, defaults to `True`): + Whether to add a bias to the queries and values in the self-attention layers. 
+ + Example: + + ```python + >>> from transformers import InstructBlipVideoVisionConfig, InstructBlipVideoVisionModel + + >>> # Initializing a InstructBlipVideoVisionConfig with Salesforce/instruct-blip-flan-t5 style configuration + >>> configuration = InstructBlipVideoVisionConfig() + + >>> # Initializing a InstructBlipVideoVisionModel (with random weights) from the Salesforce/instruct-blip-flan-t5 style configuration + >>> model = InstructBlipVideoVisionModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "instructblipvideo_vision_model" + + def __init__( + self, + hidden_size=1408, + intermediate_size=6144, + num_hidden_layers=39, + num_attention_heads=16, + image_size=224, + patch_size=14, + hidden_act="gelu", + layer_norm_eps=1e-6, + attention_dropout=0.0, + initializer_range=1e-10, + qkv_bias=True, + **kwargs, + ): + super().__init__(**kwargs) + + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.patch_size = patch_size + self.image_size = image_size + self.initializer_range = initializer_range + self.attention_dropout = attention_dropout + self.layer_norm_eps = layer_norm_eps + self.hidden_act = hidden_act + self.qkv_bias = qkv_bias + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": + cls._set_token_in_kwargs(kwargs) + + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) + + # get the vision config dict if we are loading from InstructBlipVideoConfig + if config_dict.get("model_type") == "instructblipvideo": + config_dict = config_dict["vision_config"] + + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: + logger.warning( + f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." + ) + + return cls.from_dict(config_dict, **kwargs) + + +class InstructBlipVideoQFormerConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`InstructBlipVideoQFormerModel`]. It is used to + instantiate a Instructblipvideo Querying Transformer (Q-Former) model according to the specified arguments, defining the + model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of + the Instructblipvideo [Salesforce/instruct-blip-flan-t5](https://huggingface.co/Salesforce/instruct-blip-flan-t5) + architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. + Read the documentation from [`PretrainedConfig`] for more information. + + Note that [`InstructBlipVideoQFormerModel`] is very similar to [`BertLMHeadModel`] with interleaved cross-attention. + + Args: + vocab_size (`int`, *optional*, defaults to 30522): + Vocabulary size of the Q-Former model. Defines the number of different tokens that can be represented by + the `inputs_ids` passed when calling the model. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. 
+ num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (`int`, *optional*, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + pad_token_id (``, *optional*, defaults to 0): + Token id used for padding sequences. + position_embedding_type (`str`, *optional*, defaults to `"absolute"`): + Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For + positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to + [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155). + For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models + with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658). + cross_attention_frequency (`int`, *optional*, defaults to 2): + The frequency of adding cross-attention to the Transformer layers. + encoder_hidden_size (`int`, *optional*, defaults to 1408): + The hidden size of the hidden states for cross-attention. 
+ + Examples: + + ```python + >>> from transformers import InstructBlipVideoQFormerConfig, InstructBlipVideoQFormerModel + + >>> # Initializing a Instructblipvideo Salesforce/instruct-blip-flan-t5 style configuration + >>> configuration = InstructBlipVideoQFormerConfig() + + >>> # Initializing a model (with random weights) from the Salesforce/instruct-blip-flan-t5 style configuration + >>> model = InstructBlipVideoQFormerModel(configuration) + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "instructblipvideo_qformer" + + def __init__( + self, + vocab_size=30522, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + initializer_range=0.02, + layer_norm_eps=1e-12, + pad_token_id=0, + position_embedding_type="absolute", + cross_attention_frequency=2, + encoder_hidden_size=1408, + **kwargs, + ): + super().__init__(pad_token_id=pad_token_id, **kwargs) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.position_embedding_type = position_embedding_type + self.cross_attention_frequency = cross_attention_frequency + self.encoder_hidden_size = encoder_hidden_size + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": + cls._set_token_in_kwargs(kwargs) + + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) + + # get the qformer config dict if we are loading from InstructBlipVideoConfig + if config_dict.get("model_type") == "instructblipvideo": + config_dict = config_dict["qformer_config"] + + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: + logger.warning( + f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." + ) + + return cls.from_dict(config_dict, **kwargs) + + +class InstructBlipVideoConfig(PretrainedConfig): + r""" + [`InstructBlipVideoConfig`] is the configuration class to store the configuration of a + [`InstructBlipVideoForConditionalGeneration`]. It is used to instantiate a Instructblipvideo model according to the specified + arguments, defining the vision model, Q-Former model and language model configs. Instantiating a configuration with + the defaults will yield a similar configuration to that of the Instructblipvideo + [Salesforce/instruct-blip-flan-t5](https://huggingface.co/Salesforce/instruct-blip-flan-t5) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + vision_config (`dict`, *optional*): + Dictionary of configuration options used to initialize [`InstructBlipVideoVisionConfig`]. 
+ qformer_config (`dict`, *optional*): + Dictionary of configuration options used to initialize [`InstructBlipVideoQFormerConfig`]. + text_config (`dict`, *optional*): + Dictionary of configuration options used to initialize any [`PretrainedConfig`]. + num_query_tokens (`int`, *optional*, defaults to 32): + The number of query tokens passed through the Transformer. + + kwargs (*optional*): + Dictionary of keyword arguments. + + Example: + + ```python + >>> from transformers import ( + ... InstructBlipVideoVisionConfig, + ... InstructBlipVideoQFormerConfig, + ... OPTConfig, + ... InstructBlipVideoConfig, + ... InstructBlipVideoForConditionalGeneration, + ... ) + + >>> # Initializing a InstructBlipVideoConfig with Salesforce/instruct-blip-flan-t5 style configuration + >>> configuration = InstructBlipVideoConfig() + + >>> # Initializing a InstructBlipVideoForConditionalGeneration (with random weights) from the Salesforce/instruct-blip-flan-t5 style configuration + >>> model = InstructBlipVideoForConditionalGeneration(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + + >>> # We can also initialize a InstructBlipVideoConfig from a InstructBlipVideoVisionConfig, InstructBlipVideoQFormerConfig and any PretrainedConfig + + >>> # Initializing Instructblipvideo vision, Instructblipvideo Q-Former and language model configurations + >>> vision_config = InstructBlipVideoVisionConfig() + >>> qformer_config = InstructBlipVideoQFormerConfig() + >>> text_config = OPTConfig() + + >>> config = InstructBlipVideoConfig.from_text_vision_configs(vision_config, qformer_config, text_config) + ```""" + + model_type = "instructblipvideo" + + def __init__(self, vision_config=None, qformer_config=None, text_config=None, num_query_tokens=32, **kwargs): + super().__init__(**kwargs) + + if vision_config is None: + vision_config = {} + logger.info("vision_config is None. initializing the InstructBlipVideoVisionConfig with default values.") + + if qformer_config is None: + qformer_config = {} + logger.info("qformer_config is None. Initializing the InstructBlipVideoQFormerConfig with default values.") + + if text_config is None: + text_config = {} + logger.info("text_config is None. Initializing the text config with default values (`OPTConfig`).") + + self.vision_config = InstructBlipVideoVisionConfig(**vision_config) + self.qformer_config = InstructBlipVideoQFormerConfig(**qformer_config) + text_model_type = text_config["model_type"] if "model_type" in text_config else "opt" + self.text_config = CONFIG_MAPPING[text_model_type](**text_config) + + self.tie_word_embeddings = self.text_config.tie_word_embeddings + self.is_encoder_decoder = self.text_config.is_encoder_decoder + + self.num_query_tokens = num_query_tokens + self.qformer_config.encoder_hidden_size = self.vision_config.hidden_size + self.use_decoder_only_language_model = self.text_config.model_type in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES + self.initializer_factor = 1.0 + self.initializer_range = 0.02 + + @classmethod + def from_vision_qformer_text_configs( + cls, + vision_config: InstructBlipVideoVisionConfig, + qformer_config: InstructBlipVideoQFormerConfig, + text_config: PretrainedConfig, + **kwargs, + ): + r""" + Instantiate a [`InstructBlipVideoConfig`] (or a derived class) from a Instructblipvideo vision model, Q-Former and + language model configurations. 
+ + Returns: + [`InstructBlipVideoConfig`]: An instance of a configuration object + """ + + return cls( + vision_config=vision_config.to_dict(), + qformer_config=qformer_config.to_dict(), + text_config=text_config.to_dict(), + **kwargs, + ) diff --git a/src/transformers/models/instructblipvideo/convert_instructblipvideo_original_to_pytorch.py b/src/transformers/models/instructblipvideo/convert_instructblipvideo_original_to_pytorch.py new file mode 100644 index 00000000000000..9b3d508db6ffe6 --- /dev/null +++ b/src/transformers/models/instructblipvideo/convert_instructblipvideo_original_to_pytorch.py @@ -0,0 +1,305 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Convert InstructBlipVideo checkpoints from the original repository. + +URL: https://github.com/salesforce/LAVIS/tree/main/projects/instructblipvideo +""" + +import argparse + +import requests +import torch + +# pip3 install salesforce-lavis +# I'm actually installing a slightly modified version: pip3 install git+https://github.com/nielsrogge/LAVIS.git@fix_lavis_float32 (there's also the fix_lavis branch) +# also note: to convert Vicuna checkpoints, we had to include /home/niels/python_projects/checkpoints/FastChat/vicuna-7b in lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +# same for Vicuna-13b +from lavis.models import load_model_and_preprocess +from PIL import Image + +from transformers import ( + AutoTokenizer, + BlipImageProcessor, + InstructBlipProcessor, + InstructBlipVideoConfig, + InstructBlipVideoForConditionalGeneration, + InstructBlipVideoQFormerConfig, + InstructBlipVideoVisionConfig, + LlamaConfig, + LlamaTokenizerFast, + T5Config, + T5TokenizerFast, +) +from transformers.utils.constants import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD + + +def load_demo_image(): + url = "https://raw.githubusercontent.com/salesforce/LAVIS/main/docs/_static/Confusing-Pictures.jpg" + image = Image.open(requests.get(url, stream=True).raw).convert("RGB") + + return image + + +# here we list all keys to be renamed (original name on the left, our name on the right) +def create_rename_keys(config): + rename_keys = [] + # fmt: off + + # vision encoder + rename_keys.append(("visual_encoder.cls_token", "vision_model.embeddings.class_embedding")) + rename_keys.append(("visual_encoder.pos_embed", "vision_model.embeddings.position_embedding")) + rename_keys.append(("visual_encoder.patch_embed.proj.weight", "vision_model.embeddings.patch_embedding.weight")) + rename_keys.append(("visual_encoder.patch_embed.proj.bias", "vision_model.embeddings.patch_embedding.bias")) + rename_keys.append(("ln_vision.weight", "vision_model.post_layernorm.weight")) + rename_keys.append(("ln_vision.bias", "vision_model.post_layernorm.bias")) + + for i in range(config.vision_config.num_hidden_layers): + rename_keys.append((f"visual_encoder.blocks.{i}.norm1.weight", f"vision_model.encoder.layers.{i}.layer_norm1.weight")) + rename_keys.append((f"visual_encoder.blocks.{i}.norm1.bias", 
f"vision_model.encoder.layers.{i}.layer_norm1.bias")) + rename_keys.append((f"visual_encoder.blocks.{i}.norm2.weight", f"vision_model.encoder.layers.{i}.layer_norm2.weight")) + rename_keys.append((f"visual_encoder.blocks.{i}.norm2.bias", f"vision_model.encoder.layers.{i}.layer_norm2.bias")) + rename_keys.append((f"visual_encoder.blocks.{i}.attn.qkv.weight", f"vision_model.encoder.layers.{i}.self_attn.qkv.weight")) + rename_keys.append((f"visual_encoder.blocks.{i}.attn.proj.weight", f"vision_model.encoder.layers.{i}.self_attn.projection.weight",)) + rename_keys.append((f"visual_encoder.blocks.{i}.attn.proj.bias", f"vision_model.encoder.layers.{i}.self_attn.projection.bias")) + rename_keys.append((f"visual_encoder.blocks.{i}.mlp.fc1.weight", f"vision_model.encoder.layers.{i}.mlp.fc1.weight")) + rename_keys.append((f"visual_encoder.blocks.{i}.mlp.fc1.bias", f"vision_model.encoder.layers.{i}.mlp.fc1.bias")) + rename_keys.append((f"visual_encoder.blocks.{i}.mlp.fc2.weight", f"vision_model.encoder.layers.{i}.mlp.fc2.weight")) + rename_keys.append((f"visual_encoder.blocks.{i}.mlp.fc2.bias", f"vision_model.encoder.layers.{i}.mlp.fc2.bias")) + + # QFormer + rename_keys.append(("Qformer.bert.embeddings.LayerNorm.weight", "qformer.embeddings.layernorm.weight")) + rename_keys.append(("Qformer.bert.embeddings.LayerNorm.bias", "qformer.embeddings.layernorm.bias")) + + # fmt: on + return rename_keys + + +def rename_key(dct, old, new): + val = dct.pop(old) + dct[new] = val + + +def read_in_q_v_bias(state_dict, config): + for i in range(config.vision_config.num_hidden_layers): + # read in original q and v biases + q_bias = state_dict.pop(f"visual_encoder.blocks.{i}.attn.q_bias") + v_bias = state_dict.pop(f"visual_encoder.blocks.{i}.attn.v_bias") + + # next, set bias in the state dict + qkv_bias = torch.cat((q_bias, torch.zeros_like(v_bias, requires_grad=False), v_bias)) + state_dict[f"vision_model.encoder.layers.{i}.self_attn.qkv.bias"] = qkv_bias + + +def get_blip2_config(model_name): + image_size = 364 if "coco" in model_name else 224 + vision_config = InstructBlipVideoVisionConfig(image_size=image_size).to_dict() + + # make sure the models have proper bos_token_id and eos_token_id set (important for generation) + # seems like flan-T5 models don't have bos_token_id properly set? + if "t5-xl" in model_name: + text_config = T5Config.from_pretrained("google/flan-t5-xl", dense_act_fn="gelu", bos_token_id=1).to_dict() + elif "t5-xxl" in model_name: + text_config = T5Config.from_pretrained("google/flan-t5-xxl", dense_act_fn="gelu", bos_token_id=1).to_dict() + elif "vicuna-7b" in model_name: + text_config = LlamaConfig.from_pretrained("decapoda-research/llama-7b-hf", vocab_size=32001).to_dict() + elif "vicuna-13b" in model_name: + text_config = LlamaConfig.from_pretrained("decapoda-research/llama-13b-hf", vocab_size=32001).to_dict() + else: + raise ValueError("Model name not supported") + + # the authors add one special "[DEC]" token to the vocab of Q-Former, hence vocab size = 30522 + 1 + qformer_config = InstructBlipVideoQFormerConfig(vocab_size=30523).to_dict() + config = InstructBlipVideoConfig( + vision_config=vision_config, text_config=text_config, qformer_config=qformer_config + ) + + return config, image_size + + +@torch.no_grad() +def convert_blip2_checkpoint(model_name, pytorch_dump_folder_path=None, push_to_hub=False): + """ + Copy/paste/tweak model's weights to Transformers design. 
+ """ + qformer_tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased", truncation_side="left") + qformer_tokenizer.add_special_tokens({"bos_token": "[DEC]"}) + + if "t5" in model_name: + tokenizer = T5TokenizerFast.from_pretrained("google/flan-t5-xl", truncation_side="left") + elif "vicuna" in model_name: + # the following was used in the original implementation: + # tokenizer = LlamaTokenizer.from_pretrained("huggyllama/llama-7b", use_fast=False, truncation_side="left") + # tokenizer.add_special_tokens({"pad_token": "[PAD]"}) + # tokenizer.add_special_tokens({"bos_token": ""}) + # tokenizer.add_special_tokens({"eos_token": ""}) + # tokenizer.add_special_tokens({"unk_token": ""}) + tokenizer = LlamaTokenizerFast.from_pretrained( + "huggyllama/llama-7b", truncation_side="left", bos_token="", unk_token="" + ) + tokenizer.add_special_tokens({"pad_token": "[PAD]"}) + + config, image_size = get_blip2_config(model_name) + hf_model = InstructBlipVideoForConditionalGeneration(config).eval() + + model_name_to_original = { + "instructblipvideo-vicuna-7b": ("blip2_vicuna_instruct", "vicuna7b"), + "instructblipvideo-vicuna-13b": ("blip2_vicuna_instruct", "vicuna13b"), + "instructblipvideo-flan-t5-xl": ("blip2_t5_instruct", "flant5xl"), + "instructblipvideo-flan-t5-xxl": ("blip2_t5_instruct", "flant5xxl"), + } + + name, type = model_name_to_original[model_name] + + # load original model + print("Loading original model...") + hf_model_device = "cuda:1" if torch.cuda.is_available() else "cpu" + lavis_device = "cuda:2" if torch.cuda.is_available() else "cpu" + original_model, vis_processors, _ = load_model_and_preprocess( + name=name, model_type=type, is_eval=True, device=lavis_device + ) + original_model.eval() + print("Done!") + + # update state dict keys + state_dict = original_model.state_dict() + rename_keys = create_rename_keys(config) + for src, dest in rename_keys: + rename_key(state_dict, src, dest) + + # some keys can be renamed efficiently + for key, val in state_dict.copy().items(): + val = state_dict.pop(key) + if key.startswith("Qformer.bert"): + key = key.replace("Qformer.bert", "qformer") + if "attention.self" in key: + key = key.replace("self", "attention") + if "llm_proj" in key: + key = key.replace("llm_proj", "language_projection") + if "t5_proj" in key: + key = key.replace("t5_proj", "language_projection") + if key.startswith("llm_model"): + key = key.replace("llm_model", "language_model") + if key.startswith("t5"): + key = key.replace("t5", "language") + state_dict[key] = val + + # read in qv biases + read_in_q_v_bias(state_dict, config) + + # note: weights get loaded in torch.float32 by default + hf_model.load_state_dict(state_dict, strict=True) + + image = load_demo_image() + prompt = "What is unusual about this image?" 
+ + # create processor + image_processor = BlipImageProcessor( + size={"height": image_size, "width": image_size}, image_mean=OPENAI_CLIP_MEAN, image_std=OPENAI_CLIP_STD + ) + processor = InstructBlipProcessor( + image_processor=image_processor, + tokenizer=tokenizer, + qformer_tokenizer=qformer_tokenizer, + ) + inputs = processor(images=image, text=prompt, return_tensors="pt").to(hf_model_device) + + # make sure processor creates exact same pixel values + original_pixel_values = vis_processors["eval"](image).unsqueeze(0).to(lavis_device) + pixel_values = inputs.pixel_values + assert torch.allclose(original_pixel_values.to(pixel_values.device), pixel_values) + + original_model.to(lavis_device) + hf_model.to(hf_model_device) + with torch.no_grad(): + if "vicuna" in model_name: + original_logits = original_model({"image": original_pixel_values, "text_input": [prompt]}).logits + logits = hf_model(**inputs).logits + else: + original_logits = original_model( + {"image": original_pixel_values, "text_input": [prompt], "text_output": ["\n"]} + ).logits + label_input_ids = tokenizer("\n", return_tensors="pt").input_ids.to(hf_model_device) + labels = label_input_ids.masked_fill(label_input_ids == tokenizer.pad_token_id, -100) + logits = hf_model(**inputs, labels=labels).logits + + print("First values of original logits:", original_logits[0, :3, :3]) + print("First values of HF logits:", logits[0, :3, :3]) + + # assert values + assert original_logits.shape == logits.shape + atol = 1e-4 if "vicuna" in model_name else 1e-5 + assert torch.allclose(original_logits.to(logits.device), logits, atol=atol) + print("Looks ok!") + + print("Generating with original model...") + original_outputs = original_model.generate({"image": original_pixel_values, "prompt": prompt}, num_beams=5) + + # important: we need to cast the weights of the HF model to the appropriate type + print("Generating with HF model...") + outputs = hf_model.generate( + **inputs, + do_sample=False, + num_beams=5, + max_length=256, + min_length=1, + top_p=0.9, + repetition_penalty=1.5, + length_penalty=1.0, + temperature=1, + ) + if "vicuna" in model_name: + # convert output id 0 to 2 (eos_token_id) + # TODO add this in the generate method? 
+ outputs[outputs == 0] = 2 + print("Original generation:", original_outputs) + output_text = processor.batch_decode(outputs, skip_special_tokens=True) + output_text = [text.strip() for text in output_text] + print("HF generation:", output_text) + + if pytorch_dump_folder_path is not None: + processor.save_pretrained(pytorch_dump_folder_path) + hf_model.save_pretrained(pytorch_dump_folder_path) + + if push_to_hub: + processor.push_to_hub(f"Salesforce/{model_name}") + hf_model.push_to_hub(f"Salesforce/{model_name}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + choices = [ + "instructblipvideo-vicuna-7b", + "instructblipvideo-vicuna-13b", + "instructblipvideo-flan-t5-xl", + "instructblipvideo-flan-t5-xxl", + ] + parser.add_argument( + "--model_name", + default="instructblipvideo-flan-t5-xl", + choices=choices, + type=str, + help="Path to hf config.json of model to convert", + ) + parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.") + parser.add_argument( + "--push_to_hub", + action="store_true", + help="Whether to push the model and processor to the hub after converting", + ) + + args = parser.parse_args() + + convert_blip2_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub) diff --git a/src/transformers/models/instructblipvideo/diff_instructblipvideo.py b/src/transformers/models/instructblipvideo/diff_instructblipvideo.py new file mode 100644 index 00000000000000..68fc934d519099 --- /dev/null +++ b/src/transformers/models/instructblipvideo/diff_instructblipvideo.py @@ -0,0 +1,399 @@ +# coding=utf-8 +# Copyright 2024 HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +from dataclasses import dataclass +from typing import List, Optional, Tuple, Union + +import torch +import torch.utils.checkpoint +from torch.nn import CrossEntropyLoss + +from transformers.models.blip.image_processing_blip import BlipImageProcessor +from transformers.models.instructblip.configuration_instructblip import InstructBlipConfig, InstructBlipVisionConfig, InstructBlipQFormerConfig +from transformers.models.instructblip.modeling_instructblip import ( + InstructBlipAttention, + InstructBlipEncoder, + InstructBlipEncoderLayer, + InstructBlipForConditionalGeneration, + InstructBlipForConditionalGenerationModelOutput, + InstructBlipMLP, + InstructBlipPreTrainedModel, + InstructBlipQFormerAttention, + InstructBlipQFormerEmbeddings, + InstructBlipQFormerEncoder, + InstructBlipQFormerIntermediate, + InstructBlipQFormerLayer, + InstructBlipQFormerModel, + InstructBlipQFormerOutput, + InstructBlipQFormerSelfOutput, + InstructBlipVisionEmbeddings, + InstructBlipVisionModel, +) + +from ...models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES +from ...configuration_utils import PretrainedConfig +from ..auto import CONFIG_MAPPING +from ...utils import TensorType, logging + + +logger = logging.get_logger(__name__) + + +class InstructBlipVideoVisionConfig(InstructBlipVisionConfig): + pass + +class InstructBlipVideoQFormerConfig(InstructBlipQFormerConfig): + pass + +class InstructBlipVideoConfig(InstructBlipConfig): + pass + + +@dataclass +class InstructBlipVideoForConditionalGenerationModelOutput(InstructBlipForConditionalGenerationModelOutput): + pass + + +class InstructBlipVideoVisionEmbeddings(InstructBlipVisionEmbeddings): + pass + +class InstructBlipVideoAttention(InstructBlipAttention): + pass + +class InstructBlipVideoMLP(InstructBlipMLP): + pass + +class InstructBlipVideoEncoderLayer(InstructBlipEncoderLayer): + pass + +class InstructBlipVideoPreTrainedModel(InstructBlipPreTrainedModel): + pass + +class InstructBlipVideoEncoder(InstructBlipEncoder): + pass + +class InstructBlipVideoVisionModel(InstructBlipVisionModel): + pass + +class InstructBlipVideoQFormerSelfOutput(InstructBlipQFormerSelfOutput): + pass + +class InstructBlipVideoQFormerAttention(InstructBlipQFormerAttention): + pass + +class InstructBlipVideoQFormerIntermediate(InstructBlipQFormerIntermediate): + pass + +class InstructBlipVideoQFormerOutput(InstructBlipQFormerOutput): + pass + +class InstructBlipVideoQFormerLayer(InstructBlipQFormerLayer): + pass + +class InstructBlipVideoQFormerEncoder(InstructBlipQFormerEncoder): + pass + +class InstructBlipVideoQFormerEmbeddings(InstructBlipQFormerEmbeddings): + pass + +class InstructBlipVideoQFormerModel(InstructBlipQFormerModel): + pass + + +class InstructBlipVideoForConditionalGeneration(InstructBlipForConditionalGeneration): + def forward( + self, + pixel_values: torch.FloatTensor, + qformer_input_ids: torch.FloatTensor, + qformer_attention_mask: Optional[torch.LongTensor] = None, + input_ids: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.LongTensor] = None, + decoder_input_ids: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + labels: Optional[torch.LongTensor] = None, + return_dict: Optional[bool] = None, + interpolate_pos_encoding: bool = False, + ) -> Union[Tuple, InstructBlipVideoForConditionalGenerationModelOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, 
*optional*): + Labels for computing the language modeling loss. Indices should be in `[-100, 0, ..., config.vocab_size - + 1]`. All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ..., + config.vocab_size]` + + Returns: + + Examples: + + ```python + >>> from transformers import InstructBlipVideoProcessor, InstructBlipVideoForConditionalGeneration + >>> import torch + >>> from huggingface_hub import hf_hub_download + >>> from decord import VideoReader + + >>> model = InstructBlipVideoProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b", device_map="auto") + >>> processor = InstructBlipVideoForConditionalGeneration.from_pretrained("Salesforce/instructblip-vicuna-7b") + + >>> file_path = hf_hub_download( + repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset" + ) + + >>> vr = VideoReader(uri=file_path, height=224, width=224) + >>> start, end = 0, len(vr) + >>> indices = np.arange(start, end, vlen / 4).astype(int) + >>> frames = vr.get_batch(indices).asnumpy() + >>> video = list(frames) + >>> prompt = "What is happening in the video?" + >>> inputs = processor(videos=[video], text=prompt, return_tensors="pt").to(device) + + >>> outputs = model.generate( + ... **inputs, + ... do_sample=False, + ... num_beams=5, + ... max_length=256, + ... min_length=1, + ... top_p=0.9, + ... repetition_penalty=1.5, + ... length_penalty=1.0, + ... temperature=1, + ... ) + >>> generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip() + >>> print(generated_text) + "A person is eating a bowl of pasta, and they are using a fork to eat it. The person is sitting at a table, and the plate of pasta is on the table in front" + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # step 1: forward the images through the vision encoder, + # we process in a batched way, later unbatch it back (video has frames=4 always) + batch_size, frames, channel, height, width = pixel_values.shape + pixel_values = pixel_values.reshape(batch_size * frames, channel, height, width) + + vision_outputs = self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + interpolate_pos_encoding=interpolate_pos_encoding, + ) + image_embeds = vision_outputs[0] + + # step 2: forward the query tokens through the QFormer, using the image embeddings for cross-attention + image_attention_mask = torch.ones(image_embeds.size()[:-1], dtype=torch.long, device=image_embeds.device) + + # difference with BLIP-2 here: we also feed the instruction prompt to the Q-Former + query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1) + query_attention_mask = torch.ones(query_tokens.size()[:-1], dtype=torch.long, device=image_embeds.device) + + if qformer_attention_mask is None: + qformer_attention_mask = torch.ones_like(qformer_input_ids) + + qformer_input_ids = qformer_input_ids.repeat_interleave(frames, dim=0) + qformer_attention_mask = qformer_attention_mask.repeat_interleave(frames, dim=0) + qformer_attention_mask = torch.cat([query_attention_mask, qformer_attention_mask], dim=1) + query_outputs = self.qformer( + input_ids=qformer_input_ids, + attention_mask=qformer_attention_mask, + query_embeds=query_tokens, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) 
+ query_output = query_outputs[0][:, : query_tokens.size(1), :] + + # step 3: use the language model, conditioned on the query outputs and the prompt + language_model_inputs = self.language_projection(query_output) + + # unbatch inputs back, each video-frame gets `num_query_tokens` seq length + language_model_inputs = language_model_inputs.reshape(batch_size, self.config.num_query_tokens * frames, -1) + language_model_attention_mask = torch.ones( + language_model_inputs.size()[:-1], dtype=torch.long, device=language_model_inputs.device + ) + + inputs_embeds = self.language_model.get_input_embeddings()(input_ids) + inputs_embeds = torch.cat([language_model_inputs, inputs_embeds.to(language_model_inputs.device)], dim=1) + + if attention_mask is None: + attention_mask = torch.ones_like(input_ids) + attention_mask = torch.cat([language_model_attention_mask.to(attention_mask.device), attention_mask], dim=1) + + if self.config.use_decoder_only_language_model: + outputs = self.language_model( + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + logits = outputs.logits if return_dict else outputs[0] + loss = None + # we compute the loss here since we need to take into account the sequence length of the query embeds + if labels is not None: + labels = labels.to(logits.device) + logits = logits[:, -labels.size(1) :, :] + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous().to(logits.device) + + # Flatten the tokens + loss_fct = CrossEntropyLoss(reduction="mean") + + loss = loss_fct(shift_logits.view(-1, self.config.text_config.vocab_size), shift_labels.view(-1)) + else: + outputs = self.language_model( + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + labels=labels, + ) + loss = outputs.loss if return_dict else outputs[0] + logits = outputs.logits if return_dict else outputs[1] + + if not return_dict: + output = (logits, vision_outputs, query_outputs, outputs) + return ((loss,) + output) if loss is not None else output + + return InstructBlipVideoForConditionalGenerationModelOutput( + loss=loss, + logits=logits, + vision_outputs=vision_outputs, + qformer_outputs=query_outputs, + language_model_outputs=outputs, + ) + + @torch.no_grad() + def generate( + self, + pixel_values: torch.FloatTensor, + qformer_input_ids: Optional[torch.LongTensor] = None, + qformer_attention_mask: Optional[torch.LongTensor] = None, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.LongTensor] = None, + interpolate_pos_encoding: bool = False, + **generate_kwargs, + ) -> torch.LongTensor: + """ + Overrides `generate` function to be able to use the model as a conditional generator. + + Args: + pixel_values (`torch.FloatTensor` of shape (batch_size, num_channels, height, width) or + (batch_size, num_frames, num_channels, height, width)): Input images or videos to be processed. + qformer_input_ids (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*): + The sequence used as a prompt to be fed to the Q-Former module. 
+ qformer_attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*): + Mask to avoid performing attention on padding token indices. + input_ids (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*): + The sequence used as a prompt for the generation. + attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*): + Mask to avoid performing attention on padding token indices. + interpolate_pos_encoding (`bool`, *optional*, defaults to `False`): + Whether to interpolate the positional encoding of the image embeddings. + + Returns: + captions (list): A list of strings of length batch_size * num_captions. + """ + if hasattr(self, "hf_device_map"): + # preprocess for `accelerate` + self._preprocess_accelerate() + + # we process in a batched way, later unbatch it back (video has frames=4) + batch_size, frames, channel, height, width = pixel_values.shape + pixel_values = pixel_values.reshape(batch_size * frames, channel, height, width) + + image_embeds = self.vision_model( + pixel_values, + return_dict=True, + interpolate_pos_encoding=interpolate_pos_encoding, + ).last_hidden_state + image_attention_mask = torch.ones(image_embeds.size()[:-1], dtype=torch.long, device=image_embeds.device) + + query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1) + query_attention_mask = torch.ones(query_tokens.size()[:-1], dtype=torch.long, device=image_embeds.device) + if qformer_attention_mask is None: + qformer_attention_mask = torch.ones_like(qformer_input_ids) + + qformer_input_ids = qformer_input_ids.repeat_interleave(frames, dim=0) + qformer_attention_mask = qformer_attention_mask.repeat_interleave(frames, dim=0) + qformer_attention_mask = torch.cat([query_attention_mask, qformer_attention_mask], dim=1) + query_outputs = self.qformer( + input_ids=qformer_input_ids, + attention_mask=qformer_attention_mask, + query_embeds=query_tokens, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_attention_mask, + return_dict=True, + ) + query_output = query_outputs.last_hidden_state[:, : query_tokens.size(1), :] + + language_model_inputs = self.language_projection(query_output) + + # unbatch the embeddings back by moving frames to seq-len + language_model_inputs = language_model_inputs.reshape(batch_size, self.config.num_query_tokens * frames, -1) + language_attention_mask = torch.ones( + language_model_inputs.size()[:-1], dtype=torch.long, device=language_model_inputs.device + ) + + if input_ids is None: + input_ids = ( + torch.LongTensor([[self.config.text_config.bos_token_id]]) + .repeat(batch_size, 1) + .to(image_embeds.device) + ) + if attention_mask is None: + attention_mask = torch.ones_like(input_ids) + attention_mask = torch.cat([language_attention_mask, attention_mask.to(language_attention_mask.device)], dim=1) + + # concatenate query embeddings with prompt embeddings + inputs_embeds = self.get_input_embeddings()(input_ids) + inputs_embeds = torch.cat([language_model_inputs, inputs_embeds.to(language_model_inputs.device)], dim=1) + + # add image_embeds length to max_length, so that the final max_length in counted only on token embeds + # -1 is to account for the prepended BOS after `generate.` + if not self.language_model.config.is_encoder_decoder: + generate_kwargs["max_length"] = generate_kwargs.get("max_length", 20) + language_model_inputs.shape[1] - 1 + generate_kwargs["min_length"] = generate_kwargs.get("min_length", 0) + language_model_inputs.shape[1] + + outputs = self.language_model.generate( + 
+ inputs_embeds=inputs_embeds,
+ attention_mask=attention_mask,
+ **generate_kwargs,
+ )
+
+ # this is a temporary workaround to be consistent with other generation models and
+ # have BOS as the first token, even though under the hood we are calling LM with embeds
+ if not self.language_model.config.is_encoder_decoder:
+ # the InstructBLIP authors used inconsistent tokenizer/model files during training,
+ # with the tokenizer's bos token being set to `</s>` which has ID=2,
+ # whereas the model's text config has bos token id = 0
+ bos_token_id = (
+ 2
+ if self.config.text_config.architectures[0] == "LLaMAForCausalLM"
+ else self.config.text_config.bos_token_id
+ )
+ bos_tokens = torch.LongTensor([[bos_token_id]]).repeat(batch_size, 1).to(image_embeds.device)
+ if not isinstance(outputs, torch.Tensor):
+ outputs.sequences = torch.cat([bos_tokens, outputs.sequences], dim=-1)
+ else:
+ outputs = torch.cat([bos_tokens, outputs], dim=-1)
+
+ return outputs
+
diff --git a/src/transformers/models/instructblipvideo/image_processing_instructblipvideo.py b/src/transformers/models/instructblipvideo/image_processing_instructblipvideo.py
new file mode 100644
index 00000000000000..f42abc66a4f1f6
--- /dev/null
+++ b/src/transformers/models/instructblipvideo/image_processing_instructblipvideo.py
@@ -0,0 +1,358 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Image processor class for InstructBLIPVideo. Largely a copy of Blip2Processor with the addition of video processing abilities.
+"""
+
+from typing import Dict, List, Optional, Union
+
+import numpy as np
+
+from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
+from ...image_transforms import convert_to_rgb, resize, to_channel_dimension_format
+from ...image_utils import (
+ OPENAI_CLIP_MEAN,
+ OPENAI_CLIP_STD,
+ ChannelDimension,
+ ImageInput,
+ PILImageResampling,
+ VideoInput,
+ infer_channel_dimension_format,
+ is_scaled_image,
+ is_valid_image,
+ to_numpy_array,
+ valid_images,
+ validate_kwargs,
+ validate_preprocess_arguments,
+)
+from ...utils import TensorType, is_vision_available, logging
+
+
+if is_vision_available():
+ import PIL
+
+
+logger = logging.get_logger(__name__)
+
+
+def make_batched_videos(videos) -> List[VideoInput]:
+ if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]):
+ return videos
+
+ elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]) and len(videos[0].shape) == 4:
+ return [list(video) for video in videos]
+
+ elif is_valid_image(videos) and len(videos.shape) == 4:
+ return [list(videos)]
+
+ raise ValueError(f"Could not make batched video from {videos}")
+
+
+# Copied from transformers.models.blip.image_processing_blip.BlipImageProcessor with Blip->InstructBlipVideo, BLIP->InstructBLIPVideo
+class InstructBlipVideoImageProcessor(BaseImageProcessor):
+ r"""
+ Constructs an InstructBLIPVideo image processor.
+
+ Args:
+ do_resize (`bool`, *optional*, defaults to `True`):
+ Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by the
+ `do_resize` parameter in the `preprocess` method.
+ size (`dict`, *optional*, defaults to `{"height": 384, "width": 384}`):
+ Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess`
+ method.
+ resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
+ Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be
+ overridden by the `resample` parameter in the `preprocess` method.
+ do_rescale (`bool`, *optional*, defaults to `True`):
+ Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the
+ `do_rescale` parameter in the `preprocess` method.
+ rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
+ Scale factor to use if rescaling the image. Only has an effect if `do_rescale` is set to `True`. Can be
+ overridden by the `rescale_factor` parameter in the `preprocess` method.
+ do_normalize (`bool`, *optional*, defaults to `True`):
+ Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
+ method.
+ image_mean (`float` or `List[float]`, *optional*, defaults to `OPENAI_CLIP_MEAN`):
+ Mean to use if normalizing the image. This is a float or list of floats the length of the number of
+ channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
+ image_std (`float` or `List[float]`, *optional*, defaults to `OPENAI_CLIP_STD`):
+ Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
+ number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
+ do_convert_rgb (`bool`, *optional*, defaults to `True`):
+ Whether to convert the image to RGB.
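+
+ Example (a minimal usage sketch with a random 4-frame clip; the printed shape assumes the default
+ 384x384 size and `return_tensors="pt"`):
+
+ ```python
+ >>> import numpy as np
+ >>> from transformers import InstructBlipVideoImageProcessor
+
+ >>> image_processor = InstructBlipVideoImageProcessor()
+ >>> # a single video given as an array of shape (num_frames, height, width, num_channels)
+ >>> video = np.random.randint(0, 256, (4, 256, 256, 3), dtype=np.uint8)
+ >>> inputs = image_processor(images=video, return_tensors="pt")
+ >>> list(inputs["pixel_values"].shape)  # (batch_size, num_frames, num_channels, height, width)
+ [1, 4, 3, 384, 384]
+ ```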
+ """ + + model_input_names = ["pixel_values"] + + def __init__( + self, + do_resize: bool = True, + size: Dict[str, int] = None, + resample: PILImageResampling = PILImageResampling.BICUBIC, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, + do_normalize: bool = True, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_convert_rgb: bool = True, + **kwargs, + ) -> None: + super().__init__(**kwargs) + size = size if size is not None else {"height": 384, "width": 384} + size = get_size_dict(size, default_to_square=True) + + self.do_resize = do_resize + self.size = size + self.resample = resample + self.do_rescale = do_rescale + self.rescale_factor = rescale_factor + self.do_normalize = do_normalize + self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN + self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD + self.do_convert_rgb = do_convert_rgb + self._valid_processor_keys = [ + "images", + "do_resize", + "size", + "resample", + "do_rescale", + "rescale_factor", + "do_normalize", + "image_mean", + "image_std", + "do_convert_rgb", + "return_tensors", + "data_format", + "input_data_format", + ] + + # Copied from transformers.models.vit.image_processing_vit.ViTImageProcessor.resize with PILImageResampling.BILINEAR->PILImageResampling.BICUBIC + def resize( + self, + image: np.ndarray, + size: Dict[str, int], + resample: PILImageResampling = PILImageResampling.BICUBIC, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> np.ndarray: + """ + Resize an image to `(size["height"], size["width"])`. + + Args: + image (`np.ndarray`): + Image to resize. + size (`Dict[str, int]`): + Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`): + `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BICUBIC`. + data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the output image. If unset, the channel dimension format of the input + image is used. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + + Returns: + `np.ndarray`: The resized image. + """ + size = get_size_dict(size) + if "height" not in size or "width" not in size: + raise ValueError(f"The `size` dictionary must contain the keys `height` and `width`. 
Got {size.keys()}")
+ output_size = (size["height"], size["width"])
+ return resize(
+ image,
+ size=output_size,
+ resample=resample,
+ data_format=data_format,
+ input_data_format=input_data_format,
+ **kwargs,
+ )
+
+ # Ignore copy
+ def preprocess(
+ self,
+ images: VideoInput = None,
+ do_resize: Optional[bool] = None,
+ size: Optional[Dict[str, int]] = None,
+ resample: PILImageResampling = None,
+ do_rescale: Optional[bool] = None,
+ rescale_factor: Optional[float] = None,
+ do_normalize: Optional[bool] = None,
+ image_mean: Optional[Union[float, List[float]]] = None,
+ image_std: Optional[Union[float, List[float]]] = None,
+ return_tensors: Optional[Union[str, TensorType]] = None,
+ do_convert_rgb: bool = None,
+ data_format: ChannelDimension = ChannelDimension.FIRST,
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
+ **kwargs,
+ ) -> BatchFeature:
+ """
+ Preprocess a video or batch of videos.
+
+ Args:
+ images (`VideoInput`):
+ Video frames to preprocess. Expects a single video or a batch of videos, each as a list of frames with
+ pixel values ranging from 0 to 255. If passing in a video with pixel values between 0 and 1, set
+ `do_rescale=False`.
+ do_resize (`bool`, *optional*, defaults to `self.do_resize`):
+ Whether to resize the video.
+ size (`Dict[str, int]`, *optional*, defaults to `self.size`):
+ Controls the size of the video frames after `resize`. Each frame is resized to
+ `(size["height"], size["width"])`.
+ resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
+ Resampling filter to use if resizing the video. Only has an effect if `do_resize` is set to `True`.
+ do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
+ Whether to rescale the video values to the `[0, 1]` range.
+ rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
+ Rescale factor to rescale the video by if `do_rescale` is set to `True`.
+ do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
+ Whether to normalize the video.
+ image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
+ Image mean to normalize the video by if `do_normalize` is set to `True`.
+ image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
+ Image standard deviation to normalize the video by if `do_normalize` is set to `True`.
+ do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
+ Whether to convert the image to RGB.
+ return_tensors (`str` or `TensorType`, *optional*):
+ The type of tensors to return. Can be one of:
+ - Unset: Return a list of `np.ndarray`.
+ - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
+ - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
+ - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
+ - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
+ data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
+ The channel dimension format for the output image. Can be one of:
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+ - Unset: Use the channel dimension format of the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + """ + do_resize = do_resize if do_resize is not None else self.do_resize + resample = resample if resample is not None else self.resample + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb + + size = size if size is not None else self.size + size = get_size_dict(size, default_to_square=False) + + videos = make_batched_videos(images) + validate_kwargs(captured_kwargs=kwargs.keys(), valid_processor_keys=self._valid_processor_keys) + + validate_preprocess_arguments( + do_rescale=do_rescale, + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + do_resize=do_resize, + size=size, + resample=resample, + ) + + if not valid_images(videos): + raise ValueError( + "Invalid input type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." + ) + + pixel_values = [ + [ + self._preprocess_image( + image=frame, + do_resize=do_resize, + size=size, + resample=resample, + do_rescale=do_rescale, + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + do_convert_rgb=do_convert_rgb, + data_format=data_format, + input_data_format=input_data_format, + ) + for frame in video + ] + for video in videos + ] + + encoded_outputs = BatchFeature(data={"pixel_values": pixel_values}, tensor_type=return_tensors) + return encoded_outputs + + # Ignore copy + def _preprocess_image( + self, + image: ImageInput = None, + do_resize: Optional[bool] = None, + size: Optional[Dict[str, int]] = None, + resample: PILImageResampling = None, + do_rescale: Optional[bool] = None, + rescale_factor: Optional[float] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_convert_rgb: bool = None, + data_format: ChannelDimension = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> np.ndarray: + # PIL RGBA images are converted to RGB + if do_convert_rgb: + image = convert_to_rgb(image) + + # All transformations expect numpy arrays. + image = to_numpy_array(image) + + if is_scaled_image(image) and do_rescale: + logger.warning_once( + "It looks like you are trying to rescale already rescaled video frames. If the input" + " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." + ) + + if input_data_format is None: + # We assume that all images have the same channel dimension format. 
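+ # For example, uint8 frames of shape (height, width, 3) are inferred as channels_last here and are
+ # converted to the requested `data_format` (channels_first by default) at the end of this method.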
+ input_data_format = infer_channel_dimension_format(image) + + if do_resize: + image = self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format) + + if do_rescale: + image = self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format) + + if do_normalize: + image = self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format) + + image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) + + return image diff --git a/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py b/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py new file mode 100644 index 00000000000000..08a3a1b03cbd1a --- /dev/null +++ b/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py @@ -0,0 +1,1648 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from . +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the diff. If any change should be done, please apply the change to the +# diff.py file directly. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# coding=utf-8 +# Copyright 2024 HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import math +from dataclasses import dataclass +from typing import Any, Optional, Tuple, Union + +import torch +import torch.utils.checkpoint +from torch import nn +from torch.nn import CrossEntropyLoss + +from ...activations import ACT2FN +from ...modeling_outputs import ( + BaseModelOutput, + BaseModelOutputWithPastAndCrossAttentions, + BaseModelOutputWithPooling, + BaseModelOutputWithPoolingAndCrossAttentions, +) +from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer +from ...utils import ( + ModelOutput, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, +) +from ..auto import AutoModelForCausalLM, AutoModelForSeq2SeqLM +from .configuration_instructblipvideo import ( + InstructBlipVideoConfig, + InstructBlipVideoQFormerConfig, + InstructBlipVideoVisionConfig, +) + + +logger = logging.get_logger(__name__) + + +@dataclass +# Copied from transformers.models.blip_2.modeling_blip_2.Blip2ForConditionalGenerationModelOutput with Blip2->InstructBlipVideo +class InstructBlipVideoForConditionalGenerationModelOutput(ModelOutput): + """ + Class defining the outputs of [`InstructBlipVideoForConditionalGeneration`]. + + Args: + loss (`torch.FloatTensor`, *optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`): + Language modeling loss from the language model. + logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head of the language model. + vision_outputs (`BaseModelOutputWithPooling`): + Outputs of the vision encoder. 
+ qformer_outputs (`BaseModelOutputWithPoolingAndCrossAttentions`): + Outputs of the Q-Former (Querying Transformer). + language_model_outputs (`CausalLMOutputWithPast` or `Seq2SeqLMOutput`): + Outputs of the language model. + """ + + loss: Optional[Tuple[torch.FloatTensor]] = None + logits: Optional[Tuple[torch.FloatTensor]] = None + vision_outputs: Optional[torch.FloatTensor] = None + qformer_outputs: Optional[Tuple[torch.FloatTensor]] = None + language_model_outputs: Optional[Tuple[torch.FloatTensor]] = None + + def to_tuple(self) -> Tuple[Any]: + return tuple( + self[k] + if k not in ["vision_outputs", "qformer_outputs", "language_model_outputs"] + else getattr(self, k).to_tuple() + for k in self.keys() + ) + + +# Copied from transformers.models.blip.modeling_blip.BlipVisionEmbeddings with Blip->InstructBlipVideo +class InstructBlipVideoVisionEmbeddings(nn.Module): + def __init__(self, config: InstructBlipVideoVisionConfig): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.image_size = config.image_size + self.patch_size = config.patch_size + + self.class_embedding = nn.Parameter(torch.randn(1, 1, self.embed_dim)) + + self.patch_embedding = nn.Conv2d( + in_channels=3, out_channels=self.embed_dim, kernel_size=self.patch_size, stride=self.patch_size + ) + + self.num_patches = (self.image_size // self.patch_size) ** 2 + self.num_positions = self.num_patches + 1 + + self.position_embedding = nn.Parameter(torch.randn(1, self.num_positions, self.embed_dim)) + + def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor: + """ + This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher + resolution images. + + Source: + https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174 + """ + num_patches = embeddings.shape[1] - 1 + num_positions = self.position_embedding.shape[1] - 1 + + if num_patches == num_positions and height == width: + return self.position_embedding + + class_pos_embed = self.position_embedding[:, 0, :] + patch_pos_embed = self.position_embedding[:, 1:, :] + dim = embeddings.shape[-1] + h0 = height // self.config.patch_size + w0 = width // self.config.patch_size + # we add a small number to avoid floating point error in the interpolation + # see discussion at https://github.com/facebookresearch/dino/issues/8 + h0, w0 = h0 + 0.1, w0 + 0.1 + patch_pos_embed = patch_pos_embed.reshape(1, int(math.sqrt(num_positions)), int(math.sqrt(num_positions)), dim) + patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2) + patch_pos_embed = nn.functional.interpolate( + patch_pos_embed, + scale_factor=(h0 / math.sqrt(num_positions), w0 / math.sqrt(num_positions)), + mode="bicubic", + align_corners=False, + ) + patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) + return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1) + + def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding: bool = False) -> torch.Tensor: + batch_size, _, height, width = pixel_values.shape + target_dtype = self.patch_embedding.weight.dtype + patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype)) # shape = [*, width, grid, grid] + patch_embeds = patch_embeds.flatten(2).transpose(1, 2) + class_embeds = self.class_embedding.expand(batch_size, 1, -1).to(target_dtype) + embeddings = torch.cat([class_embeds, patch_embeds], dim=1) + if interpolate_pos_encoding: + 
position_embedding = self.interpolate_pos_encoding(embeddings, height, width) + else: + position_embedding = self.position_embedding + embeddings = embeddings + position_embedding[:, : embeddings.size(1), :].to(target_dtype) + return embeddings + + +# Copied from transformers.models.blip_2.modeling_blip_2.Blip2Attention with Blip2->InstructBlipVideo +class InstructBlipVideoAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.embed_dim // self.num_heads + if self.head_dim * self.num_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {self.num_heads})." + ) + self.scale = self.head_dim**-0.5 + self.dropout = nn.Dropout(config.attention_dropout) + + # small tweak here compared to CLIP, no bias here + self.qkv = nn.Linear(self.embed_dim, 3 * self.embed_dim, bias=False) + + if config.qkv_bias: + q_bias = nn.Parameter(torch.zeros(self.embed_dim)) + v_bias = nn.Parameter(torch.zeros(self.embed_dim)) + else: + q_bias = None + v_bias = None + + if q_bias is not None: + qkv_bias = torch.cat((q_bias, torch.zeros_like(v_bias, requires_grad=False), v_bias)) + self.qkv.bias = nn.Parameter(qkv_bias) + + self.projection = nn.Linear(self.embed_dim, self.embed_dim) + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def forward( + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + bsz, tgt_len, embed_dim = hidden_states.size() + + mixed_qkv = self.qkv(hidden_states) + + mixed_qkv = mixed_qkv.reshape(bsz, tgt_len, 3, self.num_heads, embed_dim // self.num_heads).permute( + 2, 0, 3, 1, 4 + ) + query_states, key_states, value_states = mixed_qkv[0], mixed_qkv[1], mixed_qkv[2] + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_states, key_states.transpose(-1, -2)) + + attention_scores = attention_scores * self.scale + + # Normalize the attention scores to probabilities. + attention_probs = nn.functional.softmax(attention_scores, dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
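+ # `attention_probs` has shape (batch_size, num_heads, seq_len, seq_len), where seq_len covers the class
+ # token plus the image patch tokens, so a dropped entry removes one key token for that query position.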
+ attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_states).permute(0, 2, 1, 3) + + new_context_layer_shape = context_layer.size()[:-2] + (self.embed_dim,) + context_layer = context_layer.reshape(new_context_layer_shape) + + output = self.projection(context_layer) + + outputs = (output, attention_probs) if output_attentions else (output, None) + + return outputs + + +# Copied from transformers.models.blip.modeling_blip.BlipMLP +class InstructBlipVideoMLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.activation_fn = ACT2FN[config.hidden_act] + self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size) + self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.fc1(hidden_states) + hidden_states = self.activation_fn(hidden_states) + hidden_states = self.fc2(hidden_states) + return hidden_states + + +# Copied from transformers.models.blip.modeling_blip.BlipEncoderLayer with Blip->InstructBlipVideo +class InstructBlipVideoEncoderLayer(nn.Module): + def __init__(self, config: InstructBlipVideoConfig): + super().__init__() + self.embed_dim = config.hidden_size + self.self_attn = InstructBlipVideoAttention(config) + self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) + self.mlp = InstructBlipVideoMLP(config) + self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.FloatTensor]: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + `(config.encoder_attention_heads,)`. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + """ + residual = hidden_states + + hidden_states = self.layer_norm1(hidden_states) + hidden_states, attn_weights = self.self_attn( + hidden_states=hidden_states, + head_mask=attention_mask, + output_attentions=output_attentions, + ) + hidden_states = hidden_states + residual + residual = hidden_states + hidden_states = self.layer_norm2(hidden_states) + hidden_states = self.mlp(hidden_states) + + hidden_states = hidden_states + residual + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +class InstructBlipVideoPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. 
+ """ + + config_class = InstructBlipVideoConfig + base_model_prefix = "blip" + supports_gradient_checkpointing = True + _no_split_modules = [ + "InstructBlipVideoQFormerEmbeddings", + "InstructBlipVideoAttention", + "InstructBlipVideoQFormerMultiHeadAttention", + "InstructBlipVideoQFormerSelfOutput", + ] + _keep_in_fp32_modules = [] + + # Copied from transformers.models.blip_2.modeling_blip_2.Blip2PreTrainedModel._init_weights with Blip2->InstructBlipVideo + def _init_weights(self, module): + """Initialize the weights""" + factor = self.config.initializer_range + if isinstance(module, nn.Conv2d) or isinstance(module, nn.Embedding) or isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=factor) + if hasattr(module, "bias") and module.bias is not None: + module.bias.data.zero_() + + if isinstance(module, InstructBlipVideoVisionEmbeddings): + if hasattr(self.config, "vision_config"): + factor = self.config.vision_config.initializer_range + nn.init.trunc_normal_(module.position_embedding, mean=0.0, std=factor) + nn.init.trunc_normal_(module.class_embedding, mean=0.0, std=factor) + + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + elif isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + + +INSTRUCTBLIPVIDEO_VISION_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using [`InstructBlipVideoProcessor`]. See + [`InstructBlipVideoProcessor.__call__`] for details. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + interpolate_pos_encoding (`bool`, *optional*, defaults to `False`): + Whether to interpolate the pre-trained position encodings. +""" + + +# Copied from transformers.models.blip.modeling_blip.BlipEncoder with Blip->InstructBlipVideo +class InstructBlipVideoEncoder(nn.Module): + """ + Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a + [`InstructBlipVideoEncoderLayer`]. + + Args: + config (`InstructBlipVideoConfig`): + The corresponding vision configuration for the `InstructBlipVideoEncoder`. + """ + + def __init__(self, config: InstructBlipVideoConfig): + super().__init__() + self.config = config + self.layers = nn.ModuleList([InstructBlipVideoEncoderLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + inputs_embeds, + attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutput]: + r""" + Args: + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Embedded representation of the inputs. Should be float, not int tokens. + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. 
Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + hidden_states = inputs_embeds + for idx, encoder_layer in enumerate(self.layers): + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + encoder_layer.__call__, + hidden_states, + attention_mask, + output_attentions, + ) + else: + layer_outputs = encoder_layer( + hidden_states, + attention_mask, + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + ) + + +INSTRUCTBLIPVIDEO_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`InstructBlipVideoConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +INSTRUCTBLIPVIDEO_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using [`InstructBlipVideoProcessor`]. See + [`InstructBlipVideoProcessor.__call__`] for details. + + qformer_input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of input sequence tokens in the vocabulary of the Q-Former. Input tokens can optionally be provided + to serve as text prompt, which the Q-Former model will encode. + + Indices can be obtained using [`InstructBlipVideoProcessor`]. See [`InstructBlipVideoProcessor.__call__`] for + details. 
+ + [What are input IDs?](../glossary#input-ids) + + qformer_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of input sequence tokens in the vocabulary of the language model. Input tokens can optionally be + provided to serve as text prompt, which the language model can continue. + + Indices can be obtained using [`InstructBlipVideoProcessor`]. See [`InstructBlipVideoProcessor.__call__`] for + details. + + [What are input IDs?](../glossary#input-ids) + + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + + decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*): + Indices of decoder input sequence tokens in the vocabulary of the language model. Only relevant in case an + encoder-decoder language model (like T5) is used. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. [What are decoder input IDs?](../glossary#decoder-input-ids) + + decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*): + Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also + be used by default. + + Only relevant in case an encoder-decoder language model (like T5) is used. + + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + interpolate_pos_encoding (`bool`, *optional*, defaults to `False`): + Whether to interpolate the pre-trained position encodings. 
+""" + + +# Copied from transformers.models.blip.modeling_blip.BlipVisionModel with Blip->InstructBlipVideo, BLIP->INSTRUCTBLIPVIDEO +class InstructBlipVideoVisionModel(InstructBlipVideoPreTrainedModel): + main_input_name = "pixel_values" + config_class = InstructBlipVideoVisionConfig + + def __init__(self, config: InstructBlipVideoVisionConfig): + super().__init__(config) + self.config = config + embed_dim = config.hidden_size + + self.embeddings = InstructBlipVideoVisionEmbeddings(config) + self.encoder = InstructBlipVideoEncoder(config) + self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) + + self.post_init() + + @add_start_docstrings_to_model_forward(INSTRUCTBLIPVIDEO_VISION_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=InstructBlipVideoVisionConfig) + def forward( + self, + pixel_values: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + interpolate_pos_encoding: bool = False, + ) -> Union[Tuple, BaseModelOutputWithPooling]: + r""" + Returns: + + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding) + + encoder_outputs = self.encoder( + inputs_embeds=hidden_states, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + last_hidden_state = encoder_outputs[0] + last_hidden_state = self.post_layernorm(last_hidden_state) + + pooled_output = last_hidden_state[:, 0, :] + pooled_output = self.post_layernorm(pooled_output) + + if not return_dict: + return (last_hidden_state, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPooling( + last_hidden_state=last_hidden_state, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + def get_input_embeddings(self): + return self.embeddings + + +class InstructBlipVideoQFormerMultiHeadAttention(nn.Module): + def __init__(self, config, is_cross_attention=False): + super().__init__() + self.config = config + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + "The hidden size (%d) is not a multiple of the number of attention heads (%d)" + % (config.hidden_size, config.num_attention_heads) + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + if is_cross_attention: + self.key = nn.Linear(config.encoder_hidden_size, self.all_head_size) + self.value = nn.Linear(config.encoder_hidden_size, self.all_head_size) + else: + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = getattr(config, 
"position_embedding_type", "absolute") + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) + self.save_attention = False + + def save_attn_gradients(self, attn_gradients): + self.attn_gradients = attn_gradients + + def get_attn_gradients(self): + return self.attn_gradients + + def save_attention_map(self, attention_map): + self.attention_map = attention_map + + def get_attention_map(self): + return self.attention_map + + def transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. + is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention: + key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = torch.cat([past_key_value[0], key_layer], dim=2) + value_layer = torch.cat([past_key_value[1], value_layer], dim=2) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + mixed_query_layer = self.query(hidden_states) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + past_key_value = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. 
+ attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + seq_length = hidden_states.size()[1] + position_ids_l = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(-1, 1) + position_ids_r = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(1, -1) + distance = position_ids_l - position_ids_r + positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility + + if self.position_embedding_type == "relative_key": + relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == "relative_key_query": + relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + attention_scores_dtype = attention_scores.dtype + + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in BertModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.Softmax(dim=-1)(attention_scores).to(attention_scores_dtype) + + if is_cross_attention and self.save_attention: + self.save_attention_map(attention_probs) + attention_probs.register_hook(self.save_attn_gradients) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
+ attention_probs_dropped = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs_dropped = attention_probs_dropped * head_mask + + context_layer = torch.matmul(attention_probs_dropped, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(*new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + outputs = outputs + (past_key_value,) + return outputs + + +# Copied from transformers.models.bert.modeling_bert.BertSelfOutput with Bert->InstructBlipVideoQFormer +class InstructBlipVideoQFormerSelfOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +# Copied from transformers.models.blip_2.modeling_blip_2.Blip2QFormerAttention with Blip2->InstructBlipVideo +class InstructBlipVideoQFormerAttention(nn.Module): + def __init__(self, config, is_cross_attention=False): + super().__init__() + self.attention = InstructBlipVideoQFormerMultiHeadAttention(config, is_cross_attention) + self.output = InstructBlipVideoQFormerSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.attention.query = prune_linear_layer(self.attention.query, index) + self.attention.key = prune_linear_layer(self.attention.key, index) + self.attention.value = prune_linear_layer(self.attention.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads) + self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + self_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +# Copied from transformers.models.bert.modeling_bert.BertIntermediate with Bert->InstructBlipVideoQFormer +class InstructBlipVideoQFormerIntermediate(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = 
nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +# Copied from transformers.models.bert.modeling_bert.BertOutput with Bert->InstructBlipVideoQFormer +class InstructBlipVideoQFormerOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class InstructBlipVideoQFormerLayer(nn.Module): + def __init__(self, config, layer_idx): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = InstructBlipVideoQFormerAttention(config) + + self.layer_idx = layer_idx + + if layer_idx % config.cross_attention_frequency == 0: + self.crossattention = InstructBlipVideoQFormerAttention(config, is_cross_attention=True) + self.has_cross_attention = True + else: + self.has_cross_attention = False + + self.intermediate = InstructBlipVideoQFormerIntermediate(config) + self.output = InstructBlipVideoQFormerOutput(config) + + self.intermediate_query = InstructBlipVideoQFormerIntermediate(config) + self.output_query = InstructBlipVideoQFormerOutput(config) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + query_length=0, + ): + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + past_key_value=self_attn_past_key_value, + ) + attention_output = self_attention_outputs[0] + outputs = self_attention_outputs[1:-1] + + present_key_value = self_attention_outputs[-1] + + if query_length > 0: + query_attention_output = attention_output[:, :query_length, :] + + if self.has_cross_attention: + if encoder_hidden_states is None: + raise ValueError("encoder_hidden_states must be given for cross-attention layers") + cross_attention_outputs = self.crossattention( + query_attention_output, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + output_attentions=output_attentions, + ) + query_attention_output = cross_attention_outputs[0] + # add cross attentions if we output attention weights + outputs = outputs + cross_attention_outputs[1:-1] + + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk_query, + self.chunk_size_feed_forward, + self.seq_len_dim, + query_attention_output, + ) + + if attention_output.shape[1] > query_length: + layer_output_text = apply_chunking_to_forward( + self.feed_forward_chunk, + self.chunk_size_feed_forward, + self.seq_len_dim, + attention_output[:, query_length:, 
:], + ) + layer_output = torch.cat([layer_output, layer_output_text], dim=1) + else: + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, + self.chunk_size_feed_forward, + self.seq_len_dim, + attention_output, + ) + outputs = (layer_output,) + outputs + + outputs = outputs + (present_key_value,) + + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + def feed_forward_chunk_query(self, attention_output): + intermediate_output = self.intermediate_query(attention_output) + layer_output = self.output_query(intermediate_output, attention_output) + return layer_output + + +# Copied from transformers.models.blip_2.modeling_blip_2.Blip2QFormerEncoder with Blip2->InstructBlipVideo +class InstructBlipVideoQFormerEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.ModuleList( + [InstructBlipVideoQFormerLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + query_length=0, + ): + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions else None + + next_decoder_cache = () if use_cache else None + + for i in range(self.config.num_hidden_layers): + layer_module = self.layer[i] + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + past_key_value = past_key_values[i] if past_key_values is not None else None + + if getattr(self.config, "gradient_checkpointing", False) and self.training: + if use_cache: + logger.warning( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
+ ) + use_cache = False + layer_outputs = self._gradient_checkpointing_func( + layer_module.__call__, + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + ) + else: + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + query_length, + ) + + hidden_states = layer_outputs[0] + if use_cache: + next_decoder_cache += (layer_outputs[-1],) + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + if layer_module.has_cross_attention: + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +class InstructBlipVideoQFormerEmbeddings(nn.Module): + """Construct the embeddings from word and position embeddings.""" + + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + + self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + + self.config = config + + def forward( + self, + input_ids=None, + position_ids=None, + query_embeds=None, + past_key_values_length=0, + ): + if input_ids is not None: + seq_length = input_ids.size()[1] + else: + seq_length = 0 + + if position_ids is None: + position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length].clone() + + if input_ids is not None: + embeddings = self.word_embeddings(input_ids) + if self.position_embedding_type == "absolute": + position_embeddings = self.position_embeddings(position_ids.to(embeddings.device)) + embeddings = embeddings + position_embeddings + + if query_embeds is not None: + embeddings = torch.cat((query_embeds, embeddings), dim=1) + else: + embeddings = query_embeds + + embeddings = embeddings.to(self.layernorm.weight.dtype) + embeddings = self.layernorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +class InstructBlipVideoQFormerModel(InstructBlipVideoPreTrainedModel): + """ + Querying Transformer (Q-Former), used in Instructblipvideo. Slightly modified from BLIP-2 as it also takes the + instruction as input. 
+ """ + + def __init__(self, config: InstructBlipVideoQFormerConfig): + super().__init__(config) + self.config = config + + self.embeddings = InstructBlipVideoQFormerEmbeddings(config) + + self.encoder = InstructBlipVideoQFormerEncoder(config) + + self.post_init() + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + def get_extended_attention_mask( + self, + attention_mask: torch.Tensor, + input_shape: Tuple[int], + device: torch.device, + has_query: bool = False, + ) -> torch.Tensor: + """ + Makes broadcastable attention and causal masks so that future and masked tokens are ignored. + + Arguments: + attention_mask (`torch.Tensor`): + Mask with ones indicating tokens to attend to, zeros for tokens to ignore. + input_shape (`Tuple[int]`): + The shape of the input to the model. + device: (`torch.device`): + The device of the input to the model. + + Returns: + `torch.Tensor` The extended attention mask, with a the same dtype as `attention_mask.dtype`. + """ + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + if attention_mask.dim() == 3: + extended_attention_mask = attention_mask[:, None, :, :] + elif attention_mask.dim() == 2: + # Provided a padding mask of dimensions [batch_size, seq_length] + # - the model is an encoder, so make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length] + extended_attention_mask = attention_mask[:, None, None, :] + else: + raise ValueError( + f"Wrong shape for input_ids (shape {input_shape}) or attention_mask (shape {attention_mask.shape})", + ) + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. + extended_attention_mask = extended_attention_mask.to(dtype=self.dtype) # fp16 compatibility + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + return extended_attention_mask + + def forward( + self, + input_ids: torch.LongTensor, + attention_mask: Optional[torch.FloatTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + query_embeds: Optional[torch.Tensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.FloatTensor], BaseModelOutputWithPoolingAndCrossAttentions]: + r""" + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. 
Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of: + shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): Contains precomputed key and + value hidden states of the attention blocks. Can be used to speed up decoding. If `past_key_values` are + used, the user can optionally input only the last `decoder_input_ids` (those that don't have their past key + value states given to this model) of shape `(batch_size, 1)` instead of all `decoder_input_ids` of shape + `(batch_size, sequence_length)`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is None and query_embeds is None: + raise ValueError("You have to specify query_embeds when input_ids is None") + + # past_key_values_length + past_key_values_length = ( + past_key_values[0][0].shape[2] - self.config.query_length if past_key_values is not None else 0 + ) + + query_length = query_embeds.shape[1] if query_embeds is not None else 0 + + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + query_embeds=query_embeds, + past_key_values_length=past_key_values_length, + ) + + input_shape = embedding_output.size()[:-1] + batch_size, seq_length = input_shape + device = embedding_output.device + + if attention_mask is None: + attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. 
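+        # Illustration (values assumed, not taken from a real run): a 2-D padding mask such as
+        # [[1, 1, 1, 0]] is turned by `get_extended_attention_mask` into an additive bias of shape
+        # [batch_size, 1, 1, seq_length], here [[[[0.0, 0.0, 0.0, -10000.0]]]], so padded positions
+        # are pushed towards -inf in the attention scores rather than being removed from the sequence.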
+ extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape, device) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if encoder_hidden_states is not None: + if isinstance(encoder_hidden_states, list): + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states[0].size() + else: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + + if isinstance(encoder_attention_mask, list): + encoder_extended_attention_mask = [self.invert_attention_mask(mask) for mask in encoder_attention_mask] + elif encoder_attention_mask is None: + encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + query_length=query_length, + ) + sequence_output = encoder_outputs[0] + pooled_output = sequence_output[:, 0, :] + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) + + +@add_start_docstrings( + """ + Instructblipvideo Model for generating text given an image and an optional text prompt. The model consists of a vision + encoder, Querying Transformer (Q-Former) and a language model. + + One can optionally pass `input_ids` to the model, which serve as a text prompt, to make the language model continue + the prompt. Otherwise, the language model starts generating text from the [BOS] (beginning-of-sequence) token. 
+ """, + INSTRUCTBLIPVIDEO_START_DOCSTRING, +) +class InstructBlipVideoForConditionalGeneration(InstructBlipVideoPreTrainedModel): + config_class = InstructBlipVideoConfig + main_input_name = "pixel_values" + + def __init__(self, config: InstructBlipVideoConfig): + super().__init__(config) + + self.vision_model = InstructBlipVideoVisionModel(config.vision_config) + + self.query_tokens = nn.Parameter(torch.zeros(1, config.num_query_tokens, config.qformer_config.hidden_size)) + self.qformer = InstructBlipVideoQFormerModel(config.qformer_config) + + self.language_projection = nn.Linear(config.qformer_config.hidden_size, config.text_config.hidden_size) + + if config.use_decoder_only_language_model: + language_model = AutoModelForCausalLM.from_config( + config.text_config, attn_implementation=config._attn_implementation + ) + else: + language_model = AutoModelForSeq2SeqLM.from_config( + config.text_config, attn_implementation=config._attn_implementation + ) + + if language_model._no_split_modules is not None: + self._no_split_modules.extend(language_model._no_split_modules) + + if language_model._keep_in_fp32_modules is not None: + self._keep_in_fp32_modules.extend(language_model._keep_in_fp32_modules) + + self.language_model = language_model + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.language_model.get_input_embeddings() + + def set_input_embeddings(self, value): + self.language_model.set_input_embeddings(value) + + def set_output_embeddings(self, new_embeddings): + self.language_model.set_output_embeddings(new_embeddings) + + def get_output_embeddings(self) -> nn.Module: + return self.language_model.get_output_embeddings() + + def get_encoder(self): + return self.language_model.get_encoder() + + def get_decoder(self): + return self.language_model.get_decoder() + + def _tie_weights(self): + if not self.config.use_decoder_only_language_model: + self.language_model.encoder.embed_tokens = self.language_model.shared + self.language_model.decoder.embed_tokens = self.language_model.shared + + def _preprocess_accelerate(self): + r""" + Some pre-processing hacks to make the model `accelerate` compatible. Check + https://github.com/huggingface/transformers/pull/21707 for more details. + """ + hf_device_map = self.hf_device_map + + if len(hf_device_map) > 1 and "language_model" not in hf_device_map and torch.cuda.device_count() > 1: + # warn users about unexpected behavior when using multi-GPU + Instructblipvideo + `accelerate`. + logger.warning( + "The `language_model` is not in the `hf_device_map` dictionary and you are running your script" + " in a multi-GPU environment. this may lead to unexpected behavior when using `accelerate`." + " Please pass a `device_map` that contains `language_model` to remove this warning." 
+ " Please refer to https://github.com/huggingface/blog/blob/main/accelerate-large-models.md for" + " more details on creating a `device_map` for large models.", + ) + + if hasattr(self.language_model, "_hf_hook"): + self.language_model._hf_hook.io_same_device = True # For `generate` compatibility + + @add_start_docstrings_to_model_forward(INSTRUCTBLIPVIDEO_INPUTS_DOCSTRING) + @replace_return_docstrings( + output_type=InstructBlipVideoForConditionalGenerationModelOutput, config_class=InstructBlipVideoVisionConfig + ) + def forward( + self, + pixel_values: torch.FloatTensor, + qformer_input_ids: torch.FloatTensor, + qformer_attention_mask: Optional[torch.LongTensor] = None, + input_ids: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.LongTensor] = None, + decoder_input_ids: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + labels: Optional[torch.LongTensor] = None, + return_dict: Optional[bool] = None, + interpolate_pos_encoding: bool = False, + ) -> Union[Tuple, InstructBlipVideoForConditionalGenerationModelOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the language modeling loss. Indices should be in `[-100, 0, ..., config.vocab_size - + 1]`. All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ..., + config.vocab_size]` + + Returns: + + Examples: + + ```python + >>> from transformers import InstructBlipVideoProcessor, InstructBlipVideoForConditionalGeneration + >>> import torch + >>> from huggingface_hub import hf_hub_download + >>> from decord import VideoReader + + >>> model = InstructBlipVideoProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b", device_map="auto") + >>> processor = InstructBlipVideoForConditionalGeneration.from_pretrained("Salesforce/instructblip-vicuna-7b") + + >>> file_path = hf_hub_download( + repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset" + ) + + >>> vr = VideoReader(uri=file_path, height=224, width=224) + >>> start, end = 0, len(vr) + >>> indices = np.arange(start, end, vlen / 4).astype(int) + >>> frames = vr.get_batch(indices).asnumpy() + >>> video = list(frames) + >>> prompt = "What is happening in the video?" + >>> inputs = processor(videos=[video], text=prompt, return_tensors="pt").to(device) + + >>> outputs = model.generate( + ... **inputs, + ... do_sample=False, + ... num_beams=5, + ... max_length=256, + ... min_length=1, + ... top_p=0.9, + ... repetition_penalty=1.5, + ... length_penalty=1.0, + ... temperature=1, + ... ) + >>> generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip() + >>> print(generated_text) + "A person is eating a bowl of pasta, and they are using a fork to eat it. 
The person is sitting at a table, and the plate of pasta is on the table in front" + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # step 1: forward the images through the vision encoder, + # we process in a batched way, later unbatch it back (video has frames=4 always) + batch_size, frames, channel, height, width = pixel_values.shape + pixel_values = pixel_values.reshape(batch_size * frames, channel, height, width) + + vision_outputs = self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + interpolate_pos_encoding=interpolate_pos_encoding, + ) + image_embeds = vision_outputs[0] + + # step 2: forward the query tokens through the QFormer, using the image embeddings for cross-attention + image_attention_mask = torch.ones(image_embeds.size()[:-1], dtype=torch.long, device=image_embeds.device) + + # difference with BLIP-2 here: we also feed the instruction prompt to the Q-Former + query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1) + query_attention_mask = torch.ones(query_tokens.size()[:-1], dtype=torch.long, device=image_embeds.device) + + if qformer_attention_mask is None: + qformer_attention_mask = torch.ones_like(qformer_input_ids) + + qformer_input_ids = qformer_input_ids.repeat_interleave(frames, dim=0) + qformer_attention_mask = qformer_attention_mask.repeat_interleave(frames, dim=0) + qformer_attention_mask = torch.cat([query_attention_mask, qformer_attention_mask], dim=1) + query_outputs = self.qformer( + input_ids=qformer_input_ids, + attention_mask=qformer_attention_mask, + query_embeds=query_tokens, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + query_output = query_outputs[0][:, : query_tokens.size(1), :] + + # step 3: use the language model, conditioned on the query outputs and the prompt + language_model_inputs = self.language_projection(query_output) + + # unbatch inputs back, each video-frame gets `num_query_tokens` seq length + language_model_inputs = language_model_inputs.reshape(batch_size, self.config.num_query_tokens * frames, -1) + language_model_attention_mask = torch.ones( + language_model_inputs.size()[:-1], dtype=torch.long, device=language_model_inputs.device + ) + + inputs_embeds = self.language_model.get_input_embeddings()(input_ids) + inputs_embeds = torch.cat([language_model_inputs, inputs_embeds.to(language_model_inputs.device)], dim=1) + + if attention_mask is None: + attention_mask = torch.ones_like(input_ids) + attention_mask = torch.cat([language_model_attention_mask.to(attention_mask.device), attention_mask], dim=1) + + if self.config.use_decoder_only_language_model: + outputs = self.language_model( + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + logits = outputs.logits if return_dict else outputs[0] + loss = None + # we compute the loss here since we need to take into account the sequence length of the query embeds + if labels is not None: + labels = labels.to(logits.device) + logits = logits[:, -labels.size(1) :, :] + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous().to(logits.device) + + # Flatten the tokens + 
loss_fct = CrossEntropyLoss(reduction="mean") + + loss = loss_fct(shift_logits.view(-1, self.config.text_config.vocab_size), shift_labels.view(-1)) + else: + outputs = self.language_model( + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + labels=labels, + ) + loss = outputs.loss if return_dict else outputs[0] + logits = outputs.logits if return_dict else outputs[1] + + if not return_dict: + output = (logits, vision_outputs, query_outputs, outputs) + return ((loss,) + output) if loss is not None else output + + return InstructBlipVideoForConditionalGenerationModelOutput( + loss=loss, + logits=logits, + vision_outputs=vision_outputs, + qformer_outputs=query_outputs, + language_model_outputs=outputs, + ) + + @torch.no_grad() + def generate( + self, + pixel_values: torch.FloatTensor, + qformer_input_ids: Optional[torch.LongTensor] = None, + qformer_attention_mask: Optional[torch.LongTensor] = None, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.LongTensor] = None, + interpolate_pos_encoding: bool = False, + **generate_kwargs, + ) -> torch.LongTensor: + """ + Overrides `generate` function to be able to use the model as a conditional generator. + + Args: + pixel_values (`torch.FloatTensor` of shape (batch_size, num_channels, height, width) or + (batch_size, num_frames, num_channels, height, width)): Input images or videos to be processed. + qformer_input_ids (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*): + The sequence used as a prompt to be fed to the Q-Former module. + qformer_attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*): + Mask to avoid performing attention on padding token indices. + input_ids (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*): + The sequence used as a prompt for the generation. + attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*): + Mask to avoid performing attention on padding token indices. + interpolate_pos_encoding (`bool`, *optional*, defaults to `False`): + Whether to interpolate the positional encoding of the image embeddings. + + Returns: + captions (list): A list of strings of length batch_size * num_captions. 
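Since the model folds a fixed, small number of frames into the batch (the comments in `forward` and `generate` assume 4), a hedged sketch of how one might build the video input before handing it to the processor; the frame count and spatial size are assumptions taken from the surrounding code and common image-processor defaults, and `sample_frames` is a hypothetical helper, not part of this patch.

```python
import numpy as np


def sample_frames(video: np.ndarray, num_frames: int = 4) -> np.ndarray:
    """Uniformly sample `num_frames` frames from a decoded clip of shape (T, H, W, C)."""
    total_frames = video.shape[0]
    indices = np.linspace(0, total_frames - 1, num=num_frames).astype(int)
    return video[indices]


# A dummy 60-frame clip; in practice the frames would come from decord, PyAV, OpenCV, etc.
clip = np.zeros((60, 224, 224, 3), dtype=np.uint8)
frames = sample_frames(clip)
print(frames.shape)  # (4, 224, 224, 3)
# The processor is then expected to turn these frames into `pixel_values` of shape
# (batch_size, num_frames, num_channels, height, width), matching the reshape done in `generate`.
```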
+ """ + if hasattr(self, "hf_device_map"): + # preprocess for `accelerate` + self._preprocess_accelerate() + + # we process in a batched way, later unbatch it back (video has frames=4) + batch_size, frames, channel, height, width = pixel_values.shape + pixel_values = pixel_values.reshape(batch_size * frames, channel, height, width) + + image_embeds = self.vision_model( + pixel_values, + return_dict=True, + interpolate_pos_encoding=interpolate_pos_encoding, + ).last_hidden_state + image_attention_mask = torch.ones(image_embeds.size()[:-1], dtype=torch.long, device=image_embeds.device) + + query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1) + query_attention_mask = torch.ones(query_tokens.size()[:-1], dtype=torch.long, device=image_embeds.device) + if qformer_attention_mask is None: + qformer_attention_mask = torch.ones_like(qformer_input_ids) + + qformer_input_ids = qformer_input_ids.repeat_interleave(frames, dim=0) + qformer_attention_mask = qformer_attention_mask.repeat_interleave(frames, dim=0) + qformer_attention_mask = torch.cat([query_attention_mask, qformer_attention_mask], dim=1) + query_outputs = self.qformer( + input_ids=qformer_input_ids, + attention_mask=qformer_attention_mask, + query_embeds=query_tokens, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_attention_mask, + return_dict=True, + ) + query_output = query_outputs.last_hidden_state[:, : query_tokens.size(1), :] + + language_model_inputs = self.language_projection(query_output) + + # unbatch the embeddings back by moving frames to seq-len + language_model_inputs = language_model_inputs.reshape(batch_size, self.config.num_query_tokens * frames, -1) + language_attention_mask = torch.ones( + language_model_inputs.size()[:-1], dtype=torch.long, device=language_model_inputs.device + ) + + if input_ids is None: + input_ids = ( + torch.LongTensor([[self.config.text_config.bos_token_id]]) + .repeat(batch_size, 1) + .to(image_embeds.device) + ) + if attention_mask is None: + attention_mask = torch.ones_like(input_ids) + attention_mask = torch.cat([language_attention_mask, attention_mask.to(language_attention_mask.device)], dim=1) + + # concatenate query embeddings with prompt embeddings + inputs_embeds = self.get_input_embeddings()(input_ids) + inputs_embeds = torch.cat([language_model_inputs, inputs_embeds.to(language_model_inputs.device)], dim=1) + + # add image_embeds length to max_length, so that the final max_length in counted only on token embeds + # -1 is to account for the prepended BOS after `generate.` + if not self.language_model.config.is_encoder_decoder: + generate_kwargs["max_length"] = generate_kwargs.get("max_length", 20) + language_model_inputs.shape[1] - 1 + generate_kwargs["min_length"] = generate_kwargs.get("min_length", 0) + language_model_inputs.shape[1] + + outputs = self.language_model.generate( + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + **generate_kwargs, + ) + + # this is a temporary workaround to be consistent with other generation models and + # have BOS as the first token, even though under the hood we are calling LM with embeds + if not self.language_model.config.is_encoder_decoder: + # the InstructBLIP authors used inconsistent tokenizer/model files during training, + # with the tokenizer's bos token being set to which has ID=2, + # whereas the model's text config has bos token id = 0 + bos_token_id = ( + 2 + if self.config.text_config.architectures[0] == "LLaMAForCausalLM" + else self.config.text_config.bos_token_id + ) + bos_tokens = 
torch.LongTensor([[bos_token_id]]).repeat(batch_size, 1).to(image_embeds.device) + if not isinstance(outputs, torch.Tensor): + outputs.sequences = torch.cat([bos_tokens, outputs.sequences], dim=-1) + else: + outputs = torch.cat([bos_tokens, outputs], dim=-1) + + return outputs diff --git a/src/transformers/models/instructblipvideo/processing_instructblipvideo.py b/src/transformers/models/instructblipvideo/processing_instructblipvideo.py new file mode 100644 index 00000000000000..0957158077516d --- /dev/null +++ b/src/transformers/models/instructblipvideo/processing_instructblipvideo.py @@ -0,0 +1,170 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Processor class for InstructBLIP. Largely copy of Blip2Processor with addition of a tokenizer for the Q-Former. +""" + +import os +from typing import List, Optional, Union + +from ...image_processing_utils import BatchFeature +from ...image_utils import VideoInput +from ...processing_utils import ProcessorMixin +from ...tokenization_utils_base import PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy +from ...utils import TensorType +from ..auto import AutoTokenizer + + +class InstructBlipVideoProcessor(ProcessorMixin): + r""" + Constructs an InstructBLIPVideo processor which wraps a InstructBLIP image processor and a LLaMa/T5 tokenizer into a single + processor. + + [`InstructBlipVideoProcessor`] offers all the functionalities of [`InstructBlipVideoImageProcessor`] and [`AutoTokenizer`]. See the + docstring of [`~InstructBlipVideoProcessor.__call__`] and [`~InstructBlipVideoProcessor.decode`] for more information. + + Args: + image_processor (`InstructBlipVideoImageProcessor`): + An instance of [`InstructBlipVideoImageProcessor`]. The image processor is a required input. + tokenizer (`AutoTokenizer`): + An instance of ['PreTrainedTokenizer`]. The tokenizer is a required input. + qformer_tokenizer (`AutoTokenizer`): + An instance of ['PreTrainedTokenizer`]. The Q-Former tokenizer is a required input. 
+ """ + + attributes = ["image_processor", "tokenizer"] + image_processor_class = "InstructBlipVideoImageProcessor" + tokenizer_class = "AutoTokenizer" + + def __init__(self, image_processor, tokenizer, qformer_tokenizer): + super().__init__(image_processor, tokenizer) + + # add QFormer tokenizer + self.qformer_tokenizer = qformer_tokenizer + + def __call__( + self, + images: VideoInput = None, + text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None, + add_special_tokens: bool = True, + padding: Union[bool, str, PaddingStrategy] = False, + truncation: Union[bool, str, TruncationStrategy] = None, + max_length: Optional[int] = None, + stride: int = 0, + pad_to_multiple_of: Optional[int] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_token_type_ids: bool = False, + return_length: bool = False, + verbose: bool = True, + return_tensors: Optional[Union[str, TensorType]] = None, + **kwargs, + ) -> BatchFeature: + """ + This method uses [`InstructBlipVideoImageProcessor.__call__`] method to prepare image(s) or video(s) for the model, and + [`BertTokenizerFast.__call__`] to prepare text for the model. + + Please refer to the docstring of the above two methods for more information. + """ + encoding = BatchFeature() + + if text is not None: + text_encoding = self.tokenizer( + text=text, + add_special_tokens=add_special_tokens, + padding=padding, + truncation=truncation, + max_length=max_length, + stride=stride, + pad_to_multiple_of=pad_to_multiple_of, + return_attention_mask=return_attention_mask, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_offsets_mapping=return_offsets_mapping, + return_token_type_ids=return_token_type_ids, + return_length=return_length, + verbose=verbose, + return_tensors=return_tensors, + **kwargs, + ) + encoding.update(text_encoding) + qformer_text_encoding = self.qformer_tokenizer( + text=text, + add_special_tokens=add_special_tokens, + padding=padding, + truncation=truncation, + max_length=max_length, + stride=stride, + pad_to_multiple_of=pad_to_multiple_of, + return_attention_mask=return_attention_mask, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_offsets_mapping=return_offsets_mapping, + return_token_type_ids=return_token_type_ids, + return_length=return_length, + verbose=verbose, + return_tensors=return_tensors, + **kwargs, + ) + encoding["qformer_input_ids"] = qformer_text_encoding.pop("input_ids") + encoding["qformer_attention_mask"] = qformer_text_encoding.pop("attention_mask") + + if images is not None: + image_encoding = self.image_processor(images, return_tensors=return_tensors) + encoding.update(image_encoding) + + return encoding + + # Copied from transformers.models.blip.processing_blip.BlipProcessor.batch_decode with BertTokenizerFast->PreTrainedTokenizer + def batch_decode(self, *args, **kwargs): + """ + This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please + refer to the docstring of this method for more information. 
+ """ + return self.tokenizer.batch_decode(*args, **kwargs) + + # Copied from transformers.models.blip.processing_blip.BlipProcessor.decode with BertTokenizerFast->PreTrainedTokenizer + def decode(self, *args, **kwargs): + """ + This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to + the docstring of this method for more information. + """ + return self.tokenizer.decode(*args, **kwargs) + + @property + # Copied from transformers.models.blip.processing_blip.BlipProcessor.model_input_names + def model_input_names(self): + tokenizer_input_names = self.tokenizer.model_input_names + image_processor_input_names = self.image_processor.model_input_names + return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) + + # overwrite to save the Q-Former tokenizer in a separate folder + def save_pretrained(self, save_directory, **kwargs): + if os.path.isfile(save_directory): + raise ValueError(f"Provided path ({save_directory}) should be a directory, not a file") + os.makedirs(save_directory, exist_ok=True) + qformer_tokenizer_path = os.path.join(save_directory, "qformer_tokenizer") + self.qformer_tokenizer.save_pretrained(qformer_tokenizer_path) + return super().save_pretrained(save_directory, **kwargs) + + # overwrite to load the Q-Former tokenizer from a separate folder + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): + qformer_tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path, subfolder="qformer_tokenizer") + args = cls._get_arguments_from_pretrained(pretrained_model_name_or_path, **kwargs) + args.append(qformer_tokenizer) + return cls(*args) diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 0cda4ed7b96349..11a16562263d45 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -4755,56 +4755,56 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class JambaForCausalLM(metaclass=DummyObject): +class InstructBlipVideoForConditionalGeneration(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class JambaForSequenceClassification(metaclass=DummyObject): +class InstructBlipVideoPreTrainedModel(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class JambaModel(metaclass=DummyObject): +class InstructBlipVideoQFormerModel(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class JambaPreTrainedModel(metaclass=DummyObject): +class InstructBlipVideoVisionModel(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class JetMoeForCausalLM(metaclass=DummyObject): +class JambaForCausalLM(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class JetMoeForSequenceClassification(metaclass=DummyObject): +class JambaForSequenceClassification(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class JetMoeModel(metaclass=DummyObject): +class JambaModel(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class JetMoePreTrainedModel(metaclass=DummyObject): +class 
JambaPreTrainedModel(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): diff --git a/src/transformers/utils/dummy_vision_objects.py b/src/transformers/utils/dummy_vision_objects.py index aae31e9e4dd7f4..329831667c6303 100644 --- a/src/transformers/utils/dummy_vision_objects.py +++ b/src/transformers/utils/dummy_vision_objects.py @@ -296,6 +296,13 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["vision"]) +class InstructBlipVideoImageProcessor(metaclass=DummyObject): + _backends = ["vision"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["vision"]) + + class LayoutLMv2FeatureExtractor(metaclass=DummyObject): _backends = ["vision"] diff --git a/tests/models/instructblipvideo/__init__.py b/tests/models/instructblipvideo/__init__.py new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/tests/models/instructblipvideo/test_modeling_instructblipvideo.py b/tests/models/instructblipvideo/test_modeling_instructblipvideo.py new file mode 100644 index 00000000000000..d55c5d5d30ff32 --- /dev/null +++ b/tests/models/instructblipvideo/test_modeling_instructblipvideo.py @@ -0,0 +1,645 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
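A minimal sketch of the placeholder pattern used in the dummy modules above, assuming the standard behaviour of `requires_backends` (raising an import error that names the missing backend); the class name here is hypothetical.

```python
from transformers.utils import DummyObject, requires_backends


class MyTorchOnlyThing(metaclass=DummyObject):
    _backends = ["torch"]

    def __init__(self, *args, **kwargs):
        requires_backends(self, ["torch"])


# With torch installed this is a harmless no-op; without torch, instantiating the class (or
# accessing a public attribute on it, thanks to the metaclass) raises an informative error
# instead of breaking `import transformers` itself.
thing = MyTorchOnlyThing()
```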
+"""Testing suite for the PyTorch InstructBlipVideo model.""" + +import inspect +import tempfile +import unittest + +import numpy as np +import requests + +from transformers import ( + CONFIG_MAPPING, + InstructBlipProcessor, + InstructBlipVideoConfig, + InstructBlipVideoQFormerConfig, + InstructBlipVideoVisionConfig, +) +from transformers.testing_utils import ( + require_accelerate, + require_bitsandbytes, + require_torch, + require_vision, + slow, + torch_device, +) +from transformers.utils import is_torch_available, is_vision_available + +from ...generation.test_utils import GenerationTesterMixin +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ( + ModelTesterMixin, + floats_tensor, + ids_tensor, + random_attention_mask, +) + + +if is_torch_available(): + import torch + from torch import nn + + from transformers import InstructBlipVideoForConditionalGeneration, InstructBlipVideoVisionModel + + +if is_vision_available(): + from PIL import Image + + +class InstructBlipVideoVisionModelTester: + def __init__( + self, + parent, + batch_size=12, + image_size=30, + patch_size=2, + num_channels=3, + is_training=True, + hidden_size=32, + projection_dim=32, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=37, + dropout=0.1, + attention_dropout=0.1, + initializer_range=1e-10, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.is_training = is_training + self.hidden_size = hidden_size + self.projection_dim = projection_dim + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.dropout = dropout + self.attention_dropout = attention_dropout + self.initializer_range = initializer_range + self.scope = scope + + # in case of a vision transformer, the seq length equals the number of patches + 1 (we add 1 for the [CLS] token) + num_patches = (image_size // patch_size) ** 2 + self.seq_length = num_patches + 1 + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + config = self.get_config() + + return config, pixel_values + + def get_config(self): + return InstructBlipVideoVisionConfig( + image_size=self.image_size, + patch_size=self.patch_size, + num_channels=self.num_channels, + hidden_size=self.hidden_size, + projection_dim=self.projection_dim, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + dropout=self.dropout, + attention_dropout=self.attention_dropout, + initializer_range=self.initializer_range, + ) + + def create_and_check_model(self, config, pixel_values): + model = InstructBlipVideoVisionModel(config=config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + result = model(pixel_values) + # expected sequence length = num_patches + 1 (we add 1 for the [CLS] token) + image_size = (self.image_size, self.image_size) + patch_size = (self.patch_size, self.patch_size) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, num_patches + 1, self.hidden_size)) + self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = 
self.prepare_config_and_inputs() + config, pixel_values = config_and_inputs + inputs_dict = {"pixel_values": pixel_values} + return config, inputs_dict + + +@require_torch +class InstructBlipVideoVisionModelTest(ModelTesterMixin, unittest.TestCase): + """ + Here we also overwrite some of the tests of test_modeling_common.py, as InstructBlipVideo's vision encoder does not use input_ids, inputs_embeds, + attention_mask and seq_length. + """ + + all_model_classes = (InstructBlipVideoVisionModel,) if is_torch_available() else () + fx_compatible = False + test_pruning = False + test_resize_embeddings = False + test_head_masking = False + + def setUp(self): + self.model_tester = InstructBlipVideoVisionModelTester(self) + self.config_tester = ConfigTester( + self, config_class=InstructBlipVideoVisionConfig, has_text_modality=False, hidden_size=37 + ) + + def test_config(self): + self.config_tester.run_common_tests() + + @unittest.skip(reason="InstructBlipVideo's vision encoder does not use inputs_embeds") + def test_inputs_embeds(self): + pass + + def test_model_common_attributes(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + self.assertIsInstance(model.get_input_embeddings(), (nn.Module)) + x = model.get_output_embeddings() + self.assertTrue(x is None or isinstance(x, nn.Linear)) + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.forward) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + expected_arg_names = ["pixel_values"] + self.assertListEqual(arg_names[:1], expected_arg_names) + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + @unittest.skip( + reason="InstructBlipVideoVisionModel is an internal building block, doesn't support standalone training" + ) + def test_training(self): + pass + + @unittest.skip( + reason="InstructBlipVideoVisionModel is an internal building block, doesn't support standalone training" + ) + def test_training_gradient_checkpointing(self): + pass + + @unittest.skip( + reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant(self): + pass + + @unittest.skip( + reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant_false(self): + pass + + @unittest.skip(reason="InstructBlipVideoVisionModel has no base class and is not available in MODEL_MAPPING") + def test_save_load_fast_init_from_base(self): + pass + + @unittest.skip(reason="InstructBlipVideoVisionModel has no base class and is not available in MODEL_MAPPING") + def test_save_load_fast_init_to_base(self): + pass + + @slow + def test_model_from_pretrained(self): + model_name = "Salesforce" + model = InstructBlipVideoVisionModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +class InstructBlipVideoQFormerModelTester: + def __init__( + self, + parent, + batch_size=12, + seq_length=7, + is_training=True, + use_input_mask=True, + use_labels=True, + 
vocab_size=99, + hidden_size=32, + projection_dim=32, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=37, + dropout=0.1, + attention_dropout=0.1, + max_position_embeddings=512, + initializer_range=0.02, + bos_token_id=0, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.projection_dim = projection_dim + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.dropout = dropout + self.attention_dropout = attention_dropout + self.max_position_embeddings = max_position_embeddings + self.initializer_range = initializer_range + self.scope = scope + self.bos_token_id = bos_token_id + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + qformer_input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + qformer_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) + + if input_mask is not None: + batch_size, seq_length = input_mask.shape + rnd_start_indices = np.random.randint(1, seq_length - 1, size=(batch_size,)) + for batch_idx, start_index in enumerate(rnd_start_indices): + input_mask[batch_idx, :start_index] = 1 + input_mask[batch_idx, start_index:] = 0 + + config = self.get_config() + + return config, input_ids, input_mask, qformer_input_ids, qformer_attention_mask + + def get_config(self): + return InstructBlipVideoQFormerConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + projection_dim=self.projection_dim, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + dropout=self.dropout, + attention_dropout=self.attention_dropout, + max_position_embeddings=self.max_position_embeddings, + initializer_range=self.initializer_range, + bos_token_id=self.bos_token_id, + ) + + +# this class is based on `OPTModelTester` found in tests/models/opt/test_modeling_opt.py +class InstructBlipVideoTextModelDecoderOnlyTester: + def __init__( + self, + parent, + batch_size=12, + seq_length=7, + is_training=True, + use_labels=False, + vocab_size=99, + hidden_size=16, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=4, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=20, + eos_token_id=2, + pad_token_id=1, + bos_token_id=0, + embed_dim=16, + num_labels=3, + word_embed_proj_dim=16, + type_sequence_label_size=2, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.eos_token_id = eos_token_id + self.pad_token_id = pad_token_id + self.bos_token_id = bos_token_id + self.embed_dim 
= embed_dim + self.num_labels = num_labels + self.type_sequence_label_size = type_sequence_label_size + self.word_embed_proj_dim = word_embed_proj_dim + self.is_encoder_decoder = False + + def prepare_config_and_inputs(self): + config = self.get_config() + + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size).clamp(3) + input_ids[:, -1] = self.eos_token_id # Eos Token + + attention_mask = input_ids.ne(self.pad_token_id) + + return config, input_ids, attention_mask + + def get_config(self): + return CONFIG_MAPPING["opt"]( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + ffn_dim=self.intermediate_size, + dropout=self.hidden_dropout_prob, + attention_dropout=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + eos_token_id=self.eos_token_id, + bos_token_id=self.bos_token_id, + pad_token_id=self.pad_token_id, + embed_dim=self.embed_dim, + is_encoder_decoder=False, + word_embed_proj_dim=self.word_embed_proj_dim, + ) + + +# this model tester uses a decoder-only language model (OPT) +class InstructBlipVideoForConditionalGenerationDecoderOnlyModelTester: + def __init__( + self, parent, vision_kwargs=None, qformer_kwargs=None, text_kwargs=None, is_training=True, num_query_tokens=10 + ): + if vision_kwargs is None: + vision_kwargs = {} + if qformer_kwargs is None: + qformer_kwargs = {} + if text_kwargs is None: + text_kwargs = {} + + self.parent = parent + self.vision_model_tester = InstructBlipVideoVisionModelTester(parent, **vision_kwargs) + self.qformer_model_tester = InstructBlipVideoQFormerModelTester(parent, **qformer_kwargs) + self.text_model_tester = InstructBlipVideoTextModelDecoderOnlyTester(parent, **text_kwargs) + self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test + self.seq_length = self.text_model_tester.seq_length # need seq_length for common tests + self.is_training = is_training + self.num_query_tokens = num_query_tokens + + def prepare_config_and_inputs(self): + _, pixel_values = self.vision_model_tester.prepare_config_and_inputs() + _, _, _, qformer_input_ids, qformer_attention_mask = self.qformer_model_tester.prepare_config_and_inputs() + _, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs() + + config = self.get_config() + + return config, input_ids, attention_mask, qformer_input_ids, qformer_attention_mask, pixel_values + + def get_config(self): + return InstructBlipVideoConfig.from_vision_qformer_text_configs( + vision_config=self.vision_model_tester.get_config(), + qformer_config=self.qformer_model_tester.get_config(), + text_config=self.text_model_tester.get_config(), + num_query_tokens=self.num_query_tokens, + ) + + def create_and_check_for_conditional_generation( + self, config, input_ids, attention_mask, qformer_input_ids, qformer_attention_mask, pixel_values + ): + model = InstructBlipVideoForConditionalGeneration(config).to(torch_device).eval() + with torch.no_grad(): + result = model( + pixel_values, + input_ids=input_ids, + attention_mask=attention_mask, + qformer_input_ids=qformer_input_ids, + qformer_attention_mask=qformer_attention_mask, + ) + + expected_seq_length = self.num_query_tokens + self.text_model_tester.seq_length + self.parent.assertEqual( + result.logits.shape, + (self.vision_model_tester.batch_size, expected_seq_length, self.text_model_tester.vocab_size), + ) + + def prepare_config_and_inputs_for_common(self): + 
config_and_inputs = self.prepare_config_and_inputs() + config, input_ids, attention_mask, qformer_input_ids, qformer_attention_mask, pixel_values = config_and_inputs + inputs_dict = { + "pixel_values": pixel_values, + "input_ids": input_ids, + "attention_mask": attention_mask, + "qformer_input_ids": qformer_input_ids, + "qformer_attention_mask": qformer_attention_mask, + "labels": input_ids, + } + return config, inputs_dict + + +@require_torch +class InstructBlipVideoForConditionalGenerationDecoderOnlyTest( + ModelTesterMixin, GenerationTesterMixin, unittest.TestCase +): + all_model_classes = (InstructBlipVideoForConditionalGeneration,) if is_torch_available() else () + fx_compatible = False + test_head_masking = False + test_pruning = False + test_resize_embeddings = False + test_attention_outputs = False + test_torchscript = False + + def setUp(self): + self.model_tester = InstructBlipVideoForConditionalGenerationDecoderOnlyModelTester(self) + + def test_for_conditional_generation(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_conditional_generation(*config_and_inputs) + + @unittest.skip(reason="Hidden_states is tested in individual model tests") + def test_hidden_states_output(self): + pass + + @unittest.skip(reason="InstructBlipVideoForConditionalGeneration doesn't support inputs_embeds") + def test_inputs_embeds(self): + pass + + @unittest.skip(reason="Tied weights are tested in individual model tests") + def test_tied_weights_keys(self): + pass + + @unittest.skip(reason="Retain_grad is tested in individual model tests") + def test_retain_grad_hidden_states_attentions(self): + pass + + @unittest.skip(reason="InstructBlipVideoModel does not have input/output embeddings") + def test_model_common_attributes(self): + pass + + @unittest.skip(reason="There's no base InstructBlipVideoModel") + def test_save_load_fast_init_from_base(self): + pass + + @unittest.skip(reason="There's no base InstructBlipVideoModel") + def test_save_load_fast_init_to_base(self): + pass + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.forward) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + expected_arg_names = ["pixel_values"] + self.assertListEqual(arg_names[:1], expected_arg_names) + + def test_load_vision_qformer_text_config(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + # Save InstructBlipVideoConfig and check if we can load InstructBlipVideoVisionConfig from it + with tempfile.TemporaryDirectory() as tmp_dir_name: + config.save_pretrained(tmp_dir_name) + vision_config = InstructBlipVideoVisionConfig.from_pretrained(tmp_dir_name) + self.assertDictEqual(config.vision_config.to_dict(), vision_config.to_dict()) + + # Save InstructBlipVideoConfig and check if we can load InstructBlipVideoQFormerConfig from it + with tempfile.TemporaryDirectory() as tmp_dir_name: + config.save_pretrained(tmp_dir_name) + qformer_config = InstructBlipVideoQFormerConfig.from_pretrained(tmp_dir_name) + self.assertDictEqual(config.qformer_config.to_dict(), qformer_config.to_dict()) + + @slow + def test_model_from_pretrained(self): + model_name = "Salesforce" + model = InstructBlipVideoForConditionalGeneration.from_pretrained(model_name) + self.assertIsNotNone(model) + + 
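The composite-config round trip that `test_load_vision_qformer_text_config` relies on can be summarised in a few lines; this is a sketch with arbitrary small sizes, where the OPT text config simply mirrors what the decoder-only model tester above builds, not part of the test suite itself.

```python
import tempfile

from transformers import (
    InstructBlipVideoConfig,
    InstructBlipVideoQFormerConfig,
    InstructBlipVideoVisionConfig,
    OPTConfig,
)

# Tiny, hypothetical sizes - just enough to exercise the composite config.
vision = InstructBlipVideoVisionConfig(hidden_size=32, num_hidden_layers=2, num_attention_heads=4, intermediate_size=37)
qformer = InstructBlipVideoQFormerConfig(hidden_size=32, num_hidden_layers=2, num_attention_heads=4, intermediate_size=37)
text = OPTConfig(hidden_size=16, num_hidden_layers=2, num_attention_heads=4, ffn_dim=4)

config = InstructBlipVideoConfig.from_vision_qformer_text_configs(
    vision_config=vision, qformer_config=qformer, text_config=text, num_query_tokens=10
)

with tempfile.TemporaryDirectory() as tmp:
    config.save_pretrained(tmp)
    # The sub-configs can be re-read directly from the saved composite config, which is
    # exactly what the test above asserts.
    reloaded_vision = InstructBlipVideoVisionConfig.from_pretrained(tmp)
    assert reloaded_vision.to_dict() == config.vision_config.to_dict()
```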
+# We will verify our results on an image of cute cats +def prepare_img(): + url = "https://huggingface.co/hf-internal-testing/blip-test-image/resolve/main/demo.jpg" + image = Image.open(requests.get(url, stream=True).raw) + return image + + +@require_vision +@require_torch +@slow +class InstructBlipVideoModelIntegrationTest(unittest.TestCase): + @require_bitsandbytes + @require_accelerate + def test_inference_vicuna_7b(self): + processor = InstructBlipProcessor.from_pretrained("Salesforce/instructblipvideo-vicuna-7b") + model = InstructBlipVideoForConditionalGeneration.from_pretrained( + "Salesforce/instructblipvideo-vicuna-7b", load_in_8bit=True, low_cpu_mem_usage=True + ) + + url = "https://raw.githubusercontent.com/salesforce/LAVIS/main/docs/_static/Confusing-Pictures.jpg" + image = Image.open(requests.get(url, stream=True).raw).convert("RGB") + prompt = "What is unusual about this image?" + inputs = processor(images=image, text=prompt, return_tensors="pt").to(torch_device, torch.float16) + + # verify logits + with torch.no_grad(): + logits = model(**inputs).logits + + expected_slice = torch.tensor( + [[-3.3926, -12.2969, 8.4922], [-5.0195, -11.9531, 8.1406], [-4.0039, -13.3594, 9.2578]], + device=torch_device, + ) + + self.assertTrue(torch.allclose(logits[0, :3, :3].float(), expected_slice, atol=1e-3)) + + # verify generation + outputs = model.generate(**inputs, max_new_tokens=30) + generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip() + + expected_outputs = [2, 450, 22910, 9565, 310, 445, 1967, 338, 393, 263, 767, 338, 13977, 292, 22095, 373, 278, 1250, 310, 263, 13328, 20134, 29963, 1550, 372, 338, 19500, 1623, 263, 19587, 4272] # fmt: off + + self.assertEqual(outputs[0].tolist(), expected_outputs) + self.assertEqual( + generated_text, + "The unusual aspect of this image is that a man is ironing clothes on the back of a yellow SUV while it is driving down a busy city", + ) + + def test_inference_flant5_xl(self): + processor = InstructBlipProcessor.from_pretrained("Salesforce") + model = InstructBlipVideoForConditionalGeneration.from_pretrained( + "Salesforce", + torch_dtype=torch.bfloat16, + low_cpu_mem_usage=True, + ).to(torch_device) + + url = "https://raw.githubusercontent.com/salesforce/LAVIS/main/docs/_static/Confusing-Pictures.jpg" + image = Image.open(requests.get(url, stream=True).raw).convert("RGB") + prompt = "What is unusual about this image?" 
+ inputs = processor(images=image, text=prompt, return_tensors="pt").to(torch_device) + + for k, v in inputs.items(): + if torch.is_floating_point(v): + inputs[k] = v.to(torch.bfloat16) + + outputs = model.generate( + **inputs, + do_sample=False, + num_beams=5, + max_length=256, + min_length=1, + top_p=0.9, + repetition_penalty=1.5, + length_penalty=1.0, + temperature=1, + ) + generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0] + + expected_outputs = [0, 37, 1023, 9850, 7, 3, 9, 388, 3575, 53, 4954, 30, 8, 223, 13, 3, 9, 4459, 4049, 16, 8, 2214, 13, 3, 9, 3164, 690, 2815, 5, 37, 388, 19, 5119, 3, 9, 4459, 8677, 28, 3, 9, 2756, 4459, 6177, 6, 11, 3, 88, 19, 338, 46, 3575, 53, 1476, 12, 743, 112, 2491, 5, 37, 1023, 19, 7225, 788, 12, 8, 685, 24, 34, 1267, 3, 9, 388, 3575, 53, 4954, 30, 8, 223, 13, 3, 9, 4049, 16, 8, 2214, 13, 3, 9, 3164, 690, 2815, 5, 94, 19, 487, 24, 8, 388, 19, 1119, 12, 1097, 540, 57, 692, 112, 10428, 30, 8, 223, 13, 8, 4049, 6, 68, 34, 19, 92, 487, 24, 3, 88, 19, 1119, 12, 1097, 97, 57, 692, 112, 10428, 30, 8, 223, 13, 8, 4049, 16, 8, 2214, 13, 3, 9, 3164, 690, 2815, 5, 3, 13865, 13, 8, 1053, 21, 8, 388, 31, 7, 2874, 6, 34, 19, 964, 24, 3, 88, 19, 1119, 12, 1097, 97, 57, 692, 112, 10428, 30, 8, 223, 13, 8, 4049, 16, 8, 2214, 13, 3, 9, 3164, 690, 2815, 5, 1] # fmt: skip + + expected_outputs = [0, 37, 7225, 1023, 9850, 7, 3, 9, 388, 3575, 53, 4954, 30, 8, 223, 13, 3, 9, 4459, 4049, 16, 8, 2214, 13, 3, 9, 3164, 690, 2815, 5, 37, 388, 19, 5119, 3, 9, 4459, 8677, 28, 46, 3575, 53, 1476, 5223, 12, 34, 6, 15495, 24, 3, 88, 19, 692, 112, 293, 10428, 44, 234, 1066, 145, 338, 3, 9, 50, 1106, 3522, 144, 42, 2192, 7919, 31, 7, 5, 37, 1023, 92, 1267, 3, 9, 381, 13, 119, 3203, 16, 8, 2458, 6, 379, 14264, 6, 9256, 7, 6, 11, 11718, 7, 5, 1] # fmt: skip + + self.assertEqual(outputs[0].tolist(), expected_outputs) + self.assertEqual( + generated_text, + "The unusual image depicts a man ironing clothes on the back of a yellow van in the middle of a busy city street. The man is wearing a yellow shirt with an ironing board attached to it, suggesting that he is doing his own laundry at home rather than using a laundromat or dry cleaner's. The image also shows a number of other vehicles in the background, including buses, taxis, and motorcycles.", + ) + + def test_inference_interpolate_pos_encoding(self): + processor = InstructBlipProcessor.from_pretrained("Salesforce") + model = InstructBlipVideoForConditionalGeneration.from_pretrained( + "Salesforce", + torch_dtype=torch.bfloat16, + low_cpu_mem_usage=True, + ).to(torch_device) + processor.image_processor.size = {"height": 500, "width": 500} + + image = prepare_img() + prompt = "What's in the image?" 
+ inputs = processor(images=image, text=prompt, return_tensors="pt").to(torch_device) + + predictions = model.generate(**inputs, interpolate_pos_encoding=True) + generated_text = processor.batch_decode(predictions, skip_special_tokens=True)[0].strip() + + self.assertEqual( + predictions[0].tolist(), [0, 37, 1023, 753, 3, 9, 2335, 3823, 30, 8, 2608, 28, 3, 9, 1782, 5, 1] + ) + self.assertEqual(generated_text, "The image features a woman sitting on the beach with a dog.") diff --git a/utils/diff_model_converter.py b/utils/diff_model_converter.py index b37ad74cb1b034..3317b0acdde23c 100644 --- a/utils/diff_model_converter.py +++ b/utils/diff_model_converter.py @@ -173,7 +173,7 @@ class ReplaceNameTransformer(m.MatcherDecoratableTransformer): - LLaMa -> MyNewModel abd MyNewModel -> Llama """ - def __init__(self, old_name, new_name): + def __init__(self, old_name, new_name, given_old_name=None, given_new_name=None): super().__init__() self.old_name = old_name self.new_name = new_name @@ -183,6 +183,8 @@ def __init__(self, old_name, new_name): old_name.upper(): new_name.upper(), "".join(x.title() for x in old_name.split("_")): self.default_name, } + if given_old_name is not None and given_new_name is not None and given_old_name not in self.patterns: + self.patterns[given_old_name] = given_new_name def preserve_case_replace(self, text): # Create a regex pattern to match all variations @@ -201,9 +203,9 @@ def replace_name(self, original_node, updated_node): return updated_node.with_changes(value=update) -def find_classes_in_file(module: cst.Module, old_id="llama", new_id="gemma"): +def find_classes_in_file(module: cst.Module, old_id="llama", new_id="gemma", given_old_name=None, given_new_name=None): """Helper function to rename and then parse a source file using the ClassFinder""" - transformer = ReplaceNameTransformer(old_id, new_id) + transformer = ReplaceNameTransformer(old_id, new_id, given_old_name, given_new_name) new_module = module.visit(transformer) wrapper = MetadataWrapper(new_module) @@ -356,11 +358,13 @@ def replace_call_to_super(class_finder: ClassFinder, updated_node: cst.ClassDef, class DiffConverterTransformer(CSTTransformer): METADATA_DEPENDENCIES = (ParentNodeProvider, ScopeProvider, PositionProvider) - def __init__(self, python_module, new_name): + def __init__(self, python_module, new_name, given_old_name=None, given_new_name=None): super().__init__() self.model_name = ( new_name # name of the model being defined. 
Should be in the format of `llama` or `layout_xlm` our `phi3` ) + self.given_old_name = given_old_name + self.given_new_name = given_new_name # fmt: off self.python_module = python_module # we store the original module to use `code_for_node` self.transformers_imports = {} # maps the imports name like "from transformers.models.xxx" to the parsed AST module @@ -426,6 +430,7 @@ def leave_SimpleStatementLine(self, original_node, updated_node): "insert_idx": self.global_scope_index, "node": updated_node, } + self.config_body = [updated_node] return updated_node def leave_ClassDef(self, original_node, updated_node): @@ -459,7 +464,11 @@ def leave_ClassDef(self, original_node, updated_node): if super_file_name not in self.visited_module: # only extract classes once class_finder = find_classes_in_file( - self.transformers_imports[super_file_name], model_name, self.model_name + self.transformers_imports[super_file_name], + model_name, + self.model_name, + self.given_old_name, + self.given_new_name, ) self.visited_module[super_file_name] = class_finder else: # we are re-using the previously parsed data @@ -474,7 +483,7 @@ def leave_ClassDef(self, original_node, updated_node): start_insert_idx = self.global_scope_index for dependency, _ in list_dependencies: node = class_finder.global_nodes.get(dependency, None) - if node is not None: + if node is not None and "Config" not in class_name: if dependency not in self.new_body: start_insert_idx -= 1 self.new_body[dependency] = {"insert_idx": start_insert_idx, "node": node} @@ -485,7 +494,7 @@ def leave_ClassDef(self, original_node, updated_node): if len(list_dependencies) > 0: updated_node = replace_call_to_super(class_finder, updated_node, class_name) if "Config" in class_name: - self.config_body = [updated_node] + self.config_body += [updated_node] else: self.new_body[class_name] = {"insert_idx": self.global_scope_index, "node": updated_node} return updated_node @@ -516,7 +525,7 @@ def leave_Module(self, original_node: cst.Assign, node): return node.with_changes(body=[*new_body]) -def convert_file(diff_file, cst_transformers=None): +def convert_file(diff_file, old_model_name=None, new_model_name=None, cst_transformers=None): model_name = re.search(r"diff_(.*)(?=\.py$)", diff_file).groups()[0] # Parse the Python file with open(diff_file, "r") as file: @@ -524,7 +533,7 @@ def convert_file(diff_file, cst_transformers=None): module = cst.parse_module(code) wrapper = MetadataWrapper(module) if cst_transformers is None: - cst_transformers = DiffConverterTransformer(module, model_name) + cst_transformers = DiffConverterTransformer(module, model_name, old_model_name, new_model_name) new_mod = wrapper.visit(cst_transformers) ruffed_code = run_ruff(new_mod.code, True) formatted_code = run_ruff(ruffed_code, False) @@ -551,10 +560,20 @@ def convert_file(diff_file, cst_transformers=None): nargs="+", help="A list of `diff_xxxx` files that should be converted to single model file", ) + parser.add_argument( + "--old_model_name", + required=False, + help="The name of the model from which the copying is done in CamelCase. If not provided is inferred from diff-file", + ) + parser.add_argument( + "--new_model_name", + required=False, + help="The name of the new model being added in CamelCase. 
If not provided is inferred from diff-file", + ) args = parser.parse_args() if args.files_to_parse == ["all"]: args.files_to_parse = glob.glob("src/transformers/models/**/diff_*.py", recursive=True) for file_name in args.files_to_parse: print(f"Converting {file_name} to a single model single file format") module_path = file_name.replace("/", ".").replace(".py", "").replace("src.", "") - converter = convert_file(file_name) + converter = convert_file(file_name, args.old_model_name, args.new_model_name) From bf7de9888fcc56ef210b2019cfdfe21037cb5071 Mon Sep 17 00:00:00 2001 From: raushan Date: Wed, 12 Jun 2024 09:53:21 +0200 Subject: [PATCH 02/16] add docs --- docs/source/en/model_doc/instructblipvideo.md | 29 ++- .../image_processing_instructblipvideo.py | 8 +- ...test_image_processing_instrictblipvideo.py | 190 ++++++++++++++++++ .../test_modeling_instructblipvideo.py | 126 ++++-------- 4 files changed, 256 insertions(+), 97 deletions(-) create mode 100644 tests/models/instructblipvideo/test_image_processing_instrictblipvideo.py diff --git a/docs/source/en/model_doc/instructblipvideo.md b/docs/source/en/model_doc/instructblipvideo.md index 97ba23fc29cf7d..d28dea5c10acb5 100644 --- a/docs/source/en/model_doc/instructblipvideo.md +++ b/docs/source/en/model_doc/instructblipvideo.md @@ -14,20 +14,26 @@ specific language governing permissions and limitations under the License. ## Overview -The InstructBlipVideo model was proposed in []() by . - +## Overview + +The InstructBLIPVideo is an extension of the models proposed in [InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning](https://arxiv.org/abs/2305.06500) by Wenliang Dai, Junnan Li, Dongxu Li, Anthony Meng Huat Tiong, Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, Steven Hoi. +InstructBLIPVideo uses the same architecture as [InstructBLIP](instructblip) and works with the same checkpoints as [InstructBLIP](instructblip). The only difference is the ability to process videos. The abstract from the paper is the following: -** +*General-purpose language models that can solve various language-domain tasks have emerged driven by the pre-training and instruction-tuning pipeline. However, building general-purpose vision-language models is challenging due to the increased task discrepancy introduced by the additional visual input. Although vision-language pre-training has been widely studied, vision-language instruction tuning remains relatively less explored. In this paper, we conduct a systematic and comprehensive study on vision-language instruction tuning based on the pre-trained BLIP-2 models. We gather a wide variety of 26 publicly available datasets, transform them into instruction tuning format and categorize them into two clusters for held-in instruction tuning and held-out zero-shot evaluation. Additionally, we introduce instruction-aware visual feature extraction, a crucial method that enables the model to extract informative features tailored to the given instruction. The resulting InstructBLIP models achieve state-of-the-art zero-shot performance across all 13 held-out datasets, substantially outperforming BLIP-2 and the larger Flamingo. Our models also lead to state-of-the-art performance when finetuned on individual downstream tasks (e.g., 90.7% accuracy on ScienceQA IMG). Furthermore, we qualitatively demonstrate the advantages of InstructBLIP over concurrent multimodal models.* -Tips: + - + InstructBLIPVideo architecture. Taken from the original paper. 
-This model was contributed by [INSERT YOUR HF USERNAME HERE](https://huggingface.co/). -The original code can be found [here](). +This model was contributed by [RaushanTurganbay](https://huggingface.co/RaushanTurganbay). +The original code can be found [here](https://github.com/salesforce/LAVIS/tree/main/projects/instructblip). +## Usage tips + +- The model was trained by sampling 4 frames per video, so it's recommended to sample 4 frames ## InstructBlipVideoConfig @@ -42,6 +48,15 @@ The original code can be found [here](). [[autodoc]] InstructBlipVideoQFormerConfig +## InstructBlipProcessor + +[[autodoc]] InstructBlipProcessor + +## InstructBlipImageProcessor + +[[autodoc]] InstructBlipImageProcessor + - preprocess + ## InstructBlipVideoVisionModel [[autodoc]] InstructBlipVideoVisionModel diff --git a/src/transformers/models/instructblipvideo/image_processing_instructblipvideo.py b/src/transformers/models/instructblipvideo/image_processing_instructblipvideo.py index f42abc66a4f1f6..69f2feebd39c69 100644 --- a/src/transformers/models/instructblipvideo/image_processing_instructblipvideo.py +++ b/src/transformers/models/instructblipvideo/image_processing_instructblipvideo.py @@ -52,8 +52,11 @@ def make_batched_videos(videos) -> List[VideoInput]: if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): return videos - elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]) and len(videos[0].shape) == 4: - return [list(video) for video in videos] + elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]): + if isinstance(videos[0], PIL.Image.Image): + return [videos] + elif len(videos[0].shape) == 4: + return [list(video) for video in videos] elif is_valid_image(videos) and len(videos.shape) == 4: return [list(videos)] @@ -180,6 +183,7 @@ def resize( size = get_size_dict(size) if "height" not in size or "width" not in size: raise ValueError(f"The `size` dictionary must contain the keys `height` and `width`. Got {size.keys()}") + output_size = (size["height"], size["width"]) return resize( image, diff --git a/tests/models/instructblipvideo/test_image_processing_instrictblipvideo.py b/tests/models/instructblipvideo/test_image_processing_instrictblipvideo.py new file mode 100644 index 00000000000000..df29234c7ff34d --- /dev/null +++ b/tests/models/instructblipvideo/test_image_processing_instrictblipvideo.py @@ -0,0 +1,190 @@ +# coding=utf-8 +# Copyright 2024 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import numpy as np + +from transformers.image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD +from transformers.testing_utils import require_torch, require_vision +from transformers.utils import is_torch_available, is_vision_available + +from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs + + +if is_torch_available(): + import torch + +if is_vision_available(): + from PIL import Image + + from transformers import InstructBlipVideoImageProcessor + + +class InstructBlipVideoProcessingTester(unittest.TestCase): + def __init__( + self, + parent, + batch_size=5, + num_channels=3, + image_size=24, + min_resolution=30, + max_resolution=80, + do_resize=True, + size=None, + do_normalize=True, + image_mean=OPENAI_CLIP_MEAN, + image_std=OPENAI_CLIP_STD, + do_convert_rgb=True, + frames=4, + ): + size = size if size is not None else {"height": 18, "width": 18} + self.parent = parent + self.batch_size = batch_size + self.num_channels = num_channels + self.image_size = image_size + self.min_resolution = min_resolution + self.max_resolution = max_resolution + self.do_resize = do_resize + self.size = size + self.do_normalize = do_normalize + self.image_mean = image_mean + self.image_std = image_std + self.do_convert_rgb = do_convert_rgb + self.frames = frames + + def prepare_image_processor_dict(self): + return { + "do_resize": self.do_resize, + "size": self.size, + "do_normalize": self.do_normalize, + "image_mean": self.image_mean, + "image_std": self.image_std, + "do_convert_rgb": self.do_convert_rgb, + } + + def expected_output_image_shape(self, images): + return self.frames, self.num_channels, self.crop_size["height"], self.crop_size["width"] + + def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False): + images = prepare_image_inputs( + batch_size=self.batch_size, + num_channels=self.num_channels, + min_resolution=self.min_resolution, + max_resolution=self.max_resolution, + equal_resolution=equal_resolution, + numpify=numpify, + torchify=torchify, + ) + + # let's simply copy the frames to fake a long video-clip + if numpify or torchify: + videos = [] + for image in images: + if numpify: + video = image[None, ...].repeat(self.frames, 0) + else: + video = image[None, ...].repeat(self.frames, 1, 1, 1) + videos.append(video) + else: + videos = [] + for pil_image in images: + videos.append([pil_image] * self.frames) + + return videos + + +@require_torch +@require_vision +class InstructBlipVideoProcessingTest(ImageProcessingTestMixin, unittest.TestCase): + image_processing_class = InstructBlipVideoImageProcessor if is_vision_available() else None + + def setUp(self): + self.image_processor_tester = InstructBlipVideoProcessingTester(self) + + @property + # Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTest.image_processor_dict + def image_processor_dict(self): + return self.image_processor_tester.prepare_image_processor_dict() + + def test_image_processor_properties(self): + image_processing = self.image_processing_class(**self.image_processor_dict) + self.assertTrue(hasattr(image_processing, "do_resize")) + self.assertTrue(hasattr(image_processing, "size")) + self.assertTrue(hasattr(image_processing, "do_normalize")) + self.assertTrue(hasattr(image_processing, "image_mean")) + self.assertTrue(hasattr(image_processing, "image_std")) + self.assertTrue(hasattr(image_processing, "do_convert_rgb")) + + def test_image_processor_from_dict_with_kwargs(self): + image_processor = 
self.image_processing_class.from_dict(self.image_processor_dict) + self.assertEqual(image_processor.size, {"height": 18, "width": 18}) + + image_processor = self.image_processing_class.from_dict(self.image_processor_dict, size=42) + self.assertEqual(image_processor.size, {"height": 42, "width": 42}) + + def test_call_pil(self): + # Initialize image_processing + image_processing = self.image_processing_class(**self.image_processor_dict) + # create random numpy tensors + video_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True) + for video in video_inputs: + self.assertIsInstance(video[0], Image.Image) + + # Test not batched input (pass as `videos` arg to test that ImageProcessor can handle videos in absence of images!) + encoded_videos = image_processing(images=video_inputs[0], return_tensors="pt").pixel_values + expected_output_video_shape = (1, 4, 3, 18, 18) + self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape) + + # Test batched + encoded_videos = image_processing(images=video_inputs, return_tensors="pt").pixel_values + expected_output_video_shape = (5, 4, 3, 18, 18) + self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape) + + def test_call_numpy(self): + # Initialize image_processing + image_processing = self.image_processing_class(**self.image_processor_dict) + # create random numpy tensors + video_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True, numpify=True) + for video in video_inputs: + self.assertIsInstance(video, np.ndarray) + + # Test not batched input (pass as `videos` arg to test that ImageProcessor can handle videos in absence of images!) + encoded_videos = image_processing(images=video_inputs[0], return_tensors="pt").pixel_values + expected_output_video_shape = (1, 4, 3, 18, 18) + self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape) + + # Test batched + encoded_videos = image_processing(images=video_inputs, return_tensors="pt").pixel_values + expected_output_video_shape = (5, 4, 3, 18, 18) + self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape) + + def test_call_pytorch(self): + # Initialize image_processing + image_processing = self.image_processing_class(**self.image_processor_dict) + # create random PyTorch tensors + video_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True, torchify=True) + for video in video_inputs: + self.assertIsInstance(video, torch.Tensor) + + # Test not batched input + encoded_videos = image_processing(images=video_inputs[0], return_tensors="pt").pixel_values + expected_output_video_shape = (1, 4, 3, 18, 18) + self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape) + + # Test batched + encoded_videos = image_processing(images=video_inputs, return_tensors="pt").pixel_values + expected_output_video_shape = (5, 4, 3, 18, 18) + self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape) diff --git a/tests/models/instructblipvideo/test_modeling_instructblipvideo.py b/tests/models/instructblipvideo/test_modeling_instructblipvideo.py index d55c5d5d30ff32..6c0d93ff554917 100644 --- a/tests/models/instructblipvideo/test_modeling_instructblipvideo.py +++ b/tests/models/instructblipvideo/test_modeling_instructblipvideo.py @@ -19,12 +19,12 @@ import unittest import numpy as np -import requests +from huggingface_hub import hf_hub_download from transformers import ( CONFIG_MAPPING, - InstructBlipProcessor, InstructBlipVideoConfig, + 
InstructBlipVideoProcessor, InstructBlipVideoQFormerConfig, InstructBlipVideoVisionConfig, ) @@ -56,7 +56,7 @@ if is_vision_available(): - from PIL import Image + pass class InstructBlipVideoVisionModelTester: @@ -65,6 +65,7 @@ def __init__( parent, batch_size=12, image_size=30, + frames=4, patch_size=2, num_channels=3, is_training=True, @@ -81,6 +82,7 @@ def __init__( self.parent = parent self.batch_size = batch_size self.image_size = image_size + self.frames = frames self.patch_size = patch_size self.num_channels = num_channels self.is_training = is_training @@ -99,7 +101,9 @@ def __init__( self.seq_length = num_patches + 1 def prepare_config_and_inputs(self): - pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + pixel_values = floats_tensor( + [self.batch_size * self.frames, self.num_channels, self.image_size, self.image_size] + ) config = self.get_config() return config, pixel_values @@ -129,8 +133,10 @@ def create_and_check_model(self, config, pixel_values): image_size = (self.image_size, self.image_size) patch_size = (self.patch_size, self.patch_size) num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, num_patches + 1, self.hidden_size)) - self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) + self.parent.assertEqual( + result.last_hidden_state.shape, (self.batch_size * self.frames, num_patches + 1, self.hidden_size) + ) + self.parent.assertEqual(result.pooler_output.shape, (self.batch_size * self.frames, self.hidden_size)) def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() @@ -165,6 +171,10 @@ def test_config(self): def test_inputs_embeds(self): pass + @unittest.skip(reason="InstructBlipVideo's vision encoder is an nn.Embeddings layer") + def test_model_get_set_embeddings(self): + pass + def test_model_common_attributes(self): config, _ = self.model_tester.prepare_config_and_inputs_for_common() @@ -323,7 +333,7 @@ def __init__( hidden_act="gelu", hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1, - max_position_embeddings=20, + max_position_embeddings=100, eos_token_id=2, pad_token_id=1, bos_token_id=0, @@ -409,6 +419,9 @@ def prepare_config_and_inputs(self): _, pixel_values = self.vision_model_tester.prepare_config_and_inputs() _, _, _, qformer_input_ids, qformer_attention_mask = self.qformer_model_tester.prepare_config_and_inputs() _, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs() + frames = self.vision_model_tester.frames + _, c, h, w = pixel_values.shape + pixel_values = pixel_values.reshape(-1, frames, c, h, w) config = self.get_config() @@ -435,7 +448,9 @@ def create_and_check_for_conditional_generation( qformer_attention_mask=qformer_attention_mask, ) - expected_seq_length = self.num_query_tokens + self.text_model_tester.seq_length + expected_seq_length = ( + self.num_query_tokens * self.vision_model_tester.frames + ) + self.text_model_tester.seq_length self.parent.assertEqual( result.logits.shape, (self.vision_model_tester.batch_size, expected_seq_length, self.text_model_tester.vocab_size), @@ -537,109 +552,44 @@ def test_model_from_pretrained(self): # We will verify our results on an image of cute cats -def prepare_img(): - url = "https://huggingface.co/hf-internal-testing/blip-test-image/resolve/main/demo.jpg" - image = Image.open(requests.get(url, stream=True).raw) - return image +def 
prepare_video(): + video_file = hf_hub_download( + repo_id="raushan-testing-hf/videos-test", filename="video_demo.npy", repo_type="dataset" + ) + video = np.load(video_file)[::2] # sample every 2nd frame to get 4 frames total + return video @require_vision @require_torch +@require_bitsandbytes +@require_accelerate @slow class InstructBlipVideoModelIntegrationTest(unittest.TestCase): - @require_bitsandbytes - @require_accelerate def test_inference_vicuna_7b(self): - processor = InstructBlipProcessor.from_pretrained("Salesforce/instructblipvideo-vicuna-7b") + processor = InstructBlipVideoProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b") model = InstructBlipVideoForConditionalGeneration.from_pretrained( - "Salesforce/instructblipvideo-vicuna-7b", load_in_8bit=True, low_cpu_mem_usage=True + "Salesforce/instructblip-vicuna-7b", load_in_8bit=True, low_cpu_mem_usage=True ) - url = "https://raw.githubusercontent.com/salesforce/LAVIS/main/docs/_static/Confusing-Pictures.jpg" - image = Image.open(requests.get(url, stream=True).raw).convert("RGB") - prompt = "What is unusual about this image?" - inputs = processor(images=image, text=prompt, return_tensors="pt").to(torch_device, torch.float16) + clip = prepare_video() + prompt = "Explain what is happening in this short video." + inputs = processor(images=clip, text=prompt, return_tensors="pt").to(torch_device, torch.float16) # verify logits with torch.no_grad(): logits = model(**inputs).logits expected_slice = torch.tensor( - [[-3.3926, -12.2969, 8.4922], [-5.0195, -11.9531, 8.1406], [-4.0039, -13.3594, 9.2578]], + [[-3.3203, -11.7266, 9.7266], [-5.3242, -12.8125, 10.6328], [-3.7109, -13.2422, 10.3516]], device=torch_device, ) - self.assertTrue(torch.allclose(logits[0, :3, :3].float(), expected_slice, atol=1e-3)) # verify generation outputs = model.generate(**inputs, max_new_tokens=30) generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip() - - expected_outputs = [2, 450, 22910, 9565, 310, 445, 1967, 338, 393, 263, 767, 338, 13977, 292, 22095, 373, 278, 1250, 310, 263, 13328, 20134, 29963, 1550, 372, 338, 19500, 1623, 263, 19587, 4272] # fmt: off - - self.assertEqual(outputs[0].tolist(), expected_outputs) self.assertEqual( generated_text, - "The unusual aspect of this image is that a man is ironing clothes on the back of a yellow SUV while it is driving down a busy city", - ) - - def test_inference_flant5_xl(self): - processor = InstructBlipProcessor.from_pretrained("Salesforce") - model = InstructBlipVideoForConditionalGeneration.from_pretrained( - "Salesforce", - torch_dtype=torch.bfloat16, - low_cpu_mem_usage=True, - ).to(torch_device) - - url = "https://raw.githubusercontent.com/salesforce/LAVIS/main/docs/_static/Confusing-Pictures.jpg" - image = Image.open(requests.get(url, stream=True).raw).convert("RGB") - prompt = "What is unusual about this image?" 
- inputs = processor(images=image, text=prompt, return_tensors="pt").to(torch_device) - - for k, v in inputs.items(): - if torch.is_floating_point(v): - inputs[k] = v.to(torch.bfloat16) - - outputs = model.generate( - **inputs, - do_sample=False, - num_beams=5, - max_length=256, - min_length=1, - top_p=0.9, - repetition_penalty=1.5, - length_penalty=1.0, - temperature=1, - ) - generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0] - - expected_outputs = [0, 37, 1023, 9850, 7, 3, 9, 388, 3575, 53, 4954, 30, 8, 223, 13, 3, 9, 4459, 4049, 16, 8, 2214, 13, 3, 9, 3164, 690, 2815, 5, 37, 388, 19, 5119, 3, 9, 4459, 8677, 28, 3, 9, 2756, 4459, 6177, 6, 11, 3, 88, 19, 338, 46, 3575, 53, 1476, 12, 743, 112, 2491, 5, 37, 1023, 19, 7225, 788, 12, 8, 685, 24, 34, 1267, 3, 9, 388, 3575, 53, 4954, 30, 8, 223, 13, 3, 9, 4049, 16, 8, 2214, 13, 3, 9, 3164, 690, 2815, 5, 94, 19, 487, 24, 8, 388, 19, 1119, 12, 1097, 540, 57, 692, 112, 10428, 30, 8, 223, 13, 8, 4049, 6, 68, 34, 19, 92, 487, 24, 3, 88, 19, 1119, 12, 1097, 97, 57, 692, 112, 10428, 30, 8, 223, 13, 8, 4049, 16, 8, 2214, 13, 3, 9, 3164, 690, 2815, 5, 3, 13865, 13, 8, 1053, 21, 8, 388, 31, 7, 2874, 6, 34, 19, 964, 24, 3, 88, 19, 1119, 12, 1097, 97, 57, 692, 112, 10428, 30, 8, 223, 13, 8, 4049, 16, 8, 2214, 13, 3, 9, 3164, 690, 2815, 5, 1] # fmt: skip - - expected_outputs = [0, 37, 7225, 1023, 9850, 7, 3, 9, 388, 3575, 53, 4954, 30, 8, 223, 13, 3, 9, 4459, 4049, 16, 8, 2214, 13, 3, 9, 3164, 690, 2815, 5, 37, 388, 19, 5119, 3, 9, 4459, 8677, 28, 46, 3575, 53, 1476, 5223, 12, 34, 6, 15495, 24, 3, 88, 19, 692, 112, 293, 10428, 44, 234, 1066, 145, 338, 3, 9, 50, 1106, 3522, 144, 42, 2192, 7919, 31, 7, 5, 37, 1023, 92, 1267, 3, 9, 381, 13, 119, 3203, 16, 8, 2458, 6, 379, 14264, 6, 9256, 7, 6, 11, 11718, 7, 5, 1] # fmt: skip - - self.assertEqual(outputs[0].tolist(), expected_outputs) - self.assertEqual( - generated_text, - "The unusual image depicts a man ironing clothes on the back of a yellow van in the middle of a busy city street. The man is wearing a yellow shirt with an ironing board attached to it, suggesting that he is doing his own laundry at home rather than using a laundromat or dry cleaner's. The image also shows a number of other vehicles in the background, including buses, taxis, and motorcycles.", - ) - - def test_inference_interpolate_pos_encoding(self): - processor = InstructBlipProcessor.from_pretrained("Salesforce") - model = InstructBlipVideoForConditionalGeneration.from_pretrained( - "Salesforce", - torch_dtype=torch.bfloat16, - low_cpu_mem_usage=True, - ).to(torch_device) - processor.image_processor.size = {"height": 500, "width": 500} - - image = prepare_img() - prompt = "What's in the image?" 
- inputs = processor(images=image, text=prompt, return_tensors="pt").to(torch_device) - - predictions = model.generate(**inputs, interpolate_pos_encoding=True) - generated_text = processor.batch_decode(predictions, skip_special_tokens=True)[0].strip() - - self.assertEqual( - predictions[0].tolist(), [0, 37, 1023, 753, 3, 9, 2335, 3823, 30, 8, 2608, 28, 3, 9, 1782, 5, 1] + "a baby girl wearing glasses is reading a book on the bed 1080p", ) - self.assertEqual(generated_text, "The image features a woman sitting on the beach with a dog.") From 6331dcb4214c063a6e109b55a081b93d4c988245 Mon Sep 17 00:00:00 2001 From: raushan Date: Wed, 12 Jun 2024 09:56:27 +0200 Subject: [PATCH 03/16] dummy obj --- src/transformers/utils/dummy_pt_objects.py | 28 ++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 11a16562263d45..4b92799549144f 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -4811,6 +4811,34 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +class JetMoeForCausalLM(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class JetMoeForSequenceClassification(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class JetMoeModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class JetMoePreTrainedModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + class Kosmos2ForConditionalGeneration(metaclass=DummyObject): _backends = ["torch"] From 467a758f2836080a1a16ab41c9a8d96b833928ee Mon Sep 17 00:00:00 2001 From: raushan Date: Wed, 12 Jun 2024 10:23:59 +0200 Subject: [PATCH 04/16] more changes in diff converter --- .../configuration_instructblip.py | 2 ++ .../configuration_instructblipvideo.py | 2 +- .../diff_instructblipvideo.py | 32 +++++++++++++------ utils/diff_model_converter.py | 13 +++++--- 4 files changed, 35 insertions(+), 14 deletions(-) diff --git a/src/transformers/models/instructblip/configuration_instructblip.py b/src/transformers/models/instructblip/configuration_instructblip.py index 636d18b990da4a..33b2a1a15f883d 100644 --- a/src/transformers/models/instructblip/configuration_instructblip.py +++ b/src/transformers/models/instructblip/configuration_instructblip.py @@ -164,6 +164,8 @@ class InstructBlipQFormerConfig(PretrainedConfig): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. layer_norm_eps (`float`, *optional*, defaults to 1e-12): The epsilon used by the layer normalization layers. + pad_token_id (`int`, *optional*, defaults to 0): + Token id used for padding sequences. position_embedding_type (`str`, *optional*, defaults to `"absolute"`): Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For positional embeddings use `"absolute"`. 
For more information on `"relative_key"`, please refer to diff --git a/src/transformers/models/instructblipvideo/configuration_instructblipvideo.py b/src/transformers/models/instructblipvideo/configuration_instructblipvideo.py index a57f5ff1706b02..32c4a34cf7a632 100644 --- a/src/transformers/models/instructblipvideo/configuration_instructblipvideo.py +++ b/src/transformers/models/instructblipvideo/configuration_instructblipvideo.py @@ -169,7 +169,7 @@ class InstructBlipVideoQFormerConfig(PretrainedConfig): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. layer_norm_eps (`float`, *optional*, defaults to 1e-12): The epsilon used by the layer normalization layers. - pad_token_id (``, *optional*, defaults to 0): + pad_token_id (`int`, *optional*, defaults to 0): Token id used for padding sequences. position_embedding_type (`str`, *optional*, defaults to `"absolute"`): Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For diff --git a/src/transformers/models/instructblipvideo/diff_instructblipvideo.py b/src/transformers/models/instructblipvideo/diff_instructblipvideo.py index 68fc934d519099..6891395a710ae9 100644 --- a/src/transformers/models/instructblipvideo/diff_instructblipvideo.py +++ b/src/transformers/models/instructblipvideo/diff_instructblipvideo.py @@ -13,16 +13,18 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os from dataclasses import dataclass -from typing import List, Optional, Tuple, Union +from typing import Optional, Tuple, Union import torch import torch.utils.checkpoint from torch.nn import CrossEntropyLoss -from transformers.models.blip.image_processing_blip import BlipImageProcessor -from transformers.models.instructblip.configuration_instructblip import InstructBlipConfig, InstructBlipVisionConfig, InstructBlipQFormerConfig +from transformers.models.instructblip.configuration_instructblip import ( + InstructBlipConfig, + InstructBlipQFormerConfig, + InstructBlipVisionConfig, +) from transformers.models.instructblip.modeling_instructblip import ( InstructBlipAttention, InstructBlipEncoder, @@ -43,10 +45,7 @@ InstructBlipVisionModel, ) -from ...models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES -from ...configuration_utils import PretrainedConfig -from ..auto import CONFIG_MAPPING -from ...utils import TensorType, logging +from ...utils import logging logger = logging.get_logger(__name__) @@ -55,9 +54,11 @@ class InstructBlipVideoVisionConfig(InstructBlipVisionConfig): pass + class InstructBlipVideoQFormerConfig(InstructBlipQFormerConfig): pass + class InstructBlipVideoConfig(InstructBlipConfig): pass @@ -70,45 +71,59 @@ class InstructBlipVideoForConditionalGenerationModelOutput(InstructBlipForCondit class InstructBlipVideoVisionEmbeddings(InstructBlipVisionEmbeddings): pass + class InstructBlipVideoAttention(InstructBlipAttention): pass + class InstructBlipVideoMLP(InstructBlipMLP): pass + class InstructBlipVideoEncoderLayer(InstructBlipEncoderLayer): pass + class InstructBlipVideoPreTrainedModel(InstructBlipPreTrainedModel): pass + class InstructBlipVideoEncoder(InstructBlipEncoder): pass + class InstructBlipVideoVisionModel(InstructBlipVisionModel): pass + class InstructBlipVideoQFormerSelfOutput(InstructBlipQFormerSelfOutput): pass + class InstructBlipVideoQFormerAttention(InstructBlipQFormerAttention): pass + class InstructBlipVideoQFormerIntermediate(InstructBlipQFormerIntermediate): pass + 
class InstructBlipVideoQFormerOutput(InstructBlipQFormerOutput): pass + class InstructBlipVideoQFormerLayer(InstructBlipQFormerLayer): pass + class InstructBlipVideoQFormerEncoder(InstructBlipQFormerEncoder): pass + class InstructBlipVideoQFormerEmbeddings(InstructBlipQFormerEmbeddings): pass + class InstructBlipVideoQFormerModel(InstructBlipQFormerModel): pass @@ -396,4 +411,3 @@ def generate( outputs = torch.cat([bos_tokens, outputs], dim=-1) return outputs - diff --git a/utils/diff_model_converter.py b/utils/diff_model_converter.py index 3317b0acdde23c..d8efb0dc66b321 100644 --- a/utils/diff_model_converter.py +++ b/utils/diff_model_converter.py @@ -370,6 +370,7 @@ def __init__(self, python_module, new_name, given_old_name=None, given_new_name= self.transformers_imports = {} # maps the imports name like "from transformers.models.xxx" to the parsed AST module self.imported_mapping = {} # stores the name of the imported classes, with their source {"LlamaModel":"transformers.model.llama.modeling_llama"} self.visited_module = {} # modules visited like "transformers.models.llama.modeling_llama" + self.visited_module_config = {} # modules visited like "transformers.models.llama.modeling_llama" in config file, needed to not mix config vs modeling imports self.new_body = {} # store the new body, all global scope nodes should be added here self.inserted_deps = [] # nodes inserted via super dependency self.all_imports = [] # just stores all of the imports @@ -462,7 +463,8 @@ def leave_ClassDef(self, original_node, updated_node): f"Tried parsing the name of the imported package from {super_file_name}, could not extract the model name" ) - if super_file_name not in self.visited_module: # only extract classes once + visited_module = self.visited_module_config if "Config" in class_name else self.visited_module + if super_file_name not in visited_module: # only extract classes once class_finder = find_classes_in_file( self.transformers_imports[super_file_name], model_name, @@ -470,9 +472,9 @@ def leave_ClassDef(self, original_node, updated_node): self.given_old_name, self.given_new_name, ) - self.visited_module[super_file_name] = class_finder + visited_module[super_file_name] = class_finder else: # we are re-using the previously parsed data - class_finder = self.visited_module[super_file_name] + class_finder = visited_module[super_file_name] list_dependencies = { dep: class_finder.class_start_line.get(dep, 1000) @@ -512,10 +514,13 @@ def leave_If(self, original_node, node): def leave_Module(self, original_node: cst.Assign, node): imports = {self.python_module.code_for_node(k): k for k in self.all_imports} dependency_imports = {} + config_imports = [] for visiter in self.visited_module.values(): dependency_imports.update({self.python_module.code_for_node(k): k for k in visiter.imports.values()}) + for visiter in self.visited_module_config.values(): + config_imports += list(visiter.imports.values()) if hasattr(self, "config_body"): - self.config_body = list(imports.values()) + self.config_body + self.config_body = list(imports.values()) + config_imports + self.config_body dependency_imports.update(imports) new_body = list(dependency_imports.values()) if len(self.new_body.keys()) > 0: From 799ceb3c5321e1d45c170350cf4a6fa317233c4d Mon Sep 17 00:00:00 2001 From: raushan Date: Wed, 12 Jun 2024 10:30:24 +0200 Subject: [PATCH 05/16] tiny fix --- src/transformers/models/auto/configuration_auto.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index efca5cc8187a37..f803687d43cbf4 100755 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -403,7 +403,7 @@ ("imagegpt", "ImageGPT"), ("informer", "Informer"), ("instructblip", "InstructBLIP"), - ("instructblipvideo", "InstructBlipVideoVideo"), + ("instructblipvideo", "InstructBlipVideo"), ("jamba", "Jamba"), ("jetmoe", "JetMoe"), ("jukebox", "Jukebox"), From 6fd098fd45efd2b6dbde5482d77c83a5ff820cc7 Mon Sep 17 00:00:00 2001 From: raushan Date: Wed, 12 Jun 2024 10:42:31 +0200 Subject: [PATCH 06/16] make docs happy --- docs/source/en/index.md | 2 +- docs/source/en/model_doc/instructblipvideo.md | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/source/en/index.md b/docs/source/en/index.md index a3f1fce954ed5d..39c46f5c0fcedd 100644 --- a/docs/source/en/index.md +++ b/docs/source/en/index.md @@ -165,7 +165,7 @@ Flax), PyTorch, and/or TensorFlow. | [ImageGPT](model_doc/imagegpt) | ✅ | ❌ | ❌ | | [Informer](model_doc/informer) | ✅ | ❌ | ❌ | | [InstructBLIP](model_doc/instructblip) | ✅ | ❌ | ❌ | -| [InstructBlipVideo](model_doc/instructblipvideo) | ✅ | ❌ | ❌ | +| [InstructBlipVideo](model_doc/instructblipvideo) | ✅ | ❌ | ❌ | | [Jamba](model_doc/jamba) | ✅ | ❌ | ❌ | | [JetMoe](model_doc/jetmoe) | ✅ | ❌ | ❌ | | [Jukebox](model_doc/jukebox) | ✅ | ❌ | ❌ | diff --git a/docs/source/en/model_doc/instructblipvideo.md b/docs/source/en/model_doc/instructblipvideo.md index d28dea5c10acb5..aa93feb6b6dced 100644 --- a/docs/source/en/model_doc/instructblipvideo.md +++ b/docs/source/en/model_doc/instructblipvideo.md @@ -48,13 +48,13 @@ The original code can be found [here](https://github.com/salesforce/LAVIS/tree/m [[autodoc]] InstructBlipVideoQFormerConfig -## InstructBlipProcessor +## InstructBlipVideoProcessor -[[autodoc]] InstructBlipProcessor +[[autodoc]] InstructBlipVideoProcessor -## InstructBlipImageProcessor +## InstructBlipVideoImageProcessor -[[autodoc]] InstructBlipImageProcessor +[[autodoc]] InstructBlipVideoImageProcessor - preprocess ## InstructBlipVideoVisionModel From 04cf107e3f850c485adc36cae31356d96bfa3aba Mon Sep 17 00:00:00 2001 From: raushan Date: Wed, 12 Jun 2024 12:30:26 +0200 Subject: [PATCH 07/16] skip test --- utils/check_repo.py | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/check_repo.py b/utils/check_repo.py index dcb1374d8e0b9b..4eeee03d69d988 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -90,6 +90,7 @@ "RecurrentGemmaModel", # Building part of bigger (tested) model. "FuyuForCausalLM", # Not tested fort now "InstructBlipQFormerModel", # Building part of bigger (tested) model. + "InstructBlipVideoQFormerModel", # Building part of bigger (tested) model. "UMT5EncoderModel", # Building part of bigger (tested) model. "Blip2QFormerModel", # Building part of bigger (tested) model. 
"ErnieMForInformationExtraction", From 2c44b611aef9b2de271f66d68982ee48649ad89a Mon Sep 17 00:00:00 2001 From: raushan Date: Wed, 12 Jun 2024 12:52:52 +0200 Subject: [PATCH 08/16] repo consistency tests --- src/transformers/models/auto/modeling_auto.py | 1 + utils/check_repo.py | 2 ++ 2 files changed, 3 insertions(+) diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index adfcc7af9fbc88..5021517147483b 100755 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -696,6 +696,7 @@ ("git", "GitForCausalLM"), ("idefics2", "Idefics2ForConditionalGeneration"), ("instructblip", "InstructBlipForConditionalGeneration"), + ("instructblipvideo", "InstructBlipVideoForConditionalGeneration"), ("kosmos-2", "Kosmos2ForConditionalGeneration"), ("llava", "LlavaForConditionalGeneration"), ("llava_next", "LlavaNextForConditionalGeneration"), diff --git a/utils/check_repo.py b/utils/check_repo.py index 4eeee03d69d988..b16e0327b6d286 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -246,6 +246,8 @@ "GPTSw3DoubleHeadsModel", "InstructBlipVisionModel", "InstructBlipQFormerModel", + "InstructBlipVideoVisionModel", + "InstructBlipVideoQFormerModel", "LayoutLMForQuestionAnswering", "LukeForMaskedLM", "LukeForEntityClassification", From a1287a7e0abac15d3f9192a6213400e4367e8182 Mon Sep 17 00:00:00 2001 From: raushan Date: Fri, 14 Jun 2024 11:10:42 +0200 Subject: [PATCH 09/16] update docstring --- .../diff_instructblipvideo.py | 39 +++++++++++++------ .../modeling_instructblipvideo.py | 37 +++++++++++++----- 2 files changed, 55 insertions(+), 21 deletions(-) diff --git a/src/transformers/models/instructblipvideo/diff_instructblipvideo.py b/src/transformers/models/instructblipvideo/diff_instructblipvideo.py index 6891395a710ae9..beefab308f1649 100644 --- a/src/transformers/models/instructblipvideo/diff_instructblipvideo.py +++ b/src/transformers/models/instructblipvideo/diff_instructblipvideo.py @@ -158,7 +158,27 @@ def forward( >>> from transformers import InstructBlipVideoProcessor, InstructBlipVideoForConditionalGeneration >>> import torch >>> from huggingface_hub import hf_hub_download - >>> from decord import VideoReader + >>> from av + + >>> def read_video_pyav(container, indices): + ... ''' + ... Decode the video with PyAV decoder. + ... Args: + ... container (`av.container.input.InputContainer`): PyAV container. + ... indices (`List[int]`): List of frame indices to decode. + ... Returns: + ... result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3). + ... ''' + ... frames = [] + ... container.seek(0) + ... start_index = indices[0] + ... end_index = indices[-1] + ... for i, frame in enumerate(container.decode(video=0)): + ... if i > end_index: + ... break + ... if i >= start_index and i in indices: + ... frames.append(frame) + ... 
return np.stack([x.to_ndarray(format="rgb24") for x in frames])

         >>> model = InstructBlipVideoForConditionalGeneration.from_pretrained("Salesforce/instructblip-vicuna-7b", device_map="auto")
         >>> processor = InstructBlipVideoProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b")
@@ -166,25 +186,22 @@ def forward(
         >>> file_path = hf_hub_download(
                repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
         )
-
-        >>> vr = VideoReader(uri=file_path, height=224, width=224)
-        >>> start, end = 0, len(vr)
-        >>> indices = np.arange(start, end, vlen / 4).astype(int)
-        >>> frames = vr.get_batch(indices).asnumpy()
-        >>> video = list(frames)
+        >>> container = av.open(file_path)
+        >>> # sample uniformly 4 frames from the video
+        >>> total_frames = container.streams.video[0].frames
+        >>> indices = np.arange(0, total_frames, total_frames / 4).astype(int)
+        >>> clip = read_video_pyav(container, indices)
+
         >>> prompt = "What is happening in the video?"
-        >>> inputs = processor(videos=[video], text=prompt, return_tensors="pt").to(device)
+        >>> inputs = processor(videos=clip, text=prompt, return_tensors="pt").to(device)
         >>> outputs = model.generate(
         ...     **inputs,
         ...     do_sample=False,
         ...     num_beams=5,
         ...     max_length=256,
-        ...     min_length=1,
-        ...     top_p=0.9,
         ...     repetition_penalty=1.5,
         ...     length_penalty=1.0,
-        ...     temperature=1,
         ... )
         >>> generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip()
         >>> print(generated_text)
diff --git a/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py b/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py
index 08a3a1b03cbd1a..06edf0d9332125 100644
--- a/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py
+++ b/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py
@@ -1393,7 +1393,27 @@ def forward(
         >>> from transformers import InstructBlipVideoProcessor, InstructBlipVideoForConditionalGeneration
         >>> import torch
         >>> from huggingface_hub import hf_hub_download
-        >>> from decord import VideoReader
+        >>> import av
+
+        >>> def read_video_pyav(container, indices):
+        ...     '''
+        ...     Decode the video with PyAV decoder.
+        ...     Args:
+        ...         container (`av.container.input.InputContainer`): PyAV container.
+        ...         indices (`List[int]`): List of frame indices to decode.
+        ...     Returns:
+        ...         result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
+        ...     '''
+        ...     frames = []
+        ...     container.seek(0)
+        ...     start_index = indices[0]
+        ...     end_index = indices[-1]
+        ...     for i, frame in enumerate(container.decode(video=0)):
+        ...         if i > end_index:
+        ...             break
+        ...         if i >= start_index and i in indices:
+        ...             frames.append(frame)
+        ...
return np.stack([x.to_ndarray(format="rgb24") for x in frames])

         >>> model = InstructBlipVideoForConditionalGeneration.from_pretrained("Salesforce/instructblip-vicuna-7b", device_map="auto")
         >>> processor = InstructBlipVideoProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b")
@@ -1401,25 +1421,22 @@
         >>> file_path = hf_hub_download(
                repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
         )
+        >>> container = av.open(file_path)
+        >>> # sample uniformly 4 frames from the video
+        >>> total_frames = container.streams.video[0].frames
+        >>> indices = np.arange(0, total_frames, total_frames / 4).astype(int)
+        >>> clip = read_video_pyav(container, indices)
-
-        >>> vr = VideoReader(uri=file_path, height=224, width=224)
-        >>> start, end = 0, len(vr)
-        >>> indices = np.arange(start, end, vlen / 4).astype(int)
-        >>> frames = vr.get_batch(indices).asnumpy()
-        >>> video = list(frames)
         >>> prompt = "What is happening in the video?"
-        >>> inputs = processor(videos=[video], text=prompt, return_tensors="pt").to(device)
+        >>> inputs = processor(videos=clip, text=prompt, return_tensors="pt").to(device)
         >>> outputs = model.generate(
         ...     **inputs,
         ...     do_sample=False,
         ...     num_beams=5,
         ...     max_length=256,
-        ...     min_length=1,
-        ...     top_p=0.9,
         ...     repetition_penalty=1.5,
         ...     length_penalty=1.0,
-        ...     temperature=1,
         ... )
         >>> generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip()
         >>> print(generated_text)

From fc8643d500bbe4c6f6e84205efe8d5988c942029 Mon Sep 17 00:00:00 2001
From: raushan
Date: Mon, 17 Jun 2024 09:49:38 +0200
Subject: [PATCH 10/16] style

---
 .../models/instructblipvideo/diff_instructblipvideo.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/models/instructblipvideo/diff_instructblipvideo.py b/src/transformers/models/instructblipvideo/diff_instructblipvideo.py
index beefab308f1649..08bc960e041d59 100644
--- a/src/transformers/models/instructblipvideo/diff_instructblipvideo.py
+++ b/src/transformers/models/instructblipvideo/diff_instructblipvideo.py
@@ -191,7 +191,7 @@ def forward(
         >>> total_frames = container.streams.video[0].frames
         >>> indices = np.arange(0, total_frames, total_frames / 4).astype(int)
         >>> clip = read_video_pyav(container, indices)
-        
+
         >>> prompt = "What is happening in the video?"
>>> inputs = processor(videos=clip, text=prompt, return_tensors="pt").to(device) From 134a206022408e880c3a6d9f93c8e8bfa1cd56c5 Mon Sep 17 00:00:00 2001 From: raushan Date: Mon, 17 Jun 2024 10:17:24 +0200 Subject: [PATCH 11/16] fix tests --- .../test_image_processing_instrictblipvideo.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/models/instructblipvideo/test_image_processing_instrictblipvideo.py b/tests/models/instructblipvideo/test_image_processing_instrictblipvideo.py index df29234c7ff34d..d53342416d286d 100644 --- a/tests/models/instructblipvideo/test_image_processing_instrictblipvideo.py +++ b/tests/models/instructblipvideo/test_image_processing_instrictblipvideo.py @@ -76,7 +76,7 @@ def prepare_image_processor_dict(self): } def expected_output_image_shape(self, images): - return self.frames, self.num_channels, self.crop_size["height"], self.crop_size["width"] + return self.frames, self.num_channels, self.size["height"], self.size["width"] def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False): images = prepare_image_inputs( @@ -112,6 +112,7 @@ class InstructBlipVideoProcessingTest(ImageProcessingTestMixin, unittest.TestCas image_processing_class = InstructBlipVideoImageProcessor if is_vision_available() else None def setUp(self): + super().setUp() self.image_processor_tester = InstructBlipVideoProcessingTester(self) @property From cf71681bb78dea0cf2c238a0735c9564295a5d49 Mon Sep 17 00:00:00 2001 From: raushan Date: Thu, 20 Jun 2024 08:22:23 +0200 Subject: [PATCH 12/16] change diff imports --- .../configuration_instructblipvideo.py | 4 +++- utils/diff_model_converter.py | 18 ++++++++++++++---- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/src/transformers/models/instructblipvideo/configuration_instructblipvideo.py b/src/transformers/models/instructblipvideo/configuration_instructblipvideo.py index 32c4a34cf7a632..180372f35d18fe 100644 --- a/src/transformers/models/instructblipvideo/configuration_instructblipvideo.py +++ b/src/transformers/models/instructblipvideo/configuration_instructblipvideo.py @@ -24,7 +24,9 @@ from ...configuration_utils import PretrainedConfig from ...models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES -from ...utils import logging +from ...utils import ( + logging, +) from ..auto import CONFIG_MAPPING diff --git a/utils/diff_model_converter.py b/utils/diff_model_converter.py index d8efb0dc66b321..7cef41751db90a 100644 --- a/utils/diff_model_converter.py +++ b/utils/diff_model_converter.py @@ -370,7 +370,6 @@ def __init__(self, python_module, new_name, given_old_name=None, given_new_name= self.transformers_imports = {} # maps the imports name like "from transformers.models.xxx" to the parsed AST module self.imported_mapping = {} # stores the name of the imported classes, with their source {"LlamaModel":"transformers.model.llama.modeling_llama"} self.visited_module = {} # modules visited like "transformers.models.llama.modeling_llama" - self.visited_module_config = {} # modules visited like "transformers.models.llama.modeling_llama" in config file, needed to not mix config vs modeling imports self.new_body = {} # store the new body, all global scope nodes should be added here self.inserted_deps = [] # nodes inserted via super dependency self.all_imports = [] # just stores all of the imports @@ -463,7 +462,7 @@ def leave_ClassDef(self, original_node, updated_node): f"Tried parsing the name of the imported package from {super_file_name}, could not extract the model name" ) 
- visited_module = self.visited_module_config if "Config" in class_name else self.visited_module + visited_module = self.visited_module if super_file_name not in visited_module: # only extract classes once class_finder = find_classes_in_file( self.transformers_imports[super_file_name], @@ -517,8 +516,19 @@ def leave_Module(self, original_node: cst.Assign, node): config_imports = [] for visiter in self.visited_module.values(): dependency_imports.update({self.python_module.code_for_node(k): k for k in visiter.imports.values()}) - for visiter in self.visited_module_config.values(): - config_imports += list(visiter.imports.values()) + + # manually clean up if it's importing a config from configuration file (ruff doesn't do that) + config_imports = [] + for i in list(dependency_imports.values()): + if ( + hasattr(i.body[0], "module") + and isinstance(i.body[0].module, cst.Name) + and f"configuration_{self.model_name}" in i.body[0].module.value + ): + pass + else: + config_imports.append(i) + if hasattr(self, "config_body"): self.config_body = list(imports.values()) + config_imports + self.config_body dependency_imports.update(imports) From c14fe5d225fb6f2b554f76cbba70284c280b385e Mon Sep 17 00:00:00 2001 From: raushan Date: Mon, 24 Jun 2024 06:16:20 +0200 Subject: [PATCH 13/16] [run-slow] instructblipvideo From 3ab15a86ee8cd458b3ea8ffc00388efe278252b8 Mon Sep 17 00:00:00 2001 From: raushan Date: Mon, 24 Jun 2024 14:42:19 +0200 Subject: [PATCH 14/16] [run-slow] instructblipvideo From 94acdee54007f11df58fdf1ca36d56db916549ff Mon Sep 17 00:00:00 2001 From: raushan Date: Tue, 25 Jun 2024 11:24:20 +0200 Subject: [PATCH 15/16] fix tests and remove logit check --- .../models/instructblip/modeling_instructblip.py | 2 +- .../modeling_instructblipvideo.py | 2 +- .../test_modeling_instructblipvideo.py | 14 ++------------ 3 files changed, 4 insertions(+), 14 deletions(-) diff --git a/src/transformers/models/instructblip/modeling_instructblip.py b/src/transformers/models/instructblip/modeling_instructblip.py index 386b69cd3b0fca..8ad47b308fd0b6 100644 --- a/src/transformers/models/instructblip/modeling_instructblip.py +++ b/src/transformers/models/instructblip/modeling_instructblip.py @@ -324,7 +324,7 @@ def _init_weights(self, module): module.bias.data.zero_() if isinstance(module, InstructBlipVisionEmbeddings): - if hasattr(self.config, "vision_config"): + if hasattr(self.config, "vision_config") and not isinstance(self.config, InstructBlipVisionConfig): factor = self.config.vision_config.initializer_range nn.init.trunc_normal_(module.position_embedding, mean=0.0, std=factor) nn.init.trunc_normal_(module.class_embedding, mean=0.0, std=factor) diff --git a/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py b/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py index 06edf0d9332125..4bd97249efd66f 100644 --- a/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py +++ b/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py @@ -332,7 +332,7 @@ def _init_weights(self, module): module.bias.data.zero_() if isinstance(module, InstructBlipVideoVisionEmbeddings): - if hasattr(self.config, "vision_config"): + if hasattr(self.config, "vision_config") and not isinstance(self.config, InstructBlipVideoVisionConfig): factor = self.config.vision_config.initializer_range nn.init.trunc_normal_(module.position_embedding, mean=0.0, std=factor) nn.init.trunc_normal_(module.class_embedding, mean=0.0, std=factor) diff --git 
a/tests/models/instructblipvideo/test_modeling_instructblipvideo.py b/tests/models/instructblipvideo/test_modeling_instructblipvideo.py index 6c0d93ff554917..1265db3a2a2e72 100644 --- a/tests/models/instructblipvideo/test_modeling_instructblipvideo.py +++ b/tests/models/instructblipvideo/test_modeling_instructblipvideo.py @@ -234,7 +234,7 @@ def test_save_load_fast_init_to_base(self): @slow def test_model_from_pretrained(self): - model_name = "Salesforce" + model_name = "Salesforce/instructblip-vicuna-7b" model = InstructBlipVideoVisionModel.from_pretrained(model_name) self.assertIsNotNone(model) @@ -546,7 +546,7 @@ def test_load_vision_qformer_text_config(self): @slow def test_model_from_pretrained(self): - model_name = "Salesforce" + model_name = "Salesforce/instructblip-vicuna-7b" model = InstructBlipVideoForConditionalGeneration.from_pretrained(model_name) self.assertIsNotNone(model) @@ -576,16 +576,6 @@ def test_inference_vicuna_7b(self): prompt = "Explain what is happening in this short video." inputs = processor(images=clip, text=prompt, return_tensors="pt").to(torch_device, torch.float16) - # verify logits - with torch.no_grad(): - logits = model(**inputs).logits - - expected_slice = torch.tensor( - [[-3.3203, -11.7266, 9.7266], [-5.3242, -12.8125, 10.6328], [-3.7109, -13.2422, 10.3516]], - device=torch_device, - ) - self.assertTrue(torch.allclose(logits[0, :3, :3].float(), expected_slice, atol=1e-3)) - # verify generation outputs = model.generate(**inputs, max_new_tokens=30) generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip() From 2a9b6d98c30fcb5bf7b955d9bf8a8870057c23a8 Mon Sep 17 00:00:00 2001 From: raushan Date: Tue, 25 Jun 2024 12:05:39 +0200 Subject: [PATCH 16/16] [run-slow] instructblipvideo --- src/transformers/models/blip_2/modeling_blip_2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/blip_2/modeling_blip_2.py b/src/transformers/models/blip_2/modeling_blip_2.py index 8fa55d01ee8859..7aad5bea66ca11 100644 --- a/src/transformers/models/blip_2/modeling_blip_2.py +++ b/src/transformers/models/blip_2/modeling_blip_2.py @@ -317,7 +317,7 @@ def _init_weights(self, module): module.bias.data.zero_() if isinstance(module, Blip2VisionEmbeddings): - if hasattr(self.config, "vision_config"): + if hasattr(self.config, "vision_config") and not isinstance(self.config, Blip2VisionConfig): factor = self.config.vision_config.initializer_range nn.init.trunc_normal_(module.position_embedding, mean=0.0, std=factor) nn.init.trunc_normal_(module.class_embedding, mean=0.0, std=factor)
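
Note for anyone checking out this branch locally: the snippet below is a minimal end-to-end sketch of the API this series adds. It only restates what the slow integration tests and the updated docstrings above already exercise; the checkpoint id is the one used in those tests, and the randomly generated 4-frame clip is a stand-in for a decoded video (the model was trained on 4 uniformly sampled frames), so treat it as an illustration under those assumptions rather than part of the patch.

    import numpy as np
    import torch

    from transformers import InstructBlipVideoForConditionalGeneration, InstructBlipVideoProcessor

    # Checkpoint id taken from the slow tests in this series; the video model reuses InstructBLIP weights.
    checkpoint = "Salesforce/instructblip-vicuna-7b"
    processor = InstructBlipVideoProcessor.from_pretrained(checkpoint)
    model = InstructBlipVideoForConditionalGeneration.from_pretrained(
        checkpoint, torch_dtype=torch.float16, device_map="auto"
    )

    # A clip is passed as (num_frames, height, width, channels); random frames stand in for a real video here.
    clip = np.random.randint(0, 256, size=(4, 224, 224, 3), dtype=np.uint8)
    prompt = "Explain what is happening in this short video."

    inputs = processor(images=clip, text=prompt, return_tensors="pt").to(model.device, torch.float16)
    outputs = model.generate(**inputs, max_new_tokens=30)
    print(processor.batch_decode(outputs, skip_special_tokens=True)[0].strip())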