squash in single commit

huggingface · Jun 11, 2024 · acf0885 · acf0885
1 parent ce3647a
commit acf0885
Show file tree

Hide file tree

Showing 22 changed files with 4,111 additions and 18 deletions.
diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml
@@ -774,6 +774,8 @@
         title: Idefics2
       - local: model_doc/instructblip
         title: InstructBLIP
+      - local: model_doc/instructblipvideo
+        title: InstructBlipVideo
       - local: model_doc/kosmos-2
         title: KOSMOS-2
       - local: model_doc/layoutlm

diff --git a/docs/source/en/index.md b/docs/source/en/index.md
@@ -165,6 +165,7 @@ Flax), PyTorch, and/or TensorFlow.
 |                      [ImageGPT](model_doc/imagegpt)                      |       ✅        |         ❌         |      ❌      |
 |                      [Informer](model_doc/informer)                      |       ✅        |         ❌         |      ❌      |
 |                  [InstructBLIP](model_doc/instructblip)                  |       ✅        |         ❌         |      ❌      |
+|          [InstructBlipVideo](model_doc/instructblipvideo)           |       ✅        |         ❌         |      ❌      |
 |                         [Jamba](model_doc/jamba)                         |       ✅        |         ❌         |      ❌      |
 |                        [JetMoe](model_doc/jetmoe)                        |       ✅        |         ❌         |      ❌      |
 |                       [Jukebox](model_doc/jukebox)                       |       ✅        |         ❌         |      ❌      |

diff --git a/docs/source/en/model_doc/instructblip.md b/docs/source/en/model_doc/instructblip.md
@@ -50,6 +50,7 @@ InstructBLIP uses the same architecture as [BLIP-2](blip2) with a tiny but impor
 
 [[autodoc]] InstructBlipProcessor
 
+
 ## InstructBlipVisionModel
 
 [[autodoc]] InstructBlipVisionModel

diff --git a/docs/source/en/model_doc/instructblipvideo.md b/docs/source/en/model_doc/instructblipvideo.md
@@ -0,0 +1,59 @@
+<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# InstructBlipVideo
+
+## Overview
+
+The InstructBlipVideo model was proposed in [<INSERT PAPER NAME HERE>](<INSERT PAPER LINK HERE>) by <INSERT AUTHORS HERE>.
+<INSERT SHORT SUMMARY HERE>
+
+The abstract from the paper is the following:
+
+*<INSERT PAPER ABSTRACT HERE>*
+
+Tips:
+
+<INSERT TIPS ABOUT MODEL HERE>
+
+This model was contributed by [INSERT YOUR HF USERNAME HERE](https://huggingface.co/<INSERT YOUR HF USERNAME HERE>).
+The original code can be found [here](<INSERT LINK TO GITHUB REPO HERE>).
+
+
+## InstructBlipVideoConfig
+
+[[autodoc]] InstructBlipVideoConfig
+    - from_vision_qformer_text_configs
+
+## InstructBlipVideoVisionConfig
+
+[[autodoc]] InstructBlipVideoVisionConfig
+
+## InstructBlipVideoQFormerConfig
+
+[[autodoc]] InstructBlipVideoQFormerConfig
+
+## InstructBlipVideoVisionModel
+
+[[autodoc]] InstructBlipVideoVisionModel
+    - forward
+
+## InstructBlipVideoQFormerModel
+
+[[autodoc]] InstructBlipVideoQFormerModel
+    - forward
+
+## InstructBlipVideoForConditionalGeneration
+
+[[autodoc]] InstructBlipVideoForConditionalGeneration
+    - forward
+    - generate
diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
@@ -473,6 +473,12 @@
         "InstructBlipQFormerConfig",
         "InstructBlipVisionConfig",
     ],
+    "models.instructblipvideo": [
+        "InstructBlipVideoConfig",
+        "InstructBlipVideoProcessor",
+        "InstructBlipVideoQFormerConfig",
+        "InstructBlipVideoVisionConfig",
+    ],
     "models.jamba": ["JambaConfig"],
     "models.jetmoe": ["JetMoeConfig"],
     "models.kosmos2": [
@@ -1135,6 +1141,7 @@
     _import_structure["models.idefics"].extend(["IdeficsImageProcessor"])
     _import_structure["models.idefics2"].extend(["Idefics2ImageProcessor"])
     _import_structure["models.imagegpt"].extend(["ImageGPTFeatureExtractor", "ImageGPTImageProcessor"])
+    _import_structure["models.instructblipvideo"].extend(["InstructBlipVideoImageProcessor"])
     _import_structure["models.layoutlmv2"].extend(["LayoutLMv2FeatureExtractor", "LayoutLMv2ImageProcessor"])
     _import_structure["models.layoutlmv3"].extend(["LayoutLMv3FeatureExtractor", "LayoutLMv3ImageProcessor"])
     _import_structure["models.levit"].extend(["LevitFeatureExtractor", "LevitImageProcessor"])
@@ -2303,6 +2310,14 @@
             "InstructBlipVisionModel",
         ]
     )
+    _import_structure["models.instructblipvideo"].extend(
+        [
+            "InstructBlipVideoForConditionalGeneration",
+            "InstructBlipVideoPreTrainedModel",
+            "InstructBlipVideoQFormerModel",
+            "InstructBlipVideoVisionModel",
+        ]
+    )
     _import_structure["models.jamba"].extend(
         [
             "JambaForCausalLM",
@@ -5055,6 +5070,12 @@
         InstructBlipQFormerConfig,
         InstructBlipVisionConfig,
     )
+    from .models.instructblipvideo import (
+        InstructBlipVideoConfig,
+        InstructBlipVideoProcessor,
+        InstructBlipVideoQFormerConfig,
+        InstructBlipVideoVisionConfig,
+    )
     from .models.jamba import JambaConfig
     from .models.jetmoe import JetMoeConfig
     from .models.kosmos2 import (
@@ -5743,6 +5764,7 @@
         from .models.idefics import IdeficsImageProcessor
         from .models.idefics2 import Idefics2ImageProcessor
         from .models.imagegpt import ImageGPTFeatureExtractor, ImageGPTImageProcessor
+        from .models.instructblipvideo import InstructBlipVideoImageProcessor
         from .models.layoutlmv2 import (
             LayoutLMv2FeatureExtractor,
             LayoutLMv2ImageProcessor,
@@ -6732,6 +6754,12 @@
             InstructBlipQFormerModel,
             InstructBlipVisionModel,
         )
+        from .models.instructblipvideo import (
+            InstructBlipVideoForConditionalGeneration,
+            InstructBlipVideoPreTrainedModel,
+            InstructBlipVideoQFormerModel,
+            InstructBlipVideoVisionModel,
+        )
         from .models.jamba import (
             JambaForCausalLM,
             JambaForSequenceClassification,

diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py
@@ -111,6 +111,7 @@
     imagegpt,
     informer,
     instructblip,
+    instructblipvideo,
     jamba,
     jetmoe,
     kosmos2,

diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py
@@ -128,6 +128,7 @@
         ("imagegpt", "ImageGPTConfig"),
         ("informer", "InformerConfig"),
         ("instructblip", "InstructBlipConfig"),
+        ("instructblipvideo", "InstructBlipVideoConfig"),
         ("jamba", "JambaConfig"),
         ("jetmoe", "JetMoeConfig"),
         ("jukebox", "JukeboxConfig"),
@@ -402,6 +403,7 @@
         ("imagegpt", "ImageGPT"),
         ("informer", "Informer"),
         ("instructblip", "InstructBLIP"),
+        ("instructblipvideo", "InstructBlipVideoVideo"),
         ("jamba", "Jamba"),
         ("jetmoe", "JetMoe"),
         ("jukebox", "Jukebox"),

diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py
@@ -75,6 +75,7 @@
         ("idefics2", "Idefics2ImageProcessor"),
         ("imagegpt", "ImageGPTImageProcessor"),
         ("instructblip", "BlipImageProcessor"),
+        ("instructblipvideo", "InstructBlipVideoImageProcessor"),
         ("kosmos-2", "CLIPImageProcessor"),
         ("layoutlmv2", "LayoutLMv2ImageProcessor"),
         ("layoutlmv3", "LayoutLMv3ImageProcessor"),

diff --git a/src/transformers/models/auto/processing_auto.py b/src/transformers/models/auto/processing_auto.py
@@ -64,6 +64,7 @@
         ("idefics", "IdeficsProcessor"),
         ("idefics2", "Idefics2Processor"),
         ("instructblip", "InstructBlipProcessor"),
+        ("instructblipvideo", "InstructBlipVideoProcessor"),
         ("kosmos-2", "Kosmos2Processor"),
         ("layoutlmv2", "LayoutLMv2Processor"),
         ("layoutlmv3", "LayoutLMv3Processor"),

diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py
@@ -205,6 +205,7 @@
             ("idefics", (None, "LlamaTokenizerFast" if is_tokenizers_available() else None)),
             ("idefics2", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)),
             ("instructblip", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
+            ("instructblipvideo", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
             (
                 "jamba",
                 (

diff --git a/src/transformers/models/instructblipvideo/__init__.py b/src/transformers/models/instructblipvideo/__init__.py
@@ -0,0 +1,83 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available
+
+
+_import_structure = {
+    "configuration_instructblipvideo": [
+        "InstructBlipVideoConfig",
+        "InstructBlipVideoQFormerConfig",
+        "InstructBlipVideoVisionConfig",
+    ],
+    "processing_instructblipvideo": ["InstructBlipVideoProcessor"],
+}
+
+
+try:
+    if not is_vision_available():
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    pass
+else:
+    _import_structure["image_processing_instructblipvideo"] = ["InstructBlipVideoImageProcessor"]
+
+
+try:
+    if not is_torch_available():
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    pass
+else:
+    _import_structure["modeling_instructblipvideo"] = [
+        "InstructBlipVideoQFormerModel",
+        "InstructBlipVideoPreTrainedModel",
+        "InstructBlipVideoForConditionalGeneration",
+        "InstructBlipVideoVisionModel",
+    ]
+
+if TYPE_CHECKING:
+    from .configuration_instructblipvideo import (
+        InstructBlipVideoConfig,
+        InstructBlipVideoQFormerConfig,
+        InstructBlipVideoVisionConfig,
+    )
+    from .processing_instructblipvideo import InstructBlipVideoProcessor
+
+    try:
+        if not is_vision_available():
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        pass
+    else:
+        from .image_processing_instructblipvideo import InstructBlipVideoImageProcessor
+
+    try:
+        if not is_torch_available():
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        pass
+    else:
+        from .modeling_instructblipvideo import (
+            InstructBlipVideoForConditionalGeneration,
+            InstructBlipVideoPreTrainedModel,
+            InstructBlipVideoQFormerModel,
+            InstructBlipVideoVisionModel,
+        )
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
Original file line number	Diff line number	Diff line change
Expand Up		@@ -50,6 +50,7 @@ InstructBLIP uses the same architecture as [BLIP-2](blip2) with a tiny but impor

		[[autodoc]] InstructBlipProcessor


		## InstructBlipVisionModel

		[[autodoc]] InstructBlipVisionModel
Expand Down